List of usage examples for org.apache.hadoop.fs.FSDataOutputStream.getPos()
public long getPos()
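getPos() returns the current byte offset in the stream, which is where the next write will land. The recurring pattern in the examples below is to capture that offset before writing a record or segment and use it later as an index entry. A minimal sketch of the pattern (the local filesystem, path, and payload here are illustrative assumptions, not taken from the examples):

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetPosExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        Path path = new Path("/tmp/getpos-example"); // illustrative path
        try (FSDataOutputStream out = fs.create(path, true)) {
            long segmentStart = out.getPos(); // offset before the record
            out.write("some record bytes".getBytes(StandardCharsets.UTF_8)); // illustrative payload
            long segmentLength = out.getPos() - segmentStart; // bytes this record occupies
            System.out.println("record at offset " + segmentStart + ", length " + segmentLength);
        }
    }
}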
From source file: org.apache.tajo.storage.thirdparty.parquet.ParquetFileWriter.java
License: Apache License
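In this example, getPos() marks where the footer starts; after the metadata is written, the difference between the current position and that mark gives the footer length that is written ahead of the trailing magic bytes.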
private static void serializeFooter(ParquetMetadata footer, FSDataOutputStream out) throws IOException {
    long footerIndex = out.getPos();
    parquet.format.FileMetaData parquetMetadata = new ParquetMetadataConverter()
            .toParquetMetadata(CURRENT_VERSION, footer);
    writeFileMetaData(parquetMetadata, out);
    if (DEBUG)
        LOG.debug(out.getPos() + ": footer length = " + (out.getPos() - footerIndex));
    BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
    out.write(MAGIC);
}
From source file: org.apache.tajo.worker.TestFetcherWithTajoPullServer.java
License: Apache License
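In this test, stream.getPos() supplies the byte offset of each row as it is written, so the BSTIndexWriter can index every sorted key by its position in the data file.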
@Test
public void testGetRangeShuffle() throws IOException {
    Random rnd = new Random();
    QueryId queryId = QueryIdFactory.NULL_QUERY_ID;
    String sid = "1";
    String partId = "1";
    String taskId = "1";
    String attemptId = "0";
    Path queryBaseDir = PullServerUtil.getBaseOutputDir(queryId.toString(), sid);
    Path outDir = StorageUtil.concatPath(queryBaseDir, taskId + "_" + attemptId, "output");
    Path dataPath = StorageUtil.concatPath(outDir, "output");
    Path indexPath = StorageUtil.concatPath(outDir, "index");

    List<String> strings = new ArrayList<>(100);
    for (int i = 0; i < 100; i++) {
        strings.add("" + rnd.nextInt());
    }
    Collections.sort(strings);

    Path inputPath = new Path(INPUT_DIR, dataPath);
    FileSystem fs = FileSystem.getLocal(conf);
    if (fs.exists(outDir)) {
        fs.delete(outDir, true);
    }
    final FSDataOutputStream stream = fs.create(inputPath, true);
    BSTIndex index = new BSTIndex(conf);
    Schema schema = SchemaBuilder.builder().addAll(new Column[] { new Column("rnd", Type.TEXT) }).build();
    SortSpec[] sortSpecs = new SortSpec[] { new SortSpec(schema.getColumn(0)) };
    BSTIndexWriter writer = index.getIndexWriter(new Path(INPUT_DIR, indexPath), BSTIndex.TWO_LEVEL_INDEX,
            schema, new BaseTupleComparator(schema, sortSpecs), true);
    writer.init();

    for (String t : strings) {
        writer.write(new VTuple(new Datum[] { DatumFactory.createText(t) }), stream.getPos());
        stream.write(t.getBytes());
    }
    stream.flush();
    writer.flush();
    stream.close();
    writer.close();

    RangeParam rangeParam = new RangeParam(
            new TupleRange(sortSpecs, new VTuple(new Datum[] { DatumFactory.createText(strings.get(0)) }),
                    new VTuple(new Datum[] { DatumFactory.createText(strings.get(strings.size() - 1)) })),
            true, RowStoreUtil.createEncoder(schema));
    PullServerRequestURIBuilder builder = new PullServerRequestURIBuilder("127.0.0.1", pullserverPort,
            maxUrlLength);
    builder.setRequestType(PullServerConstants.CHUNK_REQUEST_PARAM_STRING).setQueryId(queryId.toString())
            .setEbId(sid).setPartId(partId).setShuffleType(PullServerConstants.RANGE_SHUFFLE_PARAM_STRING)
            .setTaskIds(Lists.newArrayList(Integer.parseInt(taskId)))
            .setAttemptIds(Lists.newArrayList(Integer.parseInt(attemptId)))
            .setStartKeyBase64(new String(Base64.encodeBase64(rangeParam.getStart())))
            .setEndKeyBase64(new String(Base64.encodeBase64(rangeParam.getEnd()))).setLastInclude(true);
    URI uri = builder.build(true).get(0);

    File data = new File(OUTPUT_DIR + "data");
    final AbstractFetcher fetcher = getFetcher(uri, data);
    FileChunk chunk = fetcher.get().get(0);
    assertNotNull(chunk);
    assertNotNull(chunk.getFile());

    FileStatus inStatus = fs.getFileStatus(inputPath);
    FileStatus outStatus = fs.getFileStatus(new Path(chunk.getFile().getAbsolutePath()));
    assertEquals(inStatus.getLen(), outStatus.getLen());
    assertEquals(FetcherState.FETCH_DATA_FINISHED, fetcher.getState());
}
From source file: org.apache.tez.engine.common.sort.impl.dflt.DefaultSorter.java
License: Apache License
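Here, out.getPos() captures the start offset of each partition's segment before an IFile.Writer is opened; the offset is stored in a TezIndexRecord in the spill index.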
protected void spill(int mstart, int mend) throws IOException, InterruptedException {
    //approximate the length of the output file to be the length of the
    //buffer + header lengths for the partitions
    final long size = (bufend >= bufstart ? bufend - bufstart : (bufvoid - bufend) + bufstart)
            + partitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    try {
        // create spill file
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
        out = rfs.create(filename);

        int spindex = mstart;
        final InMemValBytes value = createInMemValBytes();
        for (int i = 0; i < partitions; ++i) {
            IFile.Writer writer = null;
            try {
                long segmentStart = out.getPos();
                writer = new Writer(job, out, keyClass, valClass, codec, spilledRecordsCounter);
                if (combineProcessor == null) {
                    // spill directly
                    DataInputBuffer key = new DataInputBuffer();
                    while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) {
                        final int kvoff = offsetFor(spindex);
                        key.reset(kvbuffer, kvmeta.get(kvoff + KEYSTART),
                                (kvmeta.get(kvoff + VALSTART) - kvmeta.get(kvoff + KEYSTART)));
                        getVBytesForOffset(kvoff, value);
                        writer.append(key, value);
                        ++spindex;
                    }
                } else {
                    int spstart = spindex;
                    while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) {
                        ++spindex;
                    }
                    // Note: we would like to avoid the combiner if we've fewer
                    // than some threshold of records for a partition
                    if (spstart != spindex) {
                        TezRawKeyValueIterator kvIter = new MRResultIterator(spstart, spindex);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Running combine processor");
                        }
                        runCombineProcessor(kvIter, writer);
                    }
                }

                // close the writer
                writer.close();

                // record offsets
                final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                spillRec.putIndex(rec, i);
                writer = null;
            } finally {
                if (null != writer)
                    writer.close();
            }
        }

        if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
            // create spill index file
            Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                    partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            spillRec.writeToFile(indexFilename, job);
        } else {
            indexCacheList.add(spillRec);
            totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
        }
        LOG.info("Finished spill " + numSpills);
        ++numSpills;
    } finally {
        if (out != null)
            out.close();
    }
}
From source file: org.apache.tez.engine.common.sort.impl.dflt.DefaultSorter.java
License: Apache License
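Here, getPos() is used twice: once to mark each segment's start offset, and once around the single record append to measure the bytes written for the map output byte counter.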
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final Object key, final Object value, int partition) throws IOException {
    long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    try {
        // create spill file
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
        out = rfs.create(filename);

        // we don't run the combiner for a single record
        for (int i = 0; i < partitions; ++i) {
            IFile.Writer writer = null;
            try {
                long segmentStart = out.getPos();
                // Create a new codec, don't care!
                writer = new IFile.Writer(job, out, keyClass, valClass, codec, spilledRecordsCounter);
                if (i == partition) {
                    final long recordStart = out.getPos();
                    writer.append(key, value);
                    // Note that our map byte count will not be accurate with
                    // compression
                    mapOutputByteCounter.increment(out.getPos() - recordStart);
                }
                writer.close();

                // record offsets
                TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                spillRec.putIndex(rec, i);
                writer = null;
            } catch (IOException e) {
                if (null != writer)
                    writer.close();
                throw e;
            }
        }
        if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
            // create spill index file
            Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                    partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            spillRec.writeToFile(indexFilename, job);
        } else {
            indexCacheList.add(spillRec);
            totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
        }
        ++numSpills;
    } finally {
        if (out != null)
            out.close();
    }
}
From source file: org.apache.tez.engine.common.sort.impl.dflt.DefaultSorter.java
License: Apache License
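Here, finalOut.getPos() records where each partition's segment begins in the final merged output file, both for empty placeholder segments and for real merged data.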
private void mergeParts() throws IOException, InterruptedException {
    // get the approximate size of the final output/index files
    long finalOutFileSize = 0;
    long finalIndexFileSize = 0;
    final Path[] filename = new Path[numSpills];
    final TezTaskAttemptID mapId = task.getTaskAttemptId();

    for (int i = 0; i < numSpills; i++) {
        filename[i] = mapOutputFile.getSpillFile(i);
        finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
    }
    if (numSpills == 1) { //the spill is the final output
        sameVolRename(filename[0], mapOutputFile.getOutputFileForWriteInVolume(filename[0]));
        if (indexCacheList.size() == 0) {
            sameVolRename(mapOutputFile.getSpillIndexFile(0),
                    mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]));
        } else {
            indexCacheList.get(0).writeToFile(mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]),
                    job);
        }
        sortPhase.complete();
        return;
    }

    // read in paged indices
    for (int i = indexCacheList.size(); i < numSpills; ++i) {
        Path indexFileName = mapOutputFile.getSpillIndexFile(i);
        indexCacheList.add(new TezSpillRecord(indexFileName, job));
    }

    //make correction in the length to include the sequence file header
    //lengths for each partition
    finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
    finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    Path finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
    Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);

    //The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    if (numSpills == 0) {
        //create dummy files
        TezSpillRecord sr = new TezSpillRecord(partitions);
        try {
            for (int i = 0; i < partitions; i++) {
                long segmentStart = finalOut.getPos();
                Writer writer = new Writer(job, finalOut, keyClass, valClass, codec, null);
                writer.close();

                TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                sr.putIndex(rec, i);
            }
            sr.writeToFile(finalIndexFile, job);
        } finally {
            finalOut.close();
        }
        sortPhase.complete();
        return;
    } else {
        sortPhase.addPhases(partitions); // Divide sort phase into sub-phases
        TezMerger.considerFinalMergeForProgress();

        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        for (int parts = 0; parts < partitions; parts++) {
            //create the segments to be merged
            List<Segment> segmentList = new ArrayList<Segment>(numSpills);
            for (int i = 0; i < numSpills; i++) {
                TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

                Segment s = new Segment(job, rfs, filename[i], indexRecord.getStartOffset(),
                        indexRecord.getPartLength(), codec, true);
                segmentList.add(i, s);

                if (LOG.isDebugEnabled()) {
                    LOG.debug("MapId=" + mapId + " Reducer=" + parts + "Spill =" + i + "("
                            + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", "
                            + indexRecord.getPartLength() + ")");
                }
            }

            int mergeFactor = job.getInt(TezJobConfig.TEZ_ENGINE_IO_SORT_FACTOR,
                    TezJobConfig.DEFAULT_TEZ_ENGINE_IO_SORT_FACTOR);
            // sort the segments only if there are intermediate merges
            boolean sortSegments = segmentList.size() > mergeFactor;
            //merge
            TezRawKeyValueIterator kvIter = TezMerger.merge(job, rfs, keyClass, valClass, codec, segmentList,
                    mergeFactor, new Path(mapId.toString()),
                    (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(job),
                    runningTaskContext.getTaskReporter(), sortSegments, null, spilledRecordsCounter,
                    sortPhase.phase());

            //write merged output to disk
            long segmentStart = finalOut.getPos();
            Writer writer = new Writer(job, finalOut, keyClass, valClass, codec, spilledRecordsCounter);
            if (combineProcessor == null || numSpills < minSpillsForCombine) {
                TezMerger.writeFile(kvIter, writer, runningTaskContext.getTaskReporter(), job);
            } else {
                runCombineProcessor(kvIter, writer);
            }
            writer.close();
            sortPhase.startNextPhase();

            // record offsets
            final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                    writer.getCompressedLength());
            spillRec.putIndex(rec, parts);
        }
        spillRec.writeToFile(finalIndexFile, job);
        finalOut.close();
        for (int i = 0; i < numSpills; i++) {
            rfs.delete(filename[i], true);
        }
    }
}
From source file: org.apache.tez.engine.common.sort.impl.PipelinedSorter.java
License: Apache License
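Here, out.getPos() marks the start of each partition's merged output so that a TezIndexRecord with the segment offset and lengths can be written to the spill index.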
public void spill() throws IOException {
    // create spill file
    final long size = largeBuffer.capacity() + (partitions * APPROX_HEADER_LENGTH);
    final TezSpillRecord spillRec = new TezSpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
    FSDataOutputStream out = rfs.create(filename, true, 4096);
    try {
        merger.ready(); // wait for all the future results from sort threads
        LOG.info("Spilling to " + filename.toString());
        for (int i = 0; i < partitions; ++i) {
            TezRawKeyValueIterator kvIter = merger.filter(i);
            //write merged output to disk
            long segmentStart = out.getPos();
            Writer writer = new Writer(job, out, keyClass, valClass, codec, spilledRecordsCounter);
            writer.setRLE(merger.needsRLE());
            if (combineProcessor == null) {
                while (kvIter.next()) {
                    writer.append(kvIter.getKey(), kvIter.getValue());
                }
            } else {
                runCombineProcessor(kvIter, writer);
            }
            //close
            writer.close();

            // record offsets
            final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                    writer.getCompressedLength());
            spillRec.putIndex(rec, i);
        }

        Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        // TODO: cache
        spillRec.writeToFile(indexFilename, job);
        ++numSpills;
    } catch (InterruptedException ie) {
        // TODO: the combiner has been interrupted
    } finally {
        out.close();
    }
}
From source file: org.apache.tez.engine.common.sort.impl.PipelinedSorter.java
License: Apache License
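During the final merge, finalOut.getPos() is read before each partition's writer is opened and stored in the TezIndexRecord that goes into the final index file.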
@Override
public void flush() throws IOException, InterruptedException {
    final TezTaskAttemptID mapId = task.getTaskAttemptId();
    Path finalOutputFile = mapOutputFile.getOutputFileForWrite(0); //TODO
    Path finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(0); //TODO

    LOG.info("Starting flush of map output");
    span.end();
    merger.add(span.sort(sorter, comparator));
    spill();
    sortmaster.shutdown();

    largeBuffer = null;

    if (numSpills == 1) {
        // someday be able to pass this directly to shuffle
        // without writing to disk
        final Path filename = mapOutputFile.getSpillFile(0);
        Path indexFilename = mapOutputFile.getSpillIndexFile(0);
        sameVolRename(filename, finalOutputFile);
        sameVolRename(indexFilename, finalIndexFile);
        return;
    }

    //The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    sortPhase.addPhases(partitions); // Divide sort phase into sub-phases
    TezMerger.considerFinalMergeForProgress();

    final TezSpillRecord spillRec = new TezSpillRecord(partitions);
    final ArrayList<TezSpillRecord> indexCacheList = new ArrayList<TezSpillRecord>();

    for (int i = 0; i < numSpills; i++) {
        // TODO: build this cache before
        Path indexFilename = mapOutputFile.getSpillIndexFile(i);
        TezSpillRecord spillIndex = new TezSpillRecord(indexFilename, job);
        indexCacheList.add(spillIndex);
    }

    for (int parts = 0; parts < partitions; parts++) {
        //create the segments to be merged
        List<Segment> segmentList = new ArrayList<Segment>(numSpills);
        for (int i = 0; i < numSpills; i++) {
            Path spillFilename = mapOutputFile.getSpillFile(i);
            TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

            Segment s = new Segment(job, rfs, spillFilename, indexRecord.getStartOffset(),
                    indexRecord.getPartLength(), codec, true);
            segmentList.add(i, s);
        }

        int mergeFactor = job.getInt(TezJobConfig.TEZ_ENGINE_IO_SORT_FACTOR,
                TezJobConfig.DEFAULT_TEZ_ENGINE_IO_SORT_FACTOR);
        // sort the segments only if there are intermediate merges
        boolean sortSegments = segmentList.size() > mergeFactor;
        //merge
        @SuppressWarnings("unchecked")
        TezRawKeyValueIterator kvIter = TezMerger.merge(job, rfs, keyClass, valClass, codec, segmentList,
                mergeFactor, new Path(mapId.toString()),
                (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(job),
                runningTaskContext.getTaskReporter(), sortSegments, null, spilledRecordsCounter,
                sortPhase.phase());

        //write merged output to disk
        long segmentStart = finalOut.getPos();
        Writer writer = new Writer(job, finalOut, keyClass, valClass, codec, spilledRecordsCounter);
        writer.setRLE(merger.needsRLE());
        if (combineProcessor == null || numSpills < minSpillsForCombine) {
            TezMerger.writeFile(kvIter, writer, runningTaskContext.getTaskReporter(), job);
        } else {
            runCombineProcessor(kvIter, writer);
        }
        //close
        writer.close();
        sortPhase.startNextPhase();

        // record offsets
        final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                writer.getCompressedLength());
        spillRec.putIndex(rec, parts);
    }

    spillRec.writeToFile(finalIndexFile, job);
    finalOut.close();
    for (int i = 0; i < numSpills; i++) {
        Path indexFilename = mapOutputFile.getSpillIndexFile(i);
        Path spillFilename = mapOutputFile.getSpillFile(i);
        rfs.delete(indexFilename, true);
        rfs.delete(spillFilename, true);
    }
}
From source file: org.apache.tez.mapreduce.processor.MapUtils.java
License: Apache License
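Here, out.getPos() captures the offset at which the serialized split begins; the offset is recorded in the JobSplit.SplitMetaInfo written to the split meta file.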
private static void writeSplitFiles(FileSystem fs, JobConf conf, InputSplit split) throws IOException {
    Path jobSplitFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR_DEFAULT), MRJobConfig.JOB_SPLIT);
    LOG.info("Writing split to: " + jobSplitFile);
    FSDataOutputStream out = FileSystem.create(fs, jobSplitFile, new FsPermission(JOB_FILE_PERMISSION));

    long offset = out.getPos();
    Text.writeString(out, split.getClass().getName());
    split.write(out);
    out.close();

    String[] locations = split.getLocations();

    SplitMetaInfo info = null;
    info = new JobSplit.SplitMetaInfo(locations, offset, split.getLength());

    Path jobSplitMetaInfoFile = new Path(conf.get(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR),
            MRJobConfig.JOB_SPLIT_METAINFO);

    FSDataOutputStream outMeta = FileSystem.create(fs, jobSplitMetaInfoFile,
            new FsPermission(JOB_FILE_PERMISSION));
    outMeta.write(SplitMetaInfoReaderTez.META_SPLIT_FILE_HEADER);
    WritableUtils.writeVInt(outMeta, SplitMetaInfoReaderTez.META_SPLIT_VERSION);
    WritableUtils.writeVInt(outMeta, 1); // Only 1 split meta info being written
    info.write(outMeta);
    outMeta.close();
}
From source file: org.apache.tez.runtime.library.common.shuffle.orderedgrouped.TestMergeManager.java
License: Apache License
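In this test helper, outStream.getPos() is taken before each partition's IFile.Writer is created and becomes the start offset of the corresponding TezIndexRecord.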
private SrcFileInfo createFile(Configuration conf, FileSystem fs, Path path, int numPartitions,
        int numKeysPerPartition, int startKey) throws IOException {
    FSDataOutputStream outStream = fs.create(path);
    int currentKey = startKey;
    SrcFileInfo srcFileInfo = new SrcFileInfo();
    srcFileInfo.indexedRecords = new TezIndexRecord[numPartitions];
    srcFileInfo.path = path;
    for (int i = 0; i < numPartitions; i++) {
        long pos = outStream.getPos();
        IFile.Writer writer = new IFile.Writer(conf, outStream, IntWritable.class, IntWritable.class, null,
                null, null);
        for (int j = 0; j < numKeysPerPartition; j++) {
            writer.append(new IntWritable(currentKey), new IntWritable(currentKey));
            currentKey++;
        }
        writer.close();
        srcFileInfo.indexedRecords[i] = new TezIndexRecord(pos, writer.getRawLength(),
                writer.getCompressedLength());
    }
    outStream.close();
    return srcFileInfo;
}
From source file: org.apache.tez.runtime.library.common.sort.impl.dflt.DefaultSorter.java
License: Apache License
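This newer variant of DefaultSorter follows the same pattern: getPos() marks each partition's segmentStart, which is stored in the spill index together with the raw and compressed lengths.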
protected void spill(int mstart, int mend) throws IOException, InterruptedException {
    //approximate the length of the output file to be the length of the
    //buffer + header lengths for the partitions
    final long size = (bufend >= bufstart ? bufend - bufstart : (bufvoid - bufend) + bufstart)
            + partitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    try {
        // create spill file
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
        spillFilePaths.put(numSpills, filename);
        out = rfs.create(filename);

        int spindex = mstart;
        final InMemValBytes value = createInMemValBytes();
        boolean rle = isRLENeeded();
        for (int i = 0; i < partitions; ++i) {
            IFile.Writer writer = null;
            try {
                long segmentStart = out.getPos();
                writer = new Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null, rle);
                if (combiner == null) {
                    // spill directly
                    DataInputBuffer key = new DataInputBuffer();
                    while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) {
                        final int kvoff = offsetFor(spindex);
                        int keystart = kvmeta.get(kvoff + KEYSTART);
                        int valstart = kvmeta.get(kvoff + VALSTART);
                        key.reset(kvbuffer, keystart, valstart - keystart);
                        getVBytesForOffset(kvoff, value);
                        writer.append(key, value);
                        ++spindex;
                    }
                } else {
                    int spstart = spindex;
                    while (spindex < mend && kvmeta.get(offsetFor(spindex) + PARTITION) == i) {
                        ++spindex;
                    }
                    // Note: we would like to avoid the combiner if we've fewer
                    // than some threshold of records for a partition
                    if (spstart != spindex) {
                        TezRawKeyValueIterator kvIter = new MRResultIterator(spstart, spindex);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Running combine processor");
                        }
                        runCombineProcessor(kvIter, writer);
                    }
                }

                // close the writer
                writer.close();
                if (numSpills > 0) {
                    additionalSpillBytesWritten.increment(writer.getCompressedLength());
                    numAdditionalSpills.increment(1);
                    // Reset the value will be set during the final merge.
                    outputBytesWithOverheadCounter.setValue(0);
                } else {
                    // Set this up for the first write only. Subsequent ones will be handled in the final merge.
                    outputBytesWithOverheadCounter.increment(writer.getRawLength());
                }

                // record offsets
                final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                spillRec.putIndex(rec, i);
                writer = null;
            } finally {
                if (null != writer)
                    writer.close();
            }
        }

        if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
            // create spill index file
            Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                    partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            spillFileIndexPaths.put(numSpills, indexFilename);
            spillRec.writeToFile(indexFilename, conf);
        } else {
            indexCacheList.add(spillRec);
            totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
        }
        LOG.info("Finished spill " + numSpills);
        ++numSpills;
    } finally {
        if (out != null)
            out.close();
    }
}