List of usage examples for org.apache.hadoop.fs.FSDataOutputStream.getPos()
public long getPos()
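getPos() returns the current byte offset in the stream, i.e. how many bytes have been written so far (plus any starting offset when appending to an existing file). A minimal sketch of that contract, assuming a local filesystem and a hypothetical scratch path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetPosDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);    // local FS for the demo
        Path path = new Path("/tmp/getpos-demo.bin"); // hypothetical scratch path

        FSDataOutputStream out = fs.create(path, true);
        try {
            long before = out.getPos();  // 0 for a freshly created file
            out.writeBytes("hello");     // write 5 bytes
            long after = out.getPos();
            System.out.println("bytes written: " + (after - before)); // 5
        } finally {
            out.close();
        }
    }
}

Every example below is a variation on this before/after measurement: capture the position, write something, and use the delta as an offset or length.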
From source file:org.apache.tez.runtime.library.common.sort.impl.dflt.DefaultSorter.java
License:Apache License
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final Object key, final Object value, int partition) throws IOException {
    long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    try {
        // create spill file
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
        spillFilePaths.put(numSpills, filename);
        out = rfs.create(filename);

        // we don't run the combiner for a single record
        for (int i = 0; i < partitions; ++i) {
            IFile.Writer writer = null;
            try {
                long segmentStart = out.getPos();
                // Create a new codec, don't care!
                writer = new IFile.Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null);
                if (i == partition) {
                    final long recordStart = out.getPos();
                    writer.append(key, value);
                    // Note that our map byte count will not be accurate with
                    // compression
                    mapOutputByteCounter.increment(out.getPos() - recordStart);
                }
                writer.close();

                if (numSpills > 0) {
                    additionalSpillBytesWritten.increment(writer.getCompressedLength());
                    numAdditionalSpills.increment(1);
                    outputBytesWithOverheadCounter.setValue(0);
                } else {
                    // Set this up for the first write only. Subsequent ones will be handled in the final merge.
                    outputBytesWithOverheadCounter.increment(writer.getRawLength());
                }

                // record offsets
                TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                spillRec.putIndex(rec, i);
                writer = null;
            } catch (IOException e) {
                if (null != writer)
                    writer.close();
                throw e;
            }
        }
        if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
            // create spill index file
            Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                    partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            spillFileIndexPaths.put(numSpills, indexFilename);
            spillRec.writeToFile(indexFilename, conf);
        } else {
            indexCacheList.add(spillRec);
            totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
        }
        ++numSpills;
    } finally {
        if (out != null)
            out.close();
    }
}
From source file:org.apache.tez.runtime.library.common.sort.impl.dflt.DefaultSorter.java
License:Apache License
private void mergeParts() throws IOException {
    // get the approximate size of the final output/index files
    long finalOutFileSize = 0;
    long finalIndexFileSize = 0;
    final Path[] filename = new Path[numSpills];
    final String taskIdentifier = outputContext.getUniqueIdentifier();

    for (int i = 0; i < numSpills; i++) {
        filename[i] = spillFilePaths.get(i);
        finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
    }

    if (numSpills == 1) { // the spill is the final output
        finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename[0]);
        finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]);
        sameVolRename(filename[0], finalOutputFile);
        if (indexCacheList.size() == 0) {
            sameVolRename(spillFileIndexPaths.get(0), finalIndexFile);
        } else {
            indexCacheList.get(0).writeToFile(finalIndexFile, conf);
        }
        return;
    }

    // read in paged indices
    for (int i = indexCacheList.size(); i < numSpills; ++i) {
        Path indexFileName = spillFileIndexPaths.get(i);
        indexCacheList.add(new TezSpillRecord(indexFileName, conf));
    }

    // make correction in the length to include the sequence file header
    // lengths for each partition
    finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
    finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
    finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);

    // The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    if (numSpills == 0) {
        // TODO Change event generation to say there is no data rather than generating a dummy file
        // create dummy files
        TezSpillRecord sr = new TezSpillRecord(partitions);
        try {
            for (int i = 0; i < partitions; i++) {
                long segmentStart = finalOut.getPos();
                Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null);
                writer.close();
                TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                // Covers the case of multiple spills.
                outputBytesWithOverheadCounter.increment(writer.getRawLength());
                sr.putIndex(rec, i);
            }
            sr.writeToFile(finalIndexFile, conf);
        } finally {
            finalOut.close();
        }
        return;
    } else {
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        for (int parts = 0; parts < partitions; parts++) {
            // create the segments to be merged
            List<Segment> segmentList = new ArrayList<Segment>(numSpills);
            for (int i = 0; i < numSpills; i++) {
                TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
                Segment s = new Segment(rfs, filename[i], indexRecord.getStartOffset(),
                        indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength,
                        ifileBufferSize, true);
                segmentList.add(i, s);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("TaskIdentifier=" + taskIdentifier + " Partition=" + parts + " Spill=" + i + "("
                            + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", "
                            + indexRecord.getPartLength() + ")");
                }
            }

            int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,
                    TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
            // sort the segments only if there are intermediate merges
            boolean sortSegments = segmentList.size() > mergeFactor;
            // merge
            TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList,
                    mergeFactor, new Path(taskIdentifier),
                    (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), nullProgressable,
                    sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null);
            // Not using any Progress in TezMerger. Should just work.

            // write merged output to disk
            long segmentStart = finalOut.getPos();
            Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null);
            if (combiner == null || numSpills < minSpillsForCombine) {
                TezMerger.writeFile(kvIter, writer, nullProgressable,
                        TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
            } else {
                runCombineProcessor(kvIter, writer);
            }
            writer.close();

            // record offsets
            final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                    writer.getCompressedLength());
            spillRec.putIndex(rec, parts);
        }
        spillRec.writeToFile(finalIndexFile, conf);
        finalOut.close();
        for (int i = 0; i < numSpills; i++) {
            rfs.delete(filename[i], true);
        }
    }
}
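Both DefaultSorter methods above use the same bookkeeping idiom: capture out.getPos() before writing a segment, write it, and turn the before/after positions into an index record (start offset plus raw and compressed lengths). A stripped-down sketch of just that idiom, where SegmentIndex is a hypothetical stand-in for TezIndexRecord and plain byte payloads stand in for IFile key/value data:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FSDataOutputStream;

class SegmentIndex {
    final long startOffset;
    final long length;
    SegmentIndex(long startOffset, long length) {
        this.startOffset = startOffset;
        this.length = length;
    }
}

class SegmentWriter {
    /** Writes each payload as one segment, recording where it starts and how long it is. */
    static List<SegmentIndex> writeSegments(FSDataOutputStream out, List<byte[]> payloads)
            throws IOException {
        List<SegmentIndex> index = new ArrayList<>();
        for (byte[] payload : payloads) {
            long segmentStart = out.getPos();  // offset of this segment in the file
            out.write(payload);                // stand-in for IFile.Writer append + close
            index.add(new SegmentIndex(segmentStart, out.getPos() - segmentStart));
        }
        return index;
    }
}

The index entries are what make the single concatenated spill file randomly addressable later: a reader seeks to startOffset and reads exactly length bytes.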
From source file:org.apache.tez.runtime.library.common.sort.impl.PipelinedSorter.java
License:Apache License
public void spill() throws IOException {
    // create spill file
    final long size = capacity + (partitions * APPROX_HEADER_LENGTH);
    final TezSpillRecord spillRec = new TezSpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
    spillFilePaths.put(numSpills, filename);
    FSDataOutputStream out = rfs.create(filename, true, 4096);
    try {
        merger.ready(); // wait for all the future results from sort threads
        LOG.info("Spilling to " + filename.toString());
        for (int i = 0; i < partitions; ++i) {
            TezRawKeyValueIterator kvIter = merger.filter(i);
            // write merged output to disk
            long segmentStart = out.getPos();
            Writer writer = new Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null,
                    merger.needsRLE());
            if (combiner == null) {
                while (kvIter.next()) {
                    writer.append(kvIter.getKey(), kvIter.getValue());
                }
            } else {
                runCombineProcessor(kvIter, writer);
            }
            // close
            writer.close();

            // record offsets
            final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                    writer.getCompressedLength());
            spillRec.putIndex(rec, i);
        }

        Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        spillFileIndexPaths.put(numSpills, indexFilename);
        // TODO: cache
        spillRec.writeToFile(indexFilename, conf);
        ++numSpills;
    } catch (InterruptedException ie) {
        // TODO: the combiner has been interrupted
    } finally {
        out.close();
    }
}
From source file:org.apache.tez.runtime.library.common.sort.impl.PipelinedSorter.java
License:Apache License
@Override
public void flush() throws IOException {
    final String uniqueIdentifier = outputContext.getUniqueIdentifier();
    finalOutputFile = mapOutputFile.getOutputFileForWrite(0); // TODO
    finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(0); // TODO

    LOG.info("Starting flush of map output");
    span.end();
    merger.add(span.sort(sorter));
    spill();
    sortmaster.shutdown();

    // safe to clean up
    bufferList.clear();

    numAdditionalSpills.increment(numSpills - 1);

    if (numSpills == 1) {
        // someday be able to pass this directly to shuffle
        // without writing to disk
        final Path filename = spillFilePaths.get(0);
        final Path indexFilename = spillFileIndexPaths.get(0);
        finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename);
        finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(indexFilename);
        sameVolRename(filename, finalOutputFile);
        sameVolRename(indexFilename, finalIndexFile);
        return;
    }

    // The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    final TezSpillRecord spillRec = new TezSpillRecord(partitions);

    for (int i = 0; i < numSpills; i++) {
        // TODO: build this cache before
        Path indexFilename = spillFileIndexPaths.get(i);
        TezSpillRecord spillIndex = new TezSpillRecord(indexFilename, conf);
        indexCacheList.add(spillIndex);
    }

    for (int parts = 0; parts < partitions; parts++) {
        // create the segments to be merged
        List<Segment> segmentList = new ArrayList<Segment>(numSpills);
        for (int i = 0; i < numSpills; i++) {
            Path spillFilename = spillFilePaths.get(i);
            TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
            Segment s = new Segment(rfs, spillFilename, indexRecord.getStartOffset(),
                    indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength,
                    ifileBufferSize, true);
            segmentList.add(i, s);
        }

        int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,
                TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
        // sort the segments only if there are intermediate merges
        boolean sortSegments = segmentList.size() > mergeFactor;
        // merge
        TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList,
                mergeFactor, new Path(uniqueIdentifier),
                (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), nullProgressable,
                sortSegments, true, null, spilledRecordsCounter, null, null);
        // Not using any Progress in TezMerger. Should just work.

        // write merged output to disk
        long segmentStart = finalOut.getPos();
        Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null,
                merger.needsRLE());
        if (combiner == null || numSpills < minSpillsForCombine) {
            TezMerger.writeFile(kvIter, writer, nullProgressable,
                    TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
        } else {
            runCombineProcessor(kvIter, writer);
        }
        // close
        writer.close();

        // record offsets
        final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                writer.getCompressedLength());
        spillRec.putIndex(rec, parts);
    }
    spillRec.writeToFile(finalIndexFile, conf);
    finalOut.close();

    for (int i = 0; i < numSpills; i++) {
        Path indexFilename = spillFileIndexPaths.get(i);
        Path spillFilename = spillFilePaths.get(i);
        rfs.delete(indexFilename, true);
        rfs.delete(spillFilename, true);
    }
    spillFileIndexPaths.clear();
    spillFilePaths.clear();
}
From source file:org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.java
License:Apache License
private void mergeAll() throws IOException {
    long expectedSize = spilledSize;
    if (currentBuffer.nextPosition != 0) {
        expectedSize += currentBuffer.nextPosition - (currentBuffer.numRecords * META_SIZE)
                - currentBuffer.skipSize + numPartitions * APPROX_HEADER_LENGTH;
        // Update final statistics.
        updateGlobalStats(currentBuffer);
    }

    long indexFileSizeEstimate = numPartitions * Constants.MAP_OUTPUT_INDEX_RECORD_LENGTH;

    finalOutPath = outputFileHandler.getOutputFileForWrite(expectedSize);
    finalIndexPath = outputFileHandler.getOutputIndexFileForWrite(indexFileSizeEstimate);

    TezSpillRecord finalSpillRecord = new TezSpillRecord(numPartitions);

    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();

    DataInputBuffer keyBufferIFile = new DataInputBuffer();
    DataInputBuffer valBufferIFile = new DataInputBuffer();

    FSDataOutputStream out = null;
    try {
        out = rfs.create(finalOutPath);
        Writer writer = null;

        for (int i = 0; i < numPartitions; i++) {
            long segmentStart = out.getPos();
            if (numRecordsPerPartition[i] == 0) {
                LOG.info("Skipping partition: " + i + " in final merge since it has no records");
                continue;
            }
            writer = new Writer(conf, out, keyClass, valClass, codec, null, null);
            try {
                if (currentBuffer.nextPosition != 0
                        && currentBuffer.partitionPositions[i] != WrappedBuffer.PARTITION_ABSENT_POSITION) {
                    // Write current buffer.
                    writePartition(currentBuffer.partitionPositions[i], currentBuffer, writer, keyBuffer,
                            valBuffer);
                }
                synchronized (spillInfoList) {
                    for (SpillInfo spillInfo : spillInfoList) {
                        TezIndexRecord indexRecord = spillInfo.spillRecord.getIndex(i);
                        if (indexRecord.getPartLength() == 0) {
                            // Skip empty partitions within a spill
                            continue;
                        }
                        FSDataInputStream in = rfs.open(spillInfo.outPath);
                        in.seek(indexRecord.getStartOffset());
                        IFile.Reader reader = new IFile.Reader(in, indexRecord.getPartLength(), codec, null,
                                additionalSpillBytesReadCounter, ifileReadAhead, ifileReadAheadLength,
                                ifileBufferSize);
                        while (reader.nextRawKey(keyBufferIFile)) {
                            // TODO Inefficient. If spills are not compressed, a direct copy should be possible
                            // given the current IFile format. Also extremely inefficient for large records,
                            // since the entire record will be read into memory.
                            reader.nextRawValue(valBufferIFile);
                            writer.append(keyBufferIFile, valBufferIFile);
                        }
                        reader.close();
                    }
                }
                writer.close();
                fileOutputBytesCounter.increment(writer.getCompressedLength());
                TezIndexRecord indexRecord = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                writer = null;
                finalSpillRecord.putIndex(indexRecord, i);
            } finally {
                if (writer != null) {
                    writer.close();
                }
            }
        }
    } finally {
        if (out != null) {
            out.close();
        }
    }
    finalSpillRecord.writeToFile(finalIndexPath, conf);
    fileOutputBytesCounter.increment(indexFileSizeEstimate);
    LOG.info("Finished final spill after merging : " + numSpills.get() + " spills");
}
From source file:org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.java
License:Apache License
private void writeLargeRecord(final Object key, final Object value, final int partition,
        final int spillNumber) throws IOException {
    numAdditionalSpillsCounter.increment(1);
    long size = sizePerBuffer - (currentBuffer.numRecords * META_SIZE) - currentBuffer.skipSize
            + numPartitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    long outSize = 0;
    try {
        final TezSpillRecord spillRecord = new TezSpillRecord(numPartitions);
        final Path outPath = outputFileHandler.getSpillFileForWrite(spillNumber, size);
        out = rfs.create(outPath);
        for (int i = 0; i < numPartitions; i++) {
            final long recordStart = out.getPos();
            if (i == partition) {
                spilledRecordsCounter.increment(1);
                Writer writer = null;
                try {
                    writer = new IFile.Writer(conf, out, keyClass, valClass, codec, null, null);
                    writer.append(key, value);
                    outputLargeRecordsCounter.increment(1);
                    numRecordsPerPartition[i]++;
                    writer.close();
                    additionalSpillBytesWritternCounter.increment(writer.getCompressedLength());
                    TezIndexRecord indexRecord = new TezIndexRecord(recordStart, writer.getRawLength(),
                            writer.getCompressedLength());
                    spillRecord.putIndex(indexRecord, i);
                    outSize = writer.getCompressedLength();
                    writer = null;
                } finally {
                    if (writer != null) {
                        writer.close();
                    }
                }
            }
        }
        SpillInfo spillInfo = new SpillInfo(spillRecord, outPath);
        spillInfoList.add(spillInfo);
        LOG.info("Finished writing large record of size " + outSize + " to spill file " + spillNumber);
    } finally {
        if (out != null) {
            out.close();
        }
    }
}
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
        throws IOException {
    List<TestRecord> recordSet = ArcFileReaderTests
            .buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");

    FSDataOutputStream os = fs.create(filePath);
    try {
        // write the ARC File into memory
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long streamPos = os.getPos();

        long testAttemptTime = System.currentTimeMillis();

        NIOHttpHeaders testHeaders = new NIOHttpHeaders();
        testHeaders.add("test", "test-value");

        for (TestRecord record : recordSet) {
            long preWritePos = os.getPos();
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
            long postWritePos = os.getPos();
            record.streamPos = (int) preWritePos;
            record.rawSize = (int) (postWritePos - preWritePos);
        }
        os.flush();
    } finally {
        os.close();
    }
    return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
        throws IOException {
    List<TestRecord> recordSet = ArcFileReaderTests
            .buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");

    FSDataOutputStream os = fs.create(filePath);
    try {
        // write the ARC File into memory
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        NIOHttpHeaders testHeaders = new NIOHttpHeaders();
        testHeaders.add("test", "test-value");

        for (TestRecord record : recordSet) {
            long preWritePos = os.getPos();
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
            long postWritePos = os.getPos();
            record.streamPos = (int) preWritePos;
            record.rawSize = (int) (postWritePos - preWritePos);
        }
        os.flush();
    } finally {
        os.close();
    }
    return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}
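Both ARC-file builders bracket each record write with getPos() calls so the record's byte offset and on-disk size can be stored back on the record for later verification. The same measurement in isolation, as a sketch that assumes the record bytes are already serialized (the real code delegates serialization to ArcFileReaderTests.write):

import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;

class RecordMeasurer {
    /** Appends a serialized record and returns {startOffset, bytesWritten}. */
    static long[] appendAndMeasure(FSDataOutputStream os, byte[] serializedRecord) throws IOException {
        long preWritePos = os.getPos();   // where this record begins
        os.write(serializedRecord);       // stand-in for ArcFileReaderTests.write(...)
        long postWritePos = os.getPos();  // first byte past the record
        return new long[] { preWritePos, postWritePos - preWritePos };
    }
}

Because getPos() reflects what has actually been pushed to the wrapped stream, this works even when the writer compresses on the fly, as with the .arc.gz output here: the delta is the compressed on-disk size, not the logical record size.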
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private long runInlinksLocalQuery(DatabaseIndexV2.MasterDatabaseIndex index, FileSystem inputFileSystem,
        Path inlinksInputPath, FileSystem outputFileSystem, Path inlinksDomainIndexPath,
        Path inlinksDetailOutputPath) throws IOException {

    long recordCount = 0L;

    outputFileSystem.delete(inlinksDomainIndexPath);
    outputFileSystem.delete(inlinksDetailOutputPath);

    FSDataInputStream remoteInputStream = inputFileSystem.open(inlinksInputPath);
    try {
        FSDataOutputStream indexOutputStream = outputFileSystem.create(inlinksDomainIndexPath);
        FSDataOutputStream detailOutputStream = outputFileSystem.create(inlinksDetailOutputPath);

        ArrayList<InlinkingDomainInfo> domainList = new ArrayList<InlinkingDomainInfo>();

        try {
            LOG.info("Writing Detail Stream to:" + inlinksDetailOutputPath);
            CompressedURLFPListV2.Reader reader = new CompressedURLFPListV2.Reader(remoteInputStream);

            InlinkingDomainInfo lastDomain = null;

            while (reader.hasNext()) {
                // read the next fingerprint
                URLFPV2 fingerprint = reader.next();
                // and first see if we have a domain transition
                if (lastDomain == null || lastDomain.getDomainId() != fingerprint.getDomainHash()) {
                    // remember the domain
                    lastDomain = new InlinkingDomainInfo();
                    lastDomain.setDomainId(fingerprint.getDomainHash());
                    // add it to the list
                    domainList.add(lastDomain);
                    // update data position
                    lastDomain.setUrlDataPos(detailOutputStream.getPos());
                }
                // increment url count for the domain
                lastDomain.setUrlCount(lastDomain.getUrlCount() + 1);

                detailOutputStream.writeLong(fingerprint.getDomainHash());
                detailOutputStream.writeLong(fingerprint.getUrlHash());

                recordCount++;
            }

            LOG.info("Retrieving Domain Metadata for :" + domainList.size() + " Domain Records");
            // ok, now resolve domain names
            for (InlinkingDomainInfo domain : domainList) {
                SubDomainMetadata metadata = index.queryDomainMetadataGivenDomainId(domain.getDomainId());
                if (metadata == null) {
                    LOG.error("*** Failed to Resolve DomainId:" + domain.getDomainId());
                } else {
                    if (metadata.getDomainText().length() == 0) {
                        LOG.error("*** Metadata for Domain Id:" + domain.getDomainId()
                                + " contained NULL Name Value.");
                        domain.setDomainName("_ERROR:BAD RECORD");
                    } else {
                        domain.setDomainName(metadata.getDomainText());
                    }
                    //LOG.info("***Found Domain:" + domain.getDomainName() + " urlCount:" + domain.getUrlCount());
                }
            }

            LOG.info("Sorting Domain List of Size:" + domainList.size());
            // ok sort by domain name
            Collections.sort(domainList);

            LOG.info("Building In Memory Index");
            // ok write out domain info
            DataOutputBuffer indexHeaderBuffer = new DataOutputBuffer();
            DataOutputBuffer indexDataBuffer = new DataOutputBuffer();

            LOG.info("***Writing Domain List Size:" + domainList.size());
            indexHeaderBuffer.writeInt(domainList.size());

            // ok iterate and write to both buffers
            for (InlinkingDomainInfo domain : domainList) {
                indexHeaderBuffer.writeInt(indexDataBuffer.getLength());
                domain.write(indexDataBuffer);
            }

            LOG.info("Writing Index to:" + inlinksDomainIndexPath + " IndexHeaderLength:"
                    + indexHeaderBuffer.getLength() + " IndexDataLength:" + indexDataBuffer.getLength());
            // ok now flush both buffers to disk
            indexOutputStream.write(indexHeaderBuffer.getData(), 0, indexHeaderBuffer.getLength());
            indexOutputStream.write(indexDataBuffer.getData(), 0, indexDataBuffer.getLength());
        } finally {
            indexOutputStream.flush();
            indexOutputStream.close();
            detailOutputStream.flush();
            detailOutputStream.close();
        }
    } finally {
        remoteInputStream.close();
    }
    return recordCount;
}
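This example uses getPos() as a bookmark rather than a length: whenever the domain id changes, it records the detail stream's current position so the index can later seek straight to that domain's run of records. Reduced to its essentials, as a sketch with hypothetical names (records assumed pre-grouped by group id, as the fingerprint stream above is by domain):

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.FSDataOutputStream;

class GroupOffsetIndex {
    /** Writes grouped (groupId, payload) pairs, recording where each group's run begins. */
    static Map<Long, Long> writeAndIndex(FSDataOutputStream detailOut, List<long[]> records)
            throws IOException {
        Map<Long, Long> groupStartOffsets = new LinkedHashMap<>();
        Long lastGroup = null;
        for (long[] rec : records) {            // rec[0] = group id, rec[1] = payload
            if (lastGroup == null || rec[0] != lastGroup) {
                // group transition: bookmark where this group's records begin
                groupStartOffsets.put(rec[0], detailOut.getPos());
                lastGroup = rec[0];
            }
            detailOut.writeLong(rec[0]);
            detailOut.writeLong(rec[1]);
        }
        return groupStartOffsets;
    }
}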
From source file:sg.edu.astar.dsi.mergespill.App.java
public synchronized static void doProcess(String directory, int spillNumber)
        throws IOException, InterruptedException {
    // TODO code application logic here
    System.out.println("directory: " + directory);
    System.out.println("numberOfSpill: " + spillNumber);

    // SETUP
    JobConf job = new JobConf();
    //job.setMapOutputKeyClass(Text.class);
    job.setMapOutputKeyClass(TextDsi.class);
    job.setMapOutputValueClass(IntWritable.class);

    //Class<Text> keyClass = (Class<Text>) job.getMapOutputKeyClass();
    Class<TextDsi> keyClass = (Class<TextDsi>) job.getMapOutputKeyClass();
    Class<IntWritable> valClass = (Class<IntWritable>) job.getMapOutputValueClass();

    FileSystem rfs;
    CompressionCodec codec = null;
    Counters.Counter spilledRecordsCounter = null;
    rfs = ((LocalFileSystem) FileSystem.getLocal(job)).getRaw();

    while (!new File(directory).isDirectory()) {
        sleep(5000);
    }

    if (new File(directory).isDirectory()) {
        ArrayList<Path> spillFile = new ArrayList();
        ArrayList<Path> spillFileIndex = new ArrayList();

        App myApp;
        myApp = new App();

        myApp.getSpillFilesAndIndices(new File(directory), spillFile, spillFileIndex, spillNumber);

        ArrayList<SpillRecord> indexCacheList = new ArrayList<>();
        int numSpills = 0;

        Iterator itrSpillFileIndex = spillFileIndex.iterator();
        while (itrSpillFileIndex.hasNext()) {
            numSpills++;
            Path temp = (Path) itrSpillFileIndex.next();
            System.out.println(temp);
            SpillRecord sr = new SpillRecord(temp, job);
            indexCacheList.add(sr);

            System.out.println("indexFile partition size: " + sr.size());
            long startOffset = 0;
            for (int i = 0; i < sr.size(); i++) { // sr.size is the number of partitions
                IndexRecord ir = sr.getIndex(i);
                System.out.println("index[" + i + "] rawLength = " + ir.rawLength);
                System.out.println("index[" + i + "] partLength = " + ir.partLength);
                System.out.println("index[" + i + "] startOffset= " + ir.startOffset);
                startOffset = ir.startOffset;
            }
            System.out.println("========================================");
        }
        System.out.println("Number of spills: " + numSpills);

        // FinalOutputFile
        Path finalOutputFile = new Path(directory + File.separator + "FINALOUTPUTFILE");
        FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
        System.out.println("GOT HERE 1");
        Path finalIndexFile = new Path(directory + File.separator + "FINALOUTPUTFILE.index");

        // ONE PARTITION ONLY
        List<Segment<TextDsi, IntWritable>> segmentList = new ArrayList<>(numSpills);
        for (int i = 0; i < numSpills; i++) {
            IndexRecord theIndexRecord = indexCacheList.get(i).getIndex(0);
            Path temp = spillFileIndex.get(i);
            String temp1 = temp.toString();
            String temp2 = temp1.substring(0, temp1.length() - 6);
            //System.out.println(temp2);
            //System.out.println(new Path(temp2).getParent());
            //File myFile = new File(temp2);
            //System.out.println(myFile.getPath());
            Segment<TextDsi, IntWritable> s = new Segment<>(job, rfs, new Path(temp2),
                    theIndexRecord.startOffset, theIndexRecord.partLength, codec, true);
            segmentList.add(i, s);
        }
        System.out.println("GOT HERE 2");

        RawKeyValueIterator kvIter = Merger.merge(job, rfs, keyClass, valClass, null, segmentList, 4,
                new Path("/home/hduser/spillSample2/My"), job.getOutputKeyComparator(), null, false, null,
                spilledRecordsCounter, null, TaskType.MAP);
        System.out.println("GOT HERE 3");

        // write merged output to disk
        long segmentStart = finalOut.getPos();
        FSDataOutputStream finalPartitionOut = CryptoUtils.wrapIfNecessary(job, finalOut);
        Writer<TextDsi, IntWritable> writer = new Writer<TextDsi, IntWritable>(job, finalPartitionOut,
                TextDsi.class, IntWritable.class, codec, spilledRecordsCounter);
        System.out.println("GOT HERE 4");

        Merger.writeFile(kvIter, writer, null, job);
        writer.close();
        finalOut.close();
        System.out.println("GOT HERE 5");

        IndexRecord rec = new IndexRecord();
        final SpillRecord spillRec = new SpillRecord(1);
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        System.out.println("rec.startOffset: " + rec.startOffset);
        System.out.println("rec.rawLength : " + rec.rawLength);
        System.out.println("rec.partLength : " + rec.partLength);
        spillRec.putIndex(rec, 0);
        spillRec.writeToFile(finalIndexFile, job);
        System.out.println("GOT HERE 6");
    } else {
        System.out.println("argument is not a directory! : " + directory);
    }
}
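One detail worth noting in this last example: segmentStart is read with getPos() on the raw finalOut stream before it is wrapped by CryptoUtils.wrapIfNecessary, so the index record stores the physical offset in the file even when intermediate encryption is enabled; the rawLength and partLength fields then add CryptoUtils.cryptoPadding(job) to account for the wrapper's overhead, matching how Hadoop's own merge path builds its spill index.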