Example usage for org.apache.hadoop.fs FSDataOutputStream getPos

List of usage examples for org.apache.hadoop.fs FSDataOutputStream getPos

Introduction

On this page you can find example usage of org.apache.hadoop.fs.FSDataOutputStream.getPos().

Prototype

public long getPos() 

Document

Get the current position in the output stream.
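
Before the full examples below, here is a minimal, self-contained sketch of the typical pattern: call getPos() before and after a write to measure how many bytes the write produced. The output path and payload are hypothetical placeholders; only the FileSystem and FSDataOutputStream calls come from the Hadoop API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetPosExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical output path; any writable path on the configured file system works.
        Path path = new Path("/tmp/getpos-example.dat");
        FSDataOutputStream out = fs.create(path, true);
        try {
            long start = out.getPos();                // position before the record (0 for a new file)
            out.writeBytes("hello, world\n");         // write an arbitrary payload
            long bytesWritten = out.getPos() - start; // size of the record just written
            System.out.println("bytes written: " + bytesWritten);
        } finally {
            out.close();
        }
    }
}

This is the same bookkeeping the Tez and CommonCrawl examples below perform around each IFile segment or ARC record.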

Usage

From source file:org.apache.tez.runtime.library.common.sort.impl.dflt.DefaultSorter.java

License:Apache License

/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final Object key, final Object value, int partition) throws IOException {
    long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    try {
        // create spill file
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
        spillFilePaths.put(numSpills, filename);
        out = rfs.create(filename);

        // we don't run the combiner for a single record
        for (int i = 0; i < partitions; ++i) {
            IFile.Writer writer = null;
            try {
                long segmentStart = out.getPos();
                // Create a new codec, don't care!
                writer = new IFile.Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null);

                if (i == partition) {
                    final long recordStart = out.getPos();
                    writer.append(key, value);
                    // Note that our map byte count will not be accurate with
                    // compression
                    mapOutputByteCounter.increment(out.getPos() - recordStart);
                }
                writer.close();

                if (numSpills > 0) {
                    additionalSpillBytesWritten.increment(writer.getCompressedLength());
                    numAdditionalSpills.increment(1);
                    outputBytesWithOverheadCounter.setValue(0);
                } else {
                    // Set this up for the first write only. Subsequent ones will be handled in the final merge.
                    outputBytesWithOverheadCounter.increment(writer.getRawLength());
                }

                // record offsets
                TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                spillRec.putIndex(rec, i);

                writer = null;
            } catch (IOException e) {
                if (null != writer)
                    writer.close();
                throw e;
            }
        }
        if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
            // create spill index file
            Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                    partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            spillFileIndexPaths.put(numSpills, indexFilename);
            spillRec.writeToFile(indexFilename, conf);
        } else {
            indexCacheList.add(spillRec);
            totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
        }
        ++numSpills;
    } finally {
        if (out != null)
            out.close();
    }
}

From source file:org.apache.tez.runtime.library.common.sort.impl.dflt.DefaultSorter.java

License:Apache License

private void mergeParts() throws IOException {
    // get the approximate size of the final output/index files
    long finalOutFileSize = 0;
    long finalIndexFileSize = 0;
    final Path[] filename = new Path[numSpills];
    final String taskIdentifier = outputContext.getUniqueIdentifier();

    for (int i = 0; i < numSpills; i++) {
        filename[i] = spillFilePaths.get(i);
        finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
    }
    if (numSpills == 1) { //the spill is the final output
        finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename[0]);
        finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]);

        sameVolRename(filename[0], finalOutputFile);
        if (indexCacheList.size() == 0) {
            sameVolRename(spillFileIndexPaths.get(0), finalIndexFile);
        } else {
            indexCacheList.get(0).writeToFile(finalIndexFile, conf);
        }
        return;
    }

    // read in paged indices
    for (int i = indexCacheList.size(); i < numSpills; ++i) {
        Path indexFileName = spillFileIndexPaths.get(i);
        indexCacheList.add(new TezSpillRecord(indexFileName, conf));
    }

    //make correction in the length to include the sequence file header
    //lengths for each partition
    finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
    finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
    finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);

    //The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    if (numSpills == 0) {
        // TODO Change event generation to say there is no data rather than generating a dummy file
        //create dummy files

        TezSpillRecord sr = new TezSpillRecord(partitions);
        try {
            for (int i = 0; i < partitions; i++) {
                long segmentStart = finalOut.getPos();
                Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null);
                writer.close();

                TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                // Covers the case of multiple spills.
                outputBytesWithOverheadCounter.increment(writer.getRawLength());
                sr.putIndex(rec, i);
            }
            sr.writeToFile(finalIndexFile, conf);
        } finally {
            finalOut.close();
        }
        return;
    } else {
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        for (int parts = 0; parts < partitions; parts++) {
            //create the segments to be merged
            List<Segment> segmentList = new ArrayList<Segment>(numSpills);
            for (int i = 0; i < numSpills; i++) {
                TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

                Segment s = new Segment(rfs, filename[i], indexRecord.getStartOffset(),
                        indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength,
                        ifileBufferSize, true);
                segmentList.add(i, s);

                if (LOG.isDebugEnabled()) {
                    LOG.debug("TaskIdentifier=" + taskIdentifier + " Partition=" + parts + "Spill =" + i + "("
                            + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", "
                            + indexRecord.getPartLength() + ")");
                }
            }

            int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,
                    TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
            // sort the segments only if there are intermediate merges
            boolean sortSegments = segmentList.size() > mergeFactor;
            //merge
            TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList,
                    mergeFactor, new Path(taskIdentifier),
                    (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), nullProgressable,
                    sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null); // Not using any Progress in TezMerger. Should just work.

            //write merged output to disk
            long segmentStart = finalOut.getPos();
            Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null);
            if (combiner == null || numSpills < minSpillsForCombine) {
                TezMerger.writeFile(kvIter, writer, nullProgressable,
                        TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
            } else {
                runCombineProcessor(kvIter, writer);
            }
            writer.close();

            // record offsets
            final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                    writer.getCompressedLength());
            spillRec.putIndex(rec, parts);
        }
        spillRec.writeToFile(finalIndexFile, conf);
        finalOut.close();
        for (int i = 0; i < numSpills; i++) {
            rfs.delete(filename[i], true);
        }
    }
}

From source file:org.apache.tez.runtime.library.common.sort.impl.PipelinedSorter.java

License:Apache License

public void spill() throws IOException {
    // create spill file
    final long size = capacity + (partitions * APPROX_HEADER_LENGTH);
    final TezSpillRecord spillRec = new TezSpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
    spillFilePaths.put(numSpills, filename);
    FSDataOutputStream out = rfs.create(filename, true, 4096);

    try {
        merger.ready(); // wait for all the future results from sort threads
        LOG.info("Spilling to " + filename.toString());
        for (int i = 0; i < partitions; ++i) {
            TezRawKeyValueIterator kvIter = merger.filter(i);
            //write merged output to disk
            long segmentStart = out.getPos();
            Writer writer = new Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null,
                    merger.needsRLE());
            if (combiner == null) {
                while (kvIter.next()) {
                    writer.append(kvIter.getKey(), kvIter.getValue());
                }
            } else {
                runCombineProcessor(kvIter, writer);
            }
            //close
            writer.close();

            // record offsets
            final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                    writer.getCompressedLength());
            spillRec.putIndex(rec, i);
        }

        Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills,
                partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
        spillFileIndexPaths.put(numSpills, indexFilename);
        // TODO: cache
        spillRec.writeToFile(indexFilename, conf);
        ++numSpills;
    } catch (InterruptedException ie) {
        // TODO:the combiner has been interrupted
    } finally {
        out.close();
    }
}

From source file:org.apache.tez.runtime.library.common.sort.impl.PipelinedSorter.java

License:Apache License

@Override
public void flush() throws IOException {
    final String uniqueIdentifier = outputContext.getUniqueIdentifier();
    finalOutputFile = mapOutputFile.getOutputFileForWrite(0); //TODO
    finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(0); //TODO

    LOG.info("Starting flush of map output");
    span.end();
    merger.add(span.sort(sorter));
    spill();
    sortmaster.shutdown();

    //safe to clean up
    bufferList.clear();

    numAdditionalSpills.increment(numSpills - 1);

    if (numSpills == 1) {
        // someday be able to pass this directly to shuffle
        // without writing to disk
        final Path filename = spillFilePaths.get(0);
        final Path indexFilename = spillFileIndexPaths.get(0);
        finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename);
        finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(indexFilename);

        sameVolRename(filename, finalOutputFile);
        sameVolRename(indexFilename, finalIndexFile);
        return;
    }

    //The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);

    final TezSpillRecord spillRec = new TezSpillRecord(partitions);

    for (int i = 0; i < numSpills; i++) {
        // TODO: build this cache before
        Path indexFilename = spillFileIndexPaths.get(i);
        TezSpillRecord spillIndex = new TezSpillRecord(indexFilename, conf);
        indexCacheList.add(spillIndex);
    }

    for (int parts = 0; parts < partitions; parts++) {
        //create the segments to be merged
        List<Segment> segmentList = new ArrayList<Segment>(numSpills);
        for (int i = 0; i < numSpills; i++) {
            Path spillFilename = spillFilePaths.get(i);
            TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);

            Segment s = new Segment(rfs, spillFilename, indexRecord.getStartOffset(),
                    indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize,
                    true);
            segmentList.add(i, s);
        }

        int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,
                TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
        // sort the segments only if there are intermediate merges
        boolean sortSegments = segmentList.size() > mergeFactor;
        //merge
        TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList,
                mergeFactor, new Path(uniqueIdentifier),
                (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), nullProgressable,
                sortSegments, true, null, spilledRecordsCounter, null, null); // Not using any Progress in TezMerger. Should just work.

        //write merged output to disk
        long segmentStart = finalOut.getPos();
        Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null,
                merger.needsRLE());
        if (combiner == null || numSpills < minSpillsForCombine) {
            TezMerger.writeFile(kvIter, writer, nullProgressable,
                    TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
        } else {
            runCombineProcessor(kvIter, writer);
        }

        //close
        writer.close();

        // record offsets
        final TezIndexRecord rec = new TezIndexRecord(segmentStart, writer.getRawLength(),
                writer.getCompressedLength());
        spillRec.putIndex(rec, parts);
    }

    spillRec.writeToFile(finalIndexFile, conf);
    finalOut.close();
    for (int i = 0; i < numSpills; i++) {
        Path indexFilename = spillFileIndexPaths.get(i);
        Path spillFilename = spillFilePaths.get(i);
        rfs.delete(indexFilename, true);
        rfs.delete(spillFilename, true);
    }

    spillFileIndexPaths.clear();
    spillFilePaths.clear();
}

From source file:org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.java

License:Apache License

private void mergeAll() throws IOException {
    long expectedSize = spilledSize;
    if (currentBuffer.nextPosition != 0) {
        expectedSize += currentBuffer.nextPosition - (currentBuffer.numRecords * META_SIZE)
                - currentBuffer.skipSize + numPartitions * APPROX_HEADER_LENGTH;
        // Update final statistics.
        updateGlobalStats(currentBuffer);
    }

    long indexFileSizeEstimate = numPartitions * Constants.MAP_OUTPUT_INDEX_RECORD_LENGTH;
    finalOutPath = outputFileHandler.getOutputFileForWrite(expectedSize);
    finalIndexPath = outputFileHandler.getOutputIndexFileForWrite(indexFileSizeEstimate);

    TezSpillRecord finalSpillRecord = new TezSpillRecord(numPartitions);

    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();

    DataInputBuffer keyBufferIFile = new DataInputBuffer();
    DataInputBuffer valBufferIFile = new DataInputBuffer();

    FSDataOutputStream out = null;
    try {
        out = rfs.create(finalOutPath);
        Writer writer = null;

        for (int i = 0; i < numPartitions; i++) {
            long segmentStart = out.getPos();
            if (numRecordsPerPartition[i] == 0) {
                LOG.info("Skipping partition: " + i + " in final merge since it has no records");
                continue;
            }
            writer = new Writer(conf, out, keyClass, valClass, codec, null, null);
            try {
                if (currentBuffer.nextPosition != 0
                        && currentBuffer.partitionPositions[i] != WrappedBuffer.PARTITION_ABSENT_POSITION) {
                    // Write current buffer.
                    writePartition(currentBuffer.partitionPositions[i], currentBuffer, writer, keyBuffer,
                            valBuffer);
                }
                synchronized (spillInfoList) {
                    for (SpillInfo spillInfo : spillInfoList) {
                        TezIndexRecord indexRecord = spillInfo.spillRecord.getIndex(i);
                        if (indexRecord.getPartLength() == 0) {
                            // Skip empty partitions within a spill
                            continue;
                        }
                        FSDataInputStream in = rfs.open(spillInfo.outPath);
                        in.seek(indexRecord.getStartOffset());
                        IFile.Reader reader = new IFile.Reader(in, indexRecord.getPartLength(), codec, null,
                                additionalSpillBytesReadCounter, ifileReadAhead, ifileReadAheadLength,
                                ifileBufferSize);
                        while (reader.nextRawKey(keyBufferIFile)) {
                            // TODO Inefficient. If spills are not compressed, a direct copy should be possible
                            // given the current IFile format. Also extremely inefficient for large records,
                            // since the entire record will be read into memory.
                            reader.nextRawValue(valBufferIFile);
                            writer.append(keyBufferIFile, valBufferIFile);
                        }
                        reader.close();
                    }
                }
                writer.close();
                fileOutputBytesCounter.increment(writer.getCompressedLength());
                TezIndexRecord indexRecord = new TezIndexRecord(segmentStart, writer.getRawLength(),
                        writer.getCompressedLength());
                writer = null;
                finalSpillRecord.putIndex(indexRecord, i);
            } finally {
                if (writer != null) {
                    writer.close();
                }
            }
        }
    } finally {
        if (out != null) {
            out.close();
        }
    }
    finalSpillRecord.writeToFile(finalIndexPath, conf);
    fileOutputBytesCounter.increment(indexFileSizeEstimate);
    LOG.info("Finished final spill after merging : " + numSpills.get() + " spills");
}

From source file:org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.java

License:Apache License

private void writeLargeRecord(final Object key, final Object value, final int partition, final int spillNumber)
        throws IOException {
    numAdditionalSpillsCounter.increment(1);
    long size = sizePerBuffer - (currentBuffer.numRecords * META_SIZE) - currentBuffer.skipSize
            + numPartitions * APPROX_HEADER_LENGTH;
    FSDataOutputStream out = null;
    long outSize = 0;
    try {
        final TezSpillRecord spillRecord = new TezSpillRecord(numPartitions);
        final Path outPath = outputFileHandler.getSpillFileForWrite(spillNumber, size);
        out = rfs.create(outPath);
        for (int i = 0; i < numPartitions; i++) {
            final long recordStart = out.getPos();
            if (i == partition) {
                spilledRecordsCounter.increment(1);
                Writer writer = null;
                try {
                    writer = new IFile.Writer(conf, out, keyClass, valClass, codec, null, null);
                    writer.append(key, value);
                    outputLargeRecordsCounter.increment(1);
                    numRecordsPerPartition[i]++;
                    writer.close();
                    additionalSpillBytesWritternCounter.increment(writer.getCompressedLength());
                    TezIndexRecord indexRecord = new TezIndexRecord(recordStart, writer.getRawLength(),
                            writer.getCompressedLength());
                    spillRecord.putIndex(indexRecord, i);
                    outSize = writer.getCompressedLength();
                    writer = null;
                } finally {
                    if (writer != null) {
                        writer.close();
                    }
                }
            }
        }
        SpillInfo spillInfo = new SpillInfo(spillRecord, outPath);
        spillInfoList.add(spillInfo);
        LOG.info("Finished writing large record of size " + outSize + " to spill file " + spillNumber);
    } finally {
        if (out != null) {
            out.close();
        }
    }
}

From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java

License:Apache License

static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
        throws IOException {
    List<TestRecord> recordSet = ArcFileReaderTests
            .buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
    Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");
    FSDataOutputStream os = fs.create(filePath);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
        long streamPos = os.getPos();

        long testAttemptTime = System.currentTimeMillis();
        NIOHttpHeaders testHeaders = new NIOHttpHeaders();
        testHeaders.add("test", "test-value");

        for (TestRecord record : recordSet) {
            long preWritePos = os.getPos();
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
            long postWritePos = os.getPos();
            record.streamPos = (int) preWritePos;
            record.rawSize = (int) (postWritePos - preWritePos);
        }
        os.flush();
    } finally {
        os.close();
    }
    return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}

From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java

License:Apache License

static Pair<Path, List<TestRecord>> buildTestARCFile(Path directoryPath, FileSystem fs, int fileId)
        throws IOException {
    List<TestRecord> recordSet = ArcFileReaderTests
            .buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);
    Path filePath = new Path(directoryPath, Integer.toString(fileId) + ".arc.gz");
    FSDataOutputStream os = fs.create(filePath);
    try {
        // write the ARC File into memory 
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());

        long testAttemptTime = System.currentTimeMillis();

        NIOHttpHeaders testHeaders = new NIOHttpHeaders();
        testHeaders.add("test", "test-value");

        for (TestRecord record : recordSet) {
            long preWritePos = os.getPos();
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    testHeaders, "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
            long postWritePos = os.getPos();
            record.streamPos = (int) preWritePos;
            record.rawSize = (int) (postWritePos - preWritePos);
        }
        os.flush();
    } finally {
        os.close();
    }
    return new Pair<Path, List<TestRecord>>(filePath, recordSet);
}

From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java

License:Open Source License

private long runInlinksLocalQuery(DatabaseIndexV2.MasterDatabaseIndex index, FileSystem inputFileSystem,
        Path inlinksInputPath, FileSystem outputFileSystem, Path inlinksDomainIndexPath,
        Path inlinksDetailOutputPath) throws IOException {

    long recordCount = 0L;

    outputFileSystem.delete(inlinksDomainIndexPath);
    outputFileSystem.delete(inlinksDetailOutputPath);

    FSDataInputStream remoteInputStream = inputFileSystem.open(inlinksInputPath);

    try {

        FSDataOutputStream indexOutputStream = outputFileSystem.create(inlinksDomainIndexPath);
        FSDataOutputStream detailOutputStream = outputFileSystem.create(inlinksDetailOutputPath);

        ArrayList<InlinkingDomainInfo> domainList = new ArrayList<InlinkingDomainInfo>();

        try {

            LOG.info("Writing Detail Stream to:" + inlinksDetailOutputPath);
            CompressedURLFPListV2.Reader reader = new CompressedURLFPListV2.Reader(remoteInputStream);

            InlinkingDomainInfo lastDomain = null;

            while (reader.hasNext()) {

                // read the next fingerprint
                URLFPV2 fingerprint = reader.next();
                // and first see if we have a domain transition 
                if (lastDomain == null || lastDomain.getDomainId() != fingerprint.getDomainHash()) {
                    // remember the domain 
                    lastDomain = new InlinkingDomainInfo();
                    lastDomain.setDomainId(fingerprint.getDomainHash());
                    // add it to the list 
                    domainList.add(lastDomain);
                    // update data position
                    lastDomain.setUrlDataPos(detailOutputStream.getPos());
                }
                // increment url count for the domain
                lastDomain.setUrlCount(lastDomain.getUrlCount() + 1);

                detailOutputStream.writeLong(fingerprint.getDomainHash());
                detailOutputStream.writeLong(fingerprint.getUrlHash());

                recordCount++;
            }

            LOG.info("Retrieving Domain Metadata for :" + domainList.size() + " Domain Records");
            // ok, now resolve domain names
            for (InlinkingDomainInfo domain : domainList) {
                SubDomainMetadata metadata = index.queryDomainMetadataGivenDomainId(domain.getDomainId());
                if (metadata == null) {
                    LOG.error("*** Failed to Resolve DomainId:" + domain.getDomainId());
                } else {
                    if (metadata.getDomainText().length() == 0) {
                        LOG.error("*** Metadata for Domain Id:" + domain.getDomainId()
                                + " contained NULL Name Value.");
                        domain.setDomainName("_ERROR:BAD RECORD");
                    } else {
                        domain.setDomainName(metadata.getDomainText());
                    }
                    //LOG.info("***Found Domain:" + domain.getDomainName() + " urlCount:" + domain.getUrlCount());
                }
            }

            LOG.info("Sorting Domain List of Size:" + domainList.size());
            // ok sort by domain name 
            Collections.sort(domainList);

            LOG.info("Building In Memory Index");

            // ok write out domain info
            DataOutputBuffer indexHeaderBuffer = new DataOutputBuffer();
            DataOutputBuffer indexDataBuffer = new DataOutputBuffer();

            LOG.info("***Writing Domain List Size:" + domainList.size());
            indexHeaderBuffer.writeInt(domainList.size());

            // ok iterate and write to both buffers  
            for (InlinkingDomainInfo domain : domainList) {
                indexHeaderBuffer.writeInt(indexDataBuffer.getLength());
                domain.write(indexDataBuffer);
            }

            LOG.info("Writing Index to:" + inlinksDomainIndexPath + " IndexHeaderLength:"
                    + indexHeaderBuffer.getLength() + " IndexDataLength:" + indexDataBuffer.getLength());
            // ok now flush both buffers to disk
            indexOutputStream.write(indexHeaderBuffer.getData(), 0, indexHeaderBuffer.getLength());
            indexOutputStream.write(indexDataBuffer.getData(), 0, indexDataBuffer.getLength());
        } finally {
            indexOutputStream.flush();
            indexOutputStream.close();
            detailOutputStream.flush();
            detailOutputStream.close();
        }
    } finally {
        remoteInputStream.close();
    }
    return recordCount;
}

From source file:sg.edu.astar.dsi.mergespill.App.java

public synchronized static void doProcess(String directory, int spillNumber)
        throws IOException, InterruptedException {
    // TODO code application logic here
    System.out.println("directory: " + directory);
    System.out.println("numberOfSpill: " + spillNumber);
    //SETUP
    JobConf job = new JobConf();
    //job.setMapOutputKeyClass(Text.class);
    job.setMapOutputKeyClass(TextDsi.class);
    job.setMapOutputValueClass(IntWritable.class);
    //Class<Text> keyClass = (Class<Text>)job.getMapOutputKeyClass();
    Class<TextDsi> keyClass = (Class<TextDsi>) job.getMapOutputKeyClass();
    Class<IntWritable> valClass = (Class<IntWritable>) job.getMapOutputValueClass();
    FileSystem rfs;
    CompressionCodec codec = null;
    Counters.Counter spilledRecordsCounter = null;
    rfs = ((LocalFileSystem) FileSystem.getLocal(job)).getRaw();

    while (!new File(directory).isDirectory()) {
        sleep(5000);
    }

    if (new File(directory).isDirectory()) {
        ArrayList<Path> spillFile = new ArrayList<Path>();
        ArrayList<Path> spillFileIndex = new ArrayList<Path>();

        App myApp;
        myApp = new App();

        myApp.getSpillFilesAndIndices(new File(directory), spillFile, spillFileIndex, spillNumber);

        ArrayList<SpillRecord> indexCacheList = new ArrayList<>();
        int numSpills = 0;

        Iterator itrSpillFileIndex = spillFileIndex.iterator();
        while (itrSpillFileIndex.hasNext()) {
            numSpills++;
            Path temp = (Path) itrSpillFileIndex.next();
            System.out.println(temp);
            SpillRecord sr = new SpillRecord(temp, job);
            indexCacheList.add(sr);

            System.out.println("indexFile partition size: " + sr.size());
            long startOffset = 0;
            for (int i = 0; i < sr.size(); i++) { //sr.size is the number of partitions
                IndexRecord ir = sr.getIndex(i);
                System.out.println("index[" + i + "] rawLength = " + ir.rawLength);
                System.out.println("index[" + i + "] partLength = " + ir.partLength);
                System.out.println("index[" + i + "] startOffset= " + ir.startOffset);
                startOffset = ir.startOffset;
            }
            System.out.println("========================================");
        }
        System.out.println("Number of spills: " + numSpills);
        //FinalOutputFile
        Path finalOutputFile = new Path(directory + File.separator + "FINALOUTPUTFILE");
        FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
        System.out.println("GOT HERE 1");
        Path finalIndexFile = new Path(directory + File.separator + "FINALOUTPUTFILE.index");

        //ONE PARTITION ONLY
        List<Segment<TextDsi, IntWritable>> segmentList = new ArrayList<>(numSpills);
        for (int i = 0; i < numSpills; i++) {
            IndexRecord theIndexRecord = indexCacheList.get(i).getIndex(0);
            Path temp = spillFileIndex.get(i);
            String temp1 = temp.toString();
            String temp2 = temp1.substring(0, temp1.length() - 6);
            //System.out.println(temp2);
            //System.out.println(new Path(temp2).getParent());
            //File myFile = new File(temp2);
            //System.out.println(myFile.getPath());
            Segment<TextDsi, IntWritable> s = new Segment<>(job, rfs, new Path(temp2),
                    theIndexRecord.startOffset, theIndexRecord.partLength, codec, true);
            segmentList.add(i, s);
        }
        System.out.println("GOT HERE 2");
        RawKeyValueIterator kvIter = Merger.merge(job, rfs, keyClass, valClass, null, segmentList, 4,
                new Path("/home/hduser/spillSample2/My"), job.getOutputKeyComparator(), null, false, null,
                spilledRecordsCounter, null, TaskType.MAP);
        System.out.println("GOT HERE 3");
        //write merged output to disk
        long segmentStart = finalOut.getPos();
        FSDataOutputStream finalPartitionOut = CryptoUtils.wrapIfNecessary(job, finalOut);
        Writer<TextDsi, IntWritable> writer = new Writer<TextDsi, IntWritable>(job, finalPartitionOut,
                TextDsi.class, IntWritable.class, codec, spilledRecordsCounter);
        System.out.println("GOT HERE 4");
        Merger.writeFile(kvIter, writer, null, job);
        writer.close();
        finalOut.close();
        System.out.println("GOT HERE 5");

        IndexRecord rec = new IndexRecord();
        final SpillRecord spillRec = new SpillRecord(1);
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        System.out.println("rec.startOffset: " + rec.startOffset);
        System.out.println("rec.rawLength  : " + rec.rawLength);
        System.out.println("rec.partLength : " + rec.partLength);
        spillRec.putIndex(rec, 0);
        spillRec.writeToFile(finalIndexFile, job);
        System.out.println("GOT HERE 6");

    } else {
        System.out.println("argument is not a directory! : " + directory);
    }

}