List of usage examples for org.apache.hadoop.io.WritableUtils.readVInt
public static int readVInt(DataInput stream) throws IOException
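readVInt decodes an integer previously written with WritableUtils.writeVInt, which uses a variable-length encoding (small values occupy a single byte). A minimal, self-contained round-trip sketch using standard Hadoop buffer classes (the class name and values are illustrative, not taken from the sources below):

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;

public class ReadVIntRoundTrip {
  public static void main(String[] args) throws IOException {
    // write two ints with the variable-length encoding
    DataOutputBuffer out = new DataOutputBuffer();
    WritableUtils.writeVInt(out, 42);       // small value, one byte
    WritableUtils.writeVInt(out, 1000000);  // larger value, several bytes

    // read them back in the same order
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    System.out.println(WritableUtils.readVInt(in)); // 42
    System.out.println(WritableUtils.readVInt(in)); // 1000000
  }
}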
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
public static void dumpUnCrawledItems(File dataDir, long listId, File outputFilePath,
    boolean includeRobotsExcludedItems) throws IOException {

  File fixedDataFile = new File(dataDir, LIST_VALUE_MAP_PREFIX + Long.toString(listId));
  File variableDataFile = new File(dataDir, LIST_STRING_MAP_PREFIX + Long.toString(listId));

  LOG.info("FixedDataFile is:" + fixedDataFile);
  LOG.info("VariableDataFile is:" + variableDataFile);

  RandomAccessFile fixedDataReader = new RandomAccessFile(fixedDataFile, "r");
  RandomAccessFile stringDataReader = new RandomAccessFile(variableDataFile, "r");

  JsonWriter writer = new JsonWriter(
      new BufferedWriter(new FileWriter(outputFilePath), 1024 * 1024 * 10));
  writer.setIndent(" ");

  try {
    writer.beginObject();
    writer.name("urls");
    writer.beginArray();
    try {
      OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
      URLFP fingerprint = new URLFP();

      while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
        long position = fixedDataReader.getFilePointer();
        item.deserialize(fixedDataReader);

        // seek to string data
        stringDataReader.seek(item._stringsOffset);
        // and skip buffer length
        WritableUtils.readVInt(stringDataReader);
        // and read primary string
        String url = stringDataReader.readUTF();

        // setup fingerprint
        fingerprint.setDomainHash(item._domainHash);
        fingerprint.setUrlHash(item._urlFingerprint);

        // any item that has not been crawled needs to be queued
        boolean queueItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);

        // if item is not queued, check to see if we need to retry the item
        if (!queueItem && item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
          if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
            queueItem = (item._redirectStatus != 0);
            if (!queueItem) {
              if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) {
                queueItem = true;
              }
            }
          } else {
            queueItem = (item._crawlStatus != 0);
            if (!queueItem) {
              if (item._httpResultCode != 200 && item._httpResultCode != 404) {
                queueItem = true;
              }
            }
          }
        }

        if (queueItem) {
          // ok if queue item is set ...
          writer.beginObject();
          writer.name("url");
          writer.value(url);
          writer.name("redirected");
          writer.value((boolean) item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS));
          writer.name("lastStatus");
          if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
            if (item._redirectStatus == 0) {
              writer.value("HTTP-" + item._redirectHttpResult);
            } else {
              writer.value(CrawlURL.FailureReason.toString(item._redirectHttpResult));
            }
          } else {
            if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
              if (item._crawlStatus == 0) {
                writer.value("HTTP-" + item._httpResultCode);
              } else {
                writer.value(CrawlURL.FailureReason.toString(item._crawlStatus));
              }
            } else {
              writer.value("UNCRAWLED");
            }
          }
          writer.name("updateTime");
          writer.value(item._updateTimestamp);
          writer.endObject();
        }
      }
    } catch (IOException e) {
      LOG.error("Encountered Exception Queueing Items for List:" + listId + " Exception:"
          + CCStringUtils.stringifyException(e));
    } finally {
      fixedDataReader.close();
      stringDataReader.close();
    }
    writer.endArray();
    writer.endObject();
  } catch (Exception e) {
    LOG.error(CCStringUtils.stringifyException(e));
    throw new IOException(e);
  } finally {
    writer.flush();
    writer.close();
  }
}
From source file:org.commoncrawl.service.pagerank.slave.PageRankUtils.java
License:Open Source License
public static final int readURLFPAndCountFromStream(DataInput input, URLFPV2 fpOut) throws IOException {
  fpOut.setDomainHash(input.readLong());
  fpOut.setRootDomainHash(input.readLong());
  fpOut.setUrlHash(input.readLong());
  return WritableUtils.readVInt(input);
}
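The matching write side is not part of this snippet. A plausible counterpart, assuming the record layout implied by the reads above (three fixed-width longs followed by a VInt count) and hypothetical URLFPV2 getters, would be:

// Hypothetical writer mirroring the read order above: three longs, then a VInt count.
public static void writeURLFPAndCountToStream(DataOutput output, URLFPV2 fp, int count) throws IOException {
  output.writeLong(fp.getDomainHash());     // assumed getter
  output.writeLong(fp.getRootDomainHash()); // getter also used in the InverseLinksByDomainQuery example below
  output.writeLong(fp.getUrlHash());        // assumed getter
  WritableUtils.writeVInt(output, count);   // variable-length count
}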
From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java
License:Open Source License
private static void spillLinkDataIntoTempFileIndex(FileSystem remoteFileSystem, FileSystem localFileSystem,
    Configuration conf, DatabaseIndexV2.MasterDatabaseIndex index, File tempFilePath, Path outputFilePath,
    FlexBuffer linkData) throws IOException {

  SequenceFileSpillWriter<TextBytes, TriTextBytesTuple> outputWriter = new SequenceFileSpillWriter<TextBytes, TriTextBytesTuple>(
      localFileSystem, conf, outputFilePath, TextBytes.class, TriTextBytesTuple.class,
      new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
          PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFilePath)),
      true);

  try {
    // ok create merge sort spill writer ...
    MergeSortSpillWriter<TextBytes, TriTextBytesTuple> merger = new MergeSortSpillWriter<TextBytes, TriTextBytesTuple>(
        conf, outputWriter, localFileSystem, new Path(tempFilePath.getAbsolutePath()), null,
        new RawKeyValueComparator<TextBytes, TriTextBytesTuple>() {

          DataInputBuffer stream1 = new DataInputBuffer();
          DataInputBuffer stream2 = new DataInputBuffer();
          TriTextBytesTuple tuple1 = new TriTextBytesTuple();
          TriTextBytesTuple tuple2 = new TriTextBytesTuple();

          @Override
          public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
              int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
              byte[] value2Data, int value2Offset, int value2Length) throws IOException {

            stream1.reset(value1Data, value1Offset, value1Length);
            stream2.reset(value2Data, value2Offset, value2Length);

            // ok skip url
            int url1Length = WritableUtils.readVInt(stream1);
            stream1.skip(url1Length);
            int url2Length = WritableUtils.readVInt(stream2);
            stream2.skip(url2Length);

            // ok now read optimized page rank stuffed in second tuple
            WritableUtils.readVInt(stream1);
            WritableUtils.readVInt(stream2);

            // now read page rank
            float pageRank1 = stream1.readFloat();
            float pageRank2 = stream2.readFloat();

            return (pageRank1 == pageRank2) ? 0 : (pageRank1 < pageRank2) ? -1 : 1;
          }

          @Override
          public int compare(TextBytes key1, TriTextBytesTuple value1, TextBytes key2,
              TriTextBytesTuple value2) {
            stream1.reset(value1.getSecondValue().getBytes(), value1.getSecondValue().getLength());
            stream2.reset(value2.getSecondValue().getBytes(), value2.getSecondValue().getLength());
            try {
              float pr1 = stream1.readFloat();
              float pr2 = stream2.readFloat();
              return (pr1 == pr2) ? 0 : pr1 < pr2 ? -1 : 1;
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              throw new RuntimeException();
            }
          }
        }, TextBytes.class, TriTextBytesTuple.class, false, null);

    try {
      long timeStart = System.currentTimeMillis();
      System.out.println(".Running Merger against to resolve tuple set ");
      index.bulkQueryURLAndMetadataGivenInputStream(remoteFileSystem, conf, tempFilePath, linkData, merger);
      long timeEnd = System.currentTimeMillis();
      LOG.info(".Merged Successfully in:" + (timeEnd - timeStart));
    } finally {
      LOG.info("Closing Merger");
      merger.close();
    }
  } finally {
    LOG.info("Closing Writer");
    outputWriter.close();
  }
}
From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java
License:Open Source License
@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
    DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
    QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

  Path mergeResultsPath = new Path(
      getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

  LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

  // get a local file system object
  FileSystem localFileSystem = FileSystem.getLocal(conf);

  //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
  // if source merged results path does not exist ...
  if (!localFileSystem.exists(mergeResultsPath)) {

    LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
        + " Not Found. Checking for parts files");

    // collect parts ...
    Vector<Path> parts = new Vector<Path>();

    FileStatus fileStatusArray[] = remoteFileSystem
        .globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

    if (fileStatusArray.length == 0) {
      LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
      throw new IOException("Remote Component Part Files Not Found");
    }

    for (FileStatus part : fileStatusArray) {
      //LOG.info("Found Part:"+ part);
      parts.add(part.getPath());
    }

    LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");

    SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
        false);

    try {
      SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
          remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,

          new RawKeyValueComparator<Text, SubDomainMetadata>() {

            DataInputBuffer key1Stream = new DataInputBuffer();
            DataInputBuffer key2Stream = new DataInputBuffer();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {

              key1Stream.reset(key1Data, key1Offset, key1Length);
              key2Stream.reset(key2Data, key2Offset, key2Length);

              WritableUtils.readVInt(key1Stream);
              WritableUtils.readVInt(key2Stream);

              return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                  key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                  key2Length - key2Stream.getPosition());
            }

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return key1.compareTo(key2);
            }
          });

      try {
        LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
        merger.mergeAndSpill(null);
        LOG.info("Execute Local for Query:" + getQueryId() + " Merge Successfull.. Deleting Merge Inputs");
        for (Path inputPath : parts) {
          remoteFileSystem.delete(inputPath, false);
        }
      } catch (IOException e) {
        LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
            + CCStringUtils.stringifyException(e));
        throw e;
      } finally {
        LOG.info("** CLOSING MERGER");
        merger.close();
      }
    } finally {
      LOG.info("** FLUSHING SPILLWRITER");
      mergedFileSpillWriter.close();
    }
  }

  // now check for query specific merge file ...
  Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
      + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

  LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

  if (!localFileSystem.exists(queryResultsPath)) {

    LOG.info("Exectue Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
        + " does not exist. Running sort and merge process");

    LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
        + queryResultsPath);

    // allocate a spill writer ...
    SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
        false);

    try {
      LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");

      // and connect it to the merge spill writer ...
      MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
          conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
          /*
          new RawKeyValueComparator<Text,SubDomainMetadata>() {

            SubDomainMetadata value1 = new SubDomainMetadata();
            SubDomainMetadata value2 = new SubDomainMetadata();

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return value1.getUrlCount() - value2.getUrlCount();
            }

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {
              value1.clear();
              value2.clear();
              value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data, value1Offset, value1Length)));
              value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data, value2Offset, value2Length)));
              return compare(null, value1, null, value2);
            }
          },
          */
          new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

            @Override
            public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                throws IOException {
              optimizedKeyOut.setLongKeyValue(value.getUrlCount());
            }

            @Override
            public int getGeneratedKeyType() {
              return OptimizedKey.KEY_TYPE_LONG;
            }
          }, Text.class, SubDomainMetadata.class, false, null);

      try {
        // create a vector representing the single input segment
        Vector<Path> singleInputSegment = new Vector<Path>();

        LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:" + mergeResultsPath
            + " as input for Merger");
        singleInputSegment.add(mergeResultsPath);

        // create a SequenceFileReader
        SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
            localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class,
            SubDomainMetadata.class);

        try {
          LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
          mergeSegmentReader.readAndSpill();
          LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
        } finally {
          if (mergeSegmentReader != null) {
            mergeSegmentReader.close();
          }
        }
      } finally {
        if (mergeSortSpillWriter != null) {
          mergeSortSpillWriter.close();
        }
      }
    } finally {
      if (sortedResultsFileSpillWriter != null) {
        sortedResultsFileSpillWriter.close();
      }
    }
  }

  //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
  PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
      localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
  //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

  return indexFile.getRecordCount();
}
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
public static void main(String[] args) {
  // initialize ...
  Configuration conf = new Configuration();

  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");

  LOG.info("URL:" + args[0] + " ShardId:" + args[1]);

  try {
    File tempFile = File.createTempFile("inverseLinksReportTest", "seq");
    try {
      FileSystem fs = FileSystem.get(conf);
      FileSystem localFileSystem = FileSystem.getLocal(conf);

      URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]);
      if (fp != null) {
        collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(),
            localFileSystem, new Path(tempFile.getAbsolutePath()));

        SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,
            new Path(tempFile.getAbsolutePath()), conf);
        try {
          FlexBuffer key = new FlexBuffer();
          URLFPV2 src = new URLFPV2();
          TextBytes url = new TextBytes();

          DataInputBuffer inputBuffer = new DataInputBuffer();

          while (reader.next(key, src)) {
            inputBuffer.reset(key.get(), key.getOffset(), key.getCount());
            long targetFP = inputBuffer.readLong();
            float pageRank = inputBuffer.readFloat();
            // ok initialize text bytes ...
            int textLen = WritableUtils.readVInt(inputBuffer);
            url.set(key.get(), inputBuffer.getPosition(), textLen);
            LOG.info("PR:" + pageRank + " URL:" + url.toString());
          }
        } finally {
          reader.close();
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      // tempFile.delete();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
From source file:org.commoncrawl.util.FlexBuffer.java
License:Open Source License
@Override
public void readFields(DataInput in) throws IOException {
  int byteCount = WritableUtils.readVInt(in);
  setCount(byteCount);
  if (byteCount != 0) {
    // allocate new backing store if shared
    copyOnWrite();
    // read into the array
    in.readFully(get(), getOffset(), byteCount);
  }
}
From source file:org.commoncrawl.util.shared.FlexBuffer.java
License:Apache License
@Override
public void readFields(DataInput in) throws IOException {
  int byteCount = WritableUtils.readVInt(in);
  // first zero count ...
  setCount(0);
  // then set count
  setCount(byteCount);
  if (byteCount != 0) {
    // allocate new backing store if shared
    copyOnWrite();
    // read into the array
    in.readFully(get(), getOffset(), byteCount);
  }
}
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
/**
 * deserialize
 */
public void readFields(DataInput in) throws IOException {
  int newLength = WritableUtils.readVInt(in);
  // ensure capacity
  setCapacity(newLength, false);
  // in case we need to, ensure we have a private copy of the underlying array
  bytes.copyOnWrite();
  // read into the array
  in.readFully(bytes.get(), bytes.getOffset(), newLength);
  // reset count variable
  bytes.setCount(newLength);
  // clear cached String pointer
  cachedUTF8 = null;
}
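The FlexBuffer and TextBytes readFields implementations above all follow the same convention: a VInt length prefix followed by that many raw bytes. A minimal standalone Writable using that pattern (a sketch for illustration, not the actual CommonCrawl write() code):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;

// Illustrative Writable that length-prefixes its payload with a VInt,
// mirroring the readVInt-then-readFully pattern in the examples above.
public class VIntPrefixedBytes implements Writable {
  private byte[] data = new byte[0];

  public void set(byte[] bytes) {
    data = bytes;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, data.length); // length prefix
    out.write(data);                           // raw payload
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);   // read the prefix
    data = new byte[length];
    in.readFully(data, 0, length);             // then the payload
  }
}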
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
public static void main(String[] args) {
  // run some tests on the new code
  String aTestString = new String("A Test Strnig");
  // convert it to bytes
  byte bytes[] = aTestString.getBytes();
  // over allocate an array
  byte overAllocated[] = new byte[bytes.length * 2];
  // copy source
  System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
  // now allocate a TextBytes
  TextBytes textBytes = new TextBytes();
  // set the overallocated buffer as the backing store
  textBytes.set(overAllocated, bytes.length, bytes.length);
  // convert it to string first
  String toString = textBytes.toString();
  // validate equal to original
  Assert.assertTrue(aTestString.equals(toString));
  // ok now write it to output buffer
  DataOutputBuffer outputBuffer = new DataOutputBuffer();
  // write string
  try {
    textBytes.write(outputBuffer);
    // read length
    DataInputBuffer inputBuffer = new DataInputBuffer();
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    int encodedLength = WritableUtils.readVInt(inputBuffer);
    // validate arrays match ...
    Assert.assertTrue(encodedLength == bytes.length);
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(),
        inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0);
    // ok reset input buffer again ...
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    // read in fields
    textBytes.readFields(inputBuffer);
    // ok see if we are not using the original backing store ...
    Assert.assertTrue(textBytes.getBytes() != overAllocated);
    // validate buffers match to original
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(),
        textBytes.getOffset(), textBytes.getLength()), 0);
  } catch (IOException e) {
    e.printStackTrace();
  }
}
From source file:org.commoncrawl.util.WikipediaPage.java
License:Apache License
/**
 * Deserializes this object.
 */
public void readFields(DataInput in) throws IOException {
  int length = WritableUtils.readVInt(in);

  byte[] bytes = new byte[length];
  in.readFully(bytes, 0, length);
  WikipediaPage.readPage(this, new String(bytes));

  language = in.readUTF();
}