Example usage for org.apache.hadoop.io DataInputBuffer reset

Introduction

On this page you can find example usages of org.apache.hadoop.io DataInputBuffer reset.

Prototype

public void reset(byte[] input, int length) 

Document

Resets the data that the buffer reads.

Usage
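
The examples below share a common pattern: a Writable is serialized into a DataOutputBuffer (or arrives as a raw byte[]), and reset(byte[], int) points a reusable DataInputBuffer at those bytes so readFields() can deserialize them. The following minimal sketch illustrates that round trip; the class name DataInputBufferResetDemo and the Text payload are illustrative and not taken from the examples below.

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class DataInputBufferResetDemo {
    public static void main(String[] args) throws IOException {
        // Serialize a Writable into an in-memory buffer.
        Text original = new Text("hello hadoop");
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // Point a DataInputBuffer at the serialized bytes. getData() may
        // return a backing array larger than the valid region, so pass
        // getLength() as the length argument, not getData().length.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());

        // Deserialize into a fresh Writable and verify the round trip.
        Text copy = new Text();
        copy.readFields(in);
        System.out.println("Round-tripped value: " + copy);
    }
}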

From source file: org.apache.tez.service.impl.ContainerRunnerImpl.java

License: Apache License

/**
 * Submit an entire work unit - containerId + TaskSpec.
 * This is intended for a task push from the AM
 *
 * @param request
 * @throws org.apache.tez.dag.api.TezException
 */
@Override
public void submitWork(SubmitWorkRequestProto request) throws TezException {
    LOG.info("Queuing work for execution: " + request);

    checkAndThrowExceptionForTests(request);

    Map<String, String> env = new HashMap<String, String>();
    env.putAll(localEnv);
    env.put(ApplicationConstants.Environment.USER.name(), request.getUser());

    String[] localDirs = new String[localDirsBase.length];

    // Set up local dirs to be application specific, and create them.
    for (int i = 0; i < localDirsBase.length; i++) {
        localDirs[i] = createAppSpecificLocalDir(localDirsBase[i], request.getApplicationIdString(),
                request.getUser());
        try {
            localFs.mkdirs(new Path(localDirs[i]));
        } catch (IOException e) {
            throw new TezException(e);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Dirs are: " + Arrays.toString(localDirs));
    }

    // Set up workingDir. This is otherwise set up as Environment.PWD
    // Used for re-localization, to add the user specified configuration (conf_pb_binary_stream)
    String workingDir = localDirs[0];

    Credentials credentials = new Credentials();
    DataInputBuffer dib = new DataInputBuffer();
    byte[] tokenBytes = request.getCredentialsBinary().toByteArray();
    dib.reset(tokenBytes, tokenBytes.length);
    try {
        credentials.readTokenStorageStream(dib);
    } catch (IOException e) {
        throw new TezException(e);
    }

    Token<JobTokenIdentifier> jobToken = TokenCache.getSessionToken(credentials);

    // TODO Unregistering does not happen at the moment, since there's no signals on when an app completes.
    LOG.info("Registering request with the ShuffleHandler for containerId {}", request.getContainerIdString());
    ShuffleHandler.get().registerApplication(request.getApplicationIdString(), jobToken, request.getUser());
    TaskRunnerCallable callable = new TaskRunnerCallable(request, new Configuration(getConfig()),
            new ExecutionContextImpl(localAddress.get().getHostName()), env, localDirs, workingDir, credentials,
            memoryPerExecutor);
    ListenableFuture<ContainerExecutionResult> future = executorService.submit(callable);
    Futures.addCallback(future, new TaskRunnerCallback(request, callable));
}

From source file: org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java

License: Open Source License

private static void compareKeys(RawComparator<TextBytes> comparator, TextBytes key1, TextBytes key2,
        int expectedResult) {
    long nanoStart = System.nanoTime();
    Assert.assertEquals(comparator.compare(key1, key2), expectedResult);
    long nanoEnd = System.nanoTime();
    System.out.println("Object Comparison Took:" + (nanoEnd - nanoStart));
    DataOutputBuffer outputBuffer1 = new DataOutputBuffer();
    DataOutputBuffer outputBuffer2 = new DataOutputBuffer();
    try {
        key1.write(outputBuffer1);
        key2.write(outputBuffer2);
        nanoStart = System.nanoTime();
        Assert.assertEquals(comparator.compare(outputBuffer1.getData(), 0, outputBuffer1.getLength(),
                outputBuffer2.getData(), 0, outputBuffer2.getLength()), expectedResult);
        nanoEnd = System.nanoTime();
        System.out.println("Raw Comparison Took:" + (nanoEnd - nanoStart));
        int offset1 = outputBuffer1.getLength();
        int offset2 = outputBuffer2.getLength();
        key1.write(outputBuffer1);
        key2.write(outputBuffer2);
        Assert.assertEquals(
                comparator.compare(outputBuffer1.getData(), offset1, outputBuffer1.getLength() - offset1,
                        outputBuffer2.getData(), offset2, outputBuffer2.getLength() - offset2),
                expectedResult);

        if (comparator instanceof LinkKeyComparator) {
            DataInputBuffer inputStream1 = new DataInputBuffer();
            DataInputBuffer inputStream2 = new DataInputBuffer();

            inputStream1.reset(outputBuffer1.getData(), outputBuffer1.getLength());
            inputStream2.reset(outputBuffer2.getData(), outputBuffer2.getLength());

            CrawlDBKey cdbkey1 = new CrawlDBKey();
            CrawlDBKey cdbkey2 = new CrawlDBKey();

            cdbkey1.readFields(inputStream1);
            cdbkey2.readFields(inputStream2);

            CrawlDBKeyComparator altComparator = new CrawlDBKeyComparator();
            System.out.println("*Comparing Using CrawlDBKey Comparator");
            nanoStart = System.nanoTime();
            Assert.assertEquals(altComparator.compare(cdbkey1, cdbkey2), expectedResult);
            nanoEnd = System.nanoTime();
            System.out.println("Typed Comparison Took:" + (nanoEnd - nanoStart));

        }

    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}

From source file: org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java

License: Open Source License

private static void rawValueToTextBytes(DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer,
        TextBytes textOut) throws IOException {
    inputBuffer.reset(dataBuffer.getData(), dataBuffer.getLength());
    int newLength = WritableUtils.readVInt(inputBuffer);
    textOut.set(inputBuffer.getData(), inputBuffer.getPosition(), newLength);
}

From source file: org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java

License: Open Source License

private static void rawValueToWritable(RawRecordValue rawValue, DataInputBuffer inputBuffer, Writable typeOut)
        throws IOException {
    inputBuffer.reset(rawValue.data.getData(), rawValue.data.getLength());
    typeOut.readFields(inputBuffer);
}

From source file: org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License: Open Source License

private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
        ItemUpdater targetList) throws IOException {

    // ok copy stuff locally if possible ...
    File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
    File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
    File localBloomFilterPath = new File(getLocalDataDir(),
            CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

    // ok copy local first
    if (!localIndexPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
                + localIndexPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localIndexPath.delete();
            throw e;
        }
    }
    if (!localDataPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
                + localDataPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localDataPath.delete();
            throw e;
        }

    }
    if (!localBloomFilterPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
                + localBloomFilterPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localBloomFilterPath.delete();
            throw e;
        }

    }

    // ok open local
    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
            new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

    try {
        URLFP firstIndexKey = null;
        URLFP lastIndexKey = new URLFP();
        LongWritable position = new LongWritable();
        while (indexReader.next(lastIndexKey, position)) {
            if (firstIndexKey == null) {
                try {
                    firstIndexKey = (URLFP) lastIndexKey.clone();
                } catch (CloneNotSupportedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }

        LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
                + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
                + lastIndexKey.getUrlHash());

        URLFP criteriaFirstKey = criteria.first();
        URLFP criteriaLastKey = criteria.last();

        if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
            LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
            LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash()
                    + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:"
                    + criteriaLastKey.getDomainHash() + " URLHash:" + criteriaLastKey.getUrlHash());
            return;
        }
    } finally {
        indexReader.close();
    }

    LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");
    // load bloom filter
    FSDataInputStream bloomFilterStream = localFileSystem
            .open(new Path(localBloomFilterPath.getAbsolutePath()));

    int hitCount = 0;

    try {
        URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

        URLFP fpOut = new URLFP();
        ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
        DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
        ValueBytes valueBytes = null;
        DataInputBuffer valueReader = new DataInputBuffer();
        DataOutputBuffer keyBytes = new DataOutputBuffer();
        DataInputBuffer keyReader = new DataInputBuffer();

        URLFP lastFP = null;

        outerLoop:
        // now iterate each item in the criteria
        for (URLFP targetFP : criteria) {
            // if fingerprint is present in filter ...
            if (filter.isPresent(targetFP)) {
                // check to see if reader is initialized ...
                if (reader == null) {
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
                            CrawlEnvironment.getHadoopConfig());
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    valueBytes = reader.createValueBytes();
                }

                // if last read fingerprint was not null ...
                if (lastFP != null) {
                    // does it match the current item
                    if (lastFP.compareTo(targetFP) == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);
                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + +lastFP.getUrlHash()
                                + " File:" + dataFilePath);
                        // if so, null out last fp
                        lastFP = null;
                        // and update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;

                        continue;
                    }
                }

                // ok at this point .. read the next item in the list ...
                lastFP = null;

                while (reader.nextRaw(keyBytes, valueBytes) != -1) {
                    // init reader ...
                    keyReader.reset(keyBytes.getData(), keyBytes.getLength());
                    // read key
                    fpOut.readFields(keyReader);
                    // reset output buffer
                    keyBytes.reset();

                    // LOG.info("LIST:" + listId +" nextRaw Returned DH:" +
                    // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
                    // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());
                    // compare it to target ...
                    int result = fpOut.compareTo(targetFP);
                    // ok does it match .. ?
                    if (result == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);

                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash()
                                + " File:" + dataFilePath);
                        // update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;
                        // and break to outer loop
                        continue outerLoop;
                    } else if (result == 1) {
                        // LOG.info("LIST:" + listId +
                        // " FP Comparison Returned 1. Going to OuterLoop");
                        // update last FP
                        lastFP = fpOut;
                        // continue outer loop
                        continue outerLoop;
                    } else {
                        // otherwise skip
                    }
                }
                // ok if we got here .. we are done reading the sequence file and did
                // not find a trailing match
                LOG.warn("LIST:" + listId
                        + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positivie result (DomainHash:"
                        + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
                // break out of outer loop

                break;
            }
        }
    } finally {
        bloomFilterStream.close();

        if (reader != null) {
            reader.close();
        }

        LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

private final int getOffsetForSubDomainData(long domainHash) throws IOException {
    DataInputBuffer inputBuffer = new DataInputBuffer();

    int low = 0;
    int high = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1;

    while (low <= high) {

        int mid = low + ((high - low) / 2);

        inputBuffer.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength());
        inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);

        // deserialize
        long hash = inputBuffer.readLong();

        // now compare it against desired hash value ...
        int comparisonResult = ((Long) hash).compareTo(domainHash);

        if (comparisonResult > 0)
            high = mid - 1;
        else if (comparisonResult < 0)
            low = mid + 1;
        else {
            return inputBuffer.readInt();
        }
    }
    throw new IOException("NOT-FOUND!");
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

void resetSubDomainCounts() throws IOException {

    LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

    if (_subDomainMetadataFile.exists()) {

        LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

        try {
            // skip version 
            file.read();
            // read item count 
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            for (int i = 0; i < itemCount; ++i) {

                long orignalPos = file.getFilePointer();
                file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
                inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                // ok reset everything except hashes and first/last url pointers 
                int urlCount = newMetadata.getUrlCount();
                long firstRecordOffset = newMetadata.getFirstRecordOffset();
                long lastRecordOffset = newMetadata.getLastRecordOffset();
                String domainName = newMetadata.getDomainName();
                long domainHash = newMetadata.getDomainHash();

                // reset 
                newMetadata.clear();
                // restore 
                newMetadata.setUrlCount(urlCount);
                newMetadata.setFirstRecordOffset(firstRecordOffset);
                newMetadata.setLastRecordOffset(lastRecordOffset);
                newMetadata.setDomainName(domainName);
                newMetadata.setDomainHash(domainHash);

                // serialize it ... 
                outputBuffer.reset();
                newMetadata.serialize(outputBuffer, new BinaryProtocol());
                // write it back to disk 
                file.seek(orignalPos);
                // and rewrite it ... 
                file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

void loadSubDomainMetadataFromDisk() throws IOException {
    LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ...  ");
    if (_subDomainMetadataFile.exists()) {

        LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {
            // skip version 
            file.read();
            // read item count 
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
            for (int i = 0; i < itemCount; ++i) {

                long orignalPos = file.getFilePointer();
                file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
            }

            // write lookup table 
            _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
            for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
                _offsetLookupTable.writeLong(entry.getKey());
                _offsetLookupTable.writeInt(entry.getValue());
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
    } else {

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

        RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
        RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

        try {

            //ok rebuild top level metadata as well 
            _metadata.clear();

            OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

            int processedCount = 0;
            while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

                long position = fixedDataReader.getFilePointer();

                // store offset in item 
                item._fileOffset = position;
                // load from disk 
                item.deserialize(fixedDataReader);
                try {
                    // seek to string data 
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length 
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string 
                    String url = stringDataReader.readUTF();

                    // get metadata object for subdomain 
                    CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

                    // increment url count 
                    subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);

                    // increment top level metadata count 
                    _metadata.setUrlCount(_metadata.getUrlCount() + 1);

                    // update top level metadata ..
                    updateMetadata(item, _metadata, 0);

                    // update sub-domain metadata object  from item data
                    updateMetadata(item, subDomainMetadata, 0);

                    ++processedCount;
                } catch (IOException e) {
                    LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
                    LOG.error("Exception:" + CCStringUtils.stringifyException(e));
                    LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                            + stringDataReader.getFilePointer());
                }

                if (processedCount % 10000 == 0) {
                    LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
                }
            }

            // ok commit top level metadata to disk as well 
            writeMetadataToDisk();

        } catch (IOException e) {
            LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                    + CCStringUtils.stringifyException(e));
            LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                    + stringDataReader.getFilePointer());
            _queueState = QueueState.QUEUED;
        } finally {
            fixedDataReader.close();
            stringDataReader.close();
        }
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");

        // write metadata to disk
        writeInitialSubDomainMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
    synchronized (_metadata) {

        ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();

        try {
            synchronized (_subDomainMetadataFile) {
                RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
                DataInputBuffer inputBuffer = new DataInputBuffer();
                byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

                try {
                    // skip version 
                    file.read();
                    // read item count 
                    int itemCount = file.readInt();

                    int i = offset;
                    int end = Math.min(i + count, itemCount);

                    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

                    if (i < itemCount) {

                        file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));

                        CrawlListMetadata newMetadata = new CrawlListMetadata();

                        for (; i < end; ++i) {

                            long orignalPos = file.getFilePointer();
                            file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                            inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                            newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                            itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
                        }
                    }
                } finally {
                    file.close();
                }
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");

        return itemsOut;
    }
}

From source file: org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java

License: Open Source License

public static void main(String[] args) {
    if (args.length != 3) {
        LOG.error("args: [candidate Timestamp] [drive count] [query string]");
    }

    // initialize ...
    final Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    BasicConfigurator.configure();
    CrawlEnvironment.setHadoopConfig(conf);

    long candidateTS = Long.parseLong(args[0]);
    int driveCount = Integer.parseInt(args[1]);
    String queryString = args[2];

    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

        MasterDatabaseIndex masterIndex = new MasterDatabaseIndex(conf, fs, driveCount, candidateTS, null);
        SlaveDatabaseIndex slaveIndex = new SlaveDatabaseIndex(conf, fs, candidateTS);

        // ok hit the domain against the master index first ...
        LOG.info("Querying master index for DomainId Given DomainName:" + queryString);
        long domainId = masterIndex.queryDomainIdGivenDomain(queryString);

        LOG.info("Querying master index for DomainMetadata Given DomainId:" + domainId);
        SubDomainMetadata subDomainMeta = masterIndex.queryDomainMetadataGivenDomainId(domainId);

        if (subDomainMeta != null) {

            LOG.info("Metadata is present. Deserializing");
            // dump some fields ...
            LOG.info("Domain:" + subDomainMeta.getDomainText() + " URLCount:" + subDomainMeta.getUrlCount()
                    + " FetchedCount:" + subDomainMeta.getFetchedCount() + " PageRankCount:"
                    + subDomainMeta.getHasPageRankCount());

            // ok time to dive into a url list ...

            // query for a list of urls sorted by name
            LOG.info("Querying for URLList for Domain BY PR");
            FlexBuffer urlListBufferByPR = slaveIndex.queryURLListSortedByPR(domainId);

            if (urlListBufferByPR != null) {

                // read the list ...
                DataInputBuffer readerStream = new DataInputBuffer();
                readerStream.reset(urlListBufferByPR.get(), urlListBufferByPR.getCount());
                int totalItemCount = urlListBufferByPR.getCount() / 8;
                System.out.println("List BY  PR totalCount:" + totalItemCount);

                // initialize a fingerprint object to use for queries ...
                URLFPV2 queryFP = new URLFPV2();

                queryFP.setDomainHash(domainId);

                DataInputBuffer metadataReaderStream = new DataInputBuffer();
                // iterate the first N items ranked by page rank
                for (int i = 0; i < Math.min(10, totalItemCount); ++i) {

                    queryFP.setUrlHash(readerStream.readLong());

                    // and for metadata
                    MetadataOut urlMetadata = masterIndex.queryMetadataAndURLGivenFP(queryFP);

                    if (urlMetadata != null) {

                        // decode the url
                        String url = urlMetadata.url.toString();

                        System.out.println("URL for FP:" + queryFP.getUrlHash() + " is:" + url);
                        if (urlMetadata.datumAndMetadataBytes.getLength() == 0) {
                            System.out.println("URL for FP:" + queryFP.getUrlHash() + " had no METADATA!!");
                        } else {

                            // explode metadata
                            CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();

                            metadataReaderStream.reset(urlMetadata.datumAndMetadataBytes.getBytes(),
                                    urlMetadata.datumAndMetadataBytes.getOffset(),
                                    urlMetadata.datumAndMetadataBytes.getLength());
                            metadataObject.readFields(metadataReaderStream);

                            // ok at this point spit out stuff for this url
                            StringBuilder urlInfo = new StringBuilder();

                            urlInfo.append("    FetchStatus:"
                                    + CrawlDatum.getStatusName(metadataObject.getStatus()) + "\n");
                            urlInfo.append("    PageRank:" + metadataObject.getMetadata().getPageRank() + "\n");
                            urlInfo.append(
                                    "    ContentType:" + metadataObject.getMetadata().getContentType() + "\n");
                            urlInfo.append("    ArcFileInfoCount:"
                                    + metadataObject.getMetadata().getArchiveInfo().size());
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) {
                                urlInfo.append(
                                        "    HasLinkDataInfo:" + metadataObject.getMetadata().getLinkDBFileNo()
                                                + ":" + metadataObject.getMetadata().getLinkDBOffset());
                            }
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                                urlInfo.append("    HasINVLinkDataInfo:"
                                        + metadataObject.getMetadata().getInverseDBFileNo() + ":"
                                        + metadataObject.getMetadata().getInverseDBOffset());
                            }
                            System.out.println(urlInfo.toString());

                            // now if inverse link data is present ..
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                                // get it ...
                                System.out.println("Querying for Inlinks for FP:" + queryFP.getUrlHash());
                                FlexBuffer inlinks = slaveIndex.queryInlinksByFP(queryFP,
                                        metadataObject.getMetadata().getInverseDBFileNo(),
                                        metadataObject.getMetadata().getInverseDBOffset());

                                if (inlinks != null) {
                                    System.out.println("Found Inlink Buffer of Size:" + inlinks.getCount());
                                    FileSystem localFS = FileSystem.getLocal(conf);
                                    File testDir = new File("/tmp/dbIndexTest");
                                    File testFile = new File("/tmp/dbIndexTestFile");
                                    localFS.delete(new Path(testDir.getAbsolutePath()), true);
                                    localFS.delete(new Path(testFile.getAbsolutePath()), false);
                                    localFS.mkdirs(new Path(testDir.getAbsolutePath()));

                                    LOG.info("Creating Spill File of Inlinks");
                                    spillLinkDataIntoTempFileIndex(fs, localFS, conf, masterIndex, testDir,
                                            new Path(testFile.getAbsolutePath()), inlinks);
                                    LOG.info("Created Spill File of Inlinks");

                                    LOG.info("Reading Inlinks");
                                    // ok now open it up and dump the first few inlinks from the
                                    // spill file
                                    SequenceFile.Reader reader = new SequenceFile.Reader(localFS,
                                            new Path(testFile.getAbsolutePath()), conf);

                                    TextBytes key = new TextBytes();
                                    TriTextBytesTuple value = new TriTextBytesTuple();
                                    CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
                                    DataInputBuffer inputBuffer = new DataInputBuffer();

                                    try {
                                        int itemCount = 0;

                                        while (reader.next(key, value)) {

                                            if (value.getThirdValue().getLength() != 0) {
                                                inputBuffer.reset(value.getThirdValue().getBytes(), 0,
                                                        value.getThirdValue().getLength());
                                                metadata.readFields(inputBuffer);
                                                System.out.println("INLINK:" + key.toString()
                                                        + " METADATA STATUS:"
                                                        + CrawlDatum.getStatusName(metadata.getStatus()));
                                            } else {
                                                System.out.println("INLINK:" + key.toString() + " NOMETADATA");
                                            }

                                            if (++itemCount == 500) {
                                                break;
                                            }
                                        }
                                    } finally {
                                        reader.close();
                                    }

                                    LOG.info("Done Reding Inlinks");
                                }
                            }
                        }
                    } else {
                        LOG.error("Query for FP:" + queryFP.getUrlHash() + " returned NULL URL");
                    }
                }
            }
        }

    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }
}