Example usage for org.apache.hadoop.io DataInputBuffer reset

Introduction

On this page you can find example usages of org.apache.hadoop.io DataInputBuffer reset.

Prototype

public void reset(byte[] input, int length) 

Document

Resets the data that the buffer reads.

Usage
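
The examples below share a common pattern: a Writable is serialized into a DataOutputBuffer (or arrives as a raw byte[]), and reset(byte[], int) points a reusable DataInputBuffer at those bytes so readFields() can deserialize them. The following minimal sketch illustrates that round trip; the class name DataInputBufferResetDemo and the Text payload are illustrative and not taken from the examples below.

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class DataInputBufferResetDemo {
    public static void main(String[] args) throws IOException {
        // Serialize a Writable into an in-memory buffer.
        Text original = new Text("hello hadoop");
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // Point a DataInputBuffer at the serialized bytes. getData() may
        // return a backing array larger than the valid region, so pass
        // getLength() as the length argument, not getData().length.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());

        // Deserialize into a fresh Writable and verify the round trip.
        Text copy = new Text();
        copy.readFields(in);
        System.out.println("Round-tripped value: " + copy);
    }
}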

From source file: org.apache.tez.service.impl.ContainerRunnerImpl.java

License: Apache License

/**
 * Submit an entire work unit - containerId + TaskSpec.
 * This is intended for a task push from the AM
 *
 * @param request
 * @throws org.apache.tez.dag.api.TezException
 */
@Override
public void submitWork(SubmitWorkRequestProto request) throws TezException {
    LOG.info("Queuing work for execution: " + request);

    checkAndThrowExceptionForTests(request);

    Map<String, String> env = new HashMap<String, String>();
    env.putAll(localEnv);
    env.put(ApplicationConstants.Environment.USER.name(), request.getUser());

    String[] localDirs = new String[localDirsBase.length];

    // Set up local dirs to be application specific, and create them.
    for (int i = 0; i < localDirsBase.length; i++) {
        localDirs[i] = createAppSpecificLocalDir(localDirsBase[i], request.getApplicationIdString(),
                request.getUser());
        try {
            localFs.mkdirs(new Path(localDirs[i]));
        } catch (IOException e) {
            throw new TezException(e);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Dirs are: " + Arrays.toString(localDirs));
    }

    // Set up workingDir. This is otherwise set up as Environment.PWD
    // Used for re-localization, to add the user specified configuration (conf_pb_binary_stream)
    String workingDir = localDirs[0];

    Credentials credentials = new Credentials();
    DataInputBuffer dib = new DataInputBuffer();
    byte[] tokenBytes = request.getCredentialsBinary().toByteArray();
    dib.reset(tokenBytes, tokenBytes.length);
    try {
        credentials.readTokenStorageStream(dib);
    } catch (IOException e) {
        throw new TezException(e);
    }

    Token<JobTokenIdentifier> jobToken = TokenCache.getSessionToken(credentials);

    // TODO Unregistering does not happen at the moment, since there's no signals on when an app completes.
    LOG.info("Registering request with the ShuffleHandler for containerId {}", request.getContainerIdString());
    ShuffleHandler.get().registerApplication(request.getApplicationIdString(), jobToken, request.getUser());
    TaskRunnerCallable callable = new TaskRunnerCallable(request, new Configuration(getConfig()),
            new ExecutionContextImpl(localAddress.get().getHostName()), env, localDirs, workingDir, credentials,
            memoryPerExecutor);
    ListenableFuture<ContainerExecutionResult> future = executorService.submit(callable);
    Futures.addCallback(future, new TaskRunnerCallback(request, callable));
}

From source file: org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java

License: Open Source License

private static void compareKeys(RawComparator<TextBytes> comparator, TextBytes key1, TextBytes key2,
        int expectedResult) {
    long nanoStart = System.nanoTime();
    Assert.assertEquals(comparator.compare(key1, key2), expectedResult);
    long nanoEnd = System.nanoTime();
    System.out.println("Object Comparison Took:" + (nanoEnd - nanoStart));
    DataOutputBuffer outputBuffer1 = new DataOutputBuffer();
    DataOutputBuffer outputBuffer2 = new DataOutputBuffer();
    try {
        key1.write(outputBuffer1);
        key2.write(outputBuffer2);
        nanoStart = System.nanoTime();
        Assert.assertEquals(comparator.compare(outputBuffer1.getData(), 0, outputBuffer1.getLength(),
                outputBuffer2.getData(), 0, outputBuffer2.getLength()), expectedResult);
        nanoEnd = System.nanoTime();
        System.out.println("Raw Comparison Took:" + (nanoEnd - nanoStart));
        int offset1 = outputBuffer1.getLength();
        int offset2 = outputBuffer2.getLength();
        key1.write(outputBuffer1);
        key2.write(outputBuffer2);
        Assert.assertEquals(
                comparator.compare(outputBuffer1.getData(), offset1, outputBuffer1.getLength() - offset1,
                        outputBuffer2.getData(), offset2, outputBuffer2.getLength() - offset2),
                expectedResult);

        if (comparator instanceof LinkKeyComparator) {
            DataInputBuffer inputStream1 = new DataInputBuffer();
            DataInputBuffer inputStream2 = new DataInputBuffer();

            inputStream1.reset(outputBuffer1.getData(), outputBuffer1.getLength());
            inputStream2.reset(outputBuffer2.getData(), outputBuffer2.getLength());

            CrawlDBKey cdbkey1 = new CrawlDBKey();
            CrawlDBKey cdbkey2 = new CrawlDBKey();

            cdbkey1.readFields(inputStream1);
            cdbkey2.readFields(inputStream2);

            CrawlDBKeyComparator altComparator = new CrawlDBKeyComparator();
            System.out.println("*Comparing Using CrawlDBKey Comparator");
            nanoStart = System.nanoTime();
            Assert.assertEquals(altComparator.compare(cdbkey1, cdbkey2), expectedResult);
            nanoEnd = System.nanoTime();
            System.out.println("Typed Comparison Took:" + (nanoEnd - nanoStart));

        }

    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}

From source file: org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java

License: Open Source License

private static void rawValueToTextBytes(DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer,
        TextBytes textOut) throws IOException {
    inputBuffer.reset(dataBuffer.getData(), dataBuffer.getLength());
    int newLength = WritableUtils.readVInt(inputBuffer);
    textOut.set(inputBuffer.getData(), inputBuffer.getPosition(), newLength);
}

From source file: org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java

License: Open Source License

private static void rawValueToWritable(RawRecordValue rawValue, DataInputBuffer inputBuffer, Writable typeOut)
        throws IOException {
    inputBuffer.reset(rawValue.data.getData(), rawValue.data.getLength());
    typeOut.readFields(inputBuffer);
}

From source file: org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License: Open Source License

private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
        ItemUpdater targetList) throws IOException {

    // ok copy stuff locally if possible ...
    File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
    File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
    File localBloomFilterPath = new File(getLocalDataDir(),
            CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

    // ok copy local first
    if (!localIndexPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
                + localIndexPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localIndexPath.delete();
            throw e;
        }
    }
    if (!localDataPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
                + localDataPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localDataPath.delete();
            throw e;
        }

    }
    if (!localBloomFilterPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
                + localBloomFilterPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localBloomFilterPath.delete();
            throw e;
        }

    }

    // ok open local
    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
            new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

    try {
        URLFP firstIndexKey = null;
        URLFP lastIndexKey = new URLFP();
        LongWritable position = new LongWritable();
        while (indexReader.next(lastIndexKey, position)) {
            if (firstIndexKey == null) {
                try {
                    firstIndexKey = (URLFP) lastIndexKey.clone();
                } catch (CloneNotSupportedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }

        LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
                + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
                + lastIndexKey.getUrlHash());

        URLFP criteriaFirstKey = criteria.first();
        URLFP criteriaLastKey = criteria.last();

        if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
            LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
            LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash()
                    + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:"
                    + criteriaLastKey.getDomainHash() + " URLHash:" + criteriaLastKey.getUrlHash());
            return;
        }
    } finally {
        indexReader.close();
    }

    LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");
    // load bloom filter
    FSDataInputStream bloomFilterStream = localFileSystem
            .open(new Path(localBloomFilterPath.getAbsolutePath()));

    int hitCount = 0;

    try {
        URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

        URLFP fpOut = new URLFP();
        ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
        DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
        ValueBytes valueBytes = null;
        DataInputBuffer valueReader = new DataInputBuffer();
        DataOutputBuffer keyBytes = new DataOutputBuffer();
        DataInputBuffer keyReader = new DataInputBuffer();

        URLFP lastFP = null;

        outerLoop:
        // now iterate each item in the criteria
        for (URLFP targetFP : criteria) {
            // if fingerprint is present in filter ...
            if (filter.isPresent(targetFP)) {
                // check to see if reader is initialized ...
                if (reader == null) {
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
                            CrawlEnvironment.getHadoopConfig());
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    valueBytes = reader.createValueBytes();
                }

                // if last read fingerprint was not null ...
                if (lastFP != null) {
                    // does it match the current item
                    if (lastFP.compareTo(targetFP) == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);
                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + +lastFP.getUrlHash()
                                + " File:" + dataFilePath);
                        // if so, null out last fp
                        lastFP = null;
                        // and update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;

                        continue;
                    }
                }

                // ok at this point .. read the next item in the list ...
                lastFP = null;

                while (reader.nextRaw(keyBytes, valueBytes) != -1) {
                    // init reader ...
                    keyReader.reset(keyBytes.getData(), keyBytes.getLength());
                    // read key
                    fpOut.readFields(keyReader);
                    // reset output buffer
                    keyBytes.reset();

                    // LOG.info("LIST:" + listId +" nextRaw Returned DH:" +
                    // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
                    // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());
                    // compare it to target ...
                    int result = fpOut.compareTo(targetFP);
                    // ok does it match .. ?
                    if (result == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);

                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash()
                                + " File:" + dataFilePath);
                        // update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;
                        // and break to outer loop
                        continue outerLoop;
                    } else if (result == 1) {
                        // LOG.info("LIST:" + listId +
                        // " FP Comparison Returned 1. Going to OuterLoop");
                        // update last FP
                        lastFP = fpOut;
                        // continue outer loop
                        continue outerLoop;
                    } else {
                        // otherwise skip
                    }
                }
                // ok if we got here .. we are done reading the sequence file and did
                // not find a trailing match
                LOG.warn("LIST:" + listId
                        + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positivie result (DomainHash:"
                        + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
                // break out of outer loop

                break;
            }
        }
    } finally {
        bloomFilterStream.close();

        if (reader != null) {
            reader.close();
        }

        LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

private final int getOffsetForSubDomainData(long domainHash) throws IOException {
    DataInputBuffer inputBuffer = new DataInputBuffer();

    int low = 0;
    int high = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1;

    while (low <= high) {

        int mid = low + ((high - low) / 2);

        inputBuffer.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength());
        inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);

        // deserialize
        long hash = inputBuffer.readLong();

        // now compare it against desired hash value ...
        int comparisonResult = ((Long) hash).compareTo(domainHash);

        if (comparisonResult > 0)
            high = mid - 1;
        else if (comparisonResult < 0)
            low = mid + 1;
        else {
            return inputBuffer.readInt();
        }
    }
    throw new IOException("NOT-FOUND!");
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

void resetSubDomainCounts() throws IOException {

    LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

    if (_subDomainMetadataFile.exists()) {

        LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

        try {
            // skip version 
            file.read();
            // read item count 
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            for (int i = 0; i < itemCount; ++i) {

                long orignalPos = file.getFilePointer();
                file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
                inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                // ok reset everything except hashes and first/last url pointers 
                int urlCount = newMetadata.getUrlCount();
                long firstRecordOffset = newMetadata.getFirstRecordOffset();
                long lastRecordOffset = newMetadata.getLastRecordOffset();
                String domainName = newMetadata.getDomainName();
                long domainHash = newMetadata.getDomainHash();

                // reset 
                newMetadata.clear();
                // restore 
                newMetadata.setUrlCount(urlCount);
                newMetadata.setFirstRecordOffset(firstRecordOffset);
                newMetadata.setLastRecordOffset(lastRecordOffset);
                newMetadata.setDomainName(domainName);
                newMetadata.setDomainHash(domainHash);

                // serialize it ... 
                outputBuffer.reset();
                newMetadata.serialize(outputBuffer, new BinaryProtocol());
                // write it back to disk 
                file.seek(orignalPos);
                // and rewrite it ... 
                file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

void loadSubDomainMetadataFromDisk() throws IOException {
    LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ...  ");
    if (_subDomainMetadataFile.exists()) {

        LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {
            // skip version 
            file.read();
            // read item count 
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
            for (int i = 0; i < itemCount; ++i) {

                long orignalPos = file.getFilePointer();
                file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
            }

            // write lookup table 
            _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
            for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
                _offsetLookupTable.writeLong(entry.getKey());
                _offsetLookupTable.writeInt(entry.getValue());
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
    } else {

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

        RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
        RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

        try {

            //ok rebuild top level metadata as well 
            _metadata.clear();

            OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

            int processedCount = 0;
            while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

                long position = fixedDataReader.getFilePointer();

                // store offset in item 
                item._fileOffset = position;
                // load from disk 
                item.deserialize(fixedDataReader);
                try {
                    // seek to string data 
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length 
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string 
                    String url = stringDataReader.readUTF();

                    // get metadata object for subdomain 
                    CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

                    // increment url count 
                    subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);

                    // increment top level metadata count 
                    _metadata.setUrlCount(_metadata.getUrlCount() + 1);

                    // update top level metadata ..
                    updateMetadata(item, _metadata, 0);

                    // update sub-domain metadata object  from item data
                    updateMetadata(item, subDomainMetadata, 0);

                    ++processedCount;
                } catch (IOException e) {
                    LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
                    LOG.error("Exception:" + CCStringUtils.stringifyException(e));
                    LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                            + stringDataReader.getFilePointer());
                }

                if (processedCount % 10000 == 0) {
                    LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
                }
            }

            // ok commit top level metadata to disk as well 
            writeMetadataToDisk();

        } catch (IOException e) {
            LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                    + CCStringUtils.stringifyException(e));
            LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                    + stringDataReader.getFilePointer());
            _queueState = QueueState.QUEUED;
        } finally {
            fixedDataReader.close();
            stringDataReader.close();
        }
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");

        // write metadata to disk
        writeInitialSubDomainMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
    }
}

From source file: org.commoncrawl.service.listcrawler.CrawlList.java

License: Open Source License

public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
    synchronized (_metadata) {

        ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();

        try {
            synchronized (_subDomainMetadataFile) {
                RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
                DataInputBuffer inputBuffer = new DataInputBuffer();
                byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

                try {
                    // skip version 
                    file.read();
                    // read item count 
                    int itemCount = file.readInt();

                    int i = offset;
                    int end = Math.min(i + count, itemCount);

                    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

                    if (i < itemCount) {

                        file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));

                        CrawlListMetadata newMetadata = new CrawlListMetadata();

                        for (; i < end; ++i) {

                            long orignalPos = file.getFilePointer();
                            file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                            inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                            newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                            itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
                        }
                    }
                } finally {
                    file.close();
                }
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");

        return itemsOut;
    }
}

From source file: org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java

License: Open Source License

public static void main(String[] args) {
    if (args.length != 3) {
        LOG.error("args: [candidate Timestamp] [drive count] [query string]");
    }

    // initialize ...
    final Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    BasicConfigurator.configure();
    CrawlEnvironment.setHadoopConfig(conf);

    long candidateTS = Long.parseLong(args[0]);
    int driveCount = Integer.parseInt(args[1]);
    String queryString = args[2];

    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

        MasterDatabaseIndex masterIndex = new MasterDatabaseIndex(conf, fs, driveCount, candidateTS, null);
        SlaveDatabaseIndex slaveIndex = new SlaveDatabaseIndex(conf, fs, candidateTS);

        // ok hit the domain against the master index first ...
        LOG.info("Querying master index for DomainId Given DomainName:" + queryString);
        long domainId = masterIndex.queryDomainIdGivenDomain(queryString);

        LOG.info("Querying master index for DomainMetadata Given DomainId:" + domainId);
        SubDomainMetadata subDomainMeta = masterIndex.queryDomainMetadataGivenDomainId(domainId);

        if (subDomainMeta != null) {

            LOG.info("Metadata is present. Deserializing");
            // dump some fields ...
            LOG.info("Domain:" + subDomainMeta.getDomainText() + " URLCount:" + subDomainMeta.getUrlCount()
                    + " FetchedCount:" + subDomainMeta.getFetchedCount() + " PageRankCount:"
                    + subDomainMeta.getHasPageRankCount());

            // ok time to dive into a url list ...

            // query for a list of urls sorted by name
            LOG.info("Querying for URLList for Domain BY PR");
            FlexBuffer urlListBufferByPR = slaveIndex.queryURLListSortedByPR(domainId);

            if (urlListBufferByPR != null) {

                // read the list ...
                DataInputBuffer readerStream = new DataInputBuffer();
                readerStream.reset(urlListBufferByPR.get(), urlListBufferByPR.getCount());
                int totalItemCount = urlListBufferByPR.getCount() / 8;
                System.out.println("List BY  PR totalCount:" + totalItemCount);

                // initialize a fingerprint object to use for queries ...
                URLFPV2 queryFP = new URLFPV2();

                queryFP.setDomainHash(domainId);

                DataInputBuffer metadataReaderStream = new DataInputBuffer();
                // iterate the first N items ranked by page rank
                for (int i = 0; i < Math.min(10, totalItemCount); ++i) {

                    queryFP.setUrlHash(readerStream.readLong());

                    // and for metadata
                    MetadataOut urlMetadata = masterIndex.queryMetadataAndURLGivenFP(queryFP);

                    if (urlMetadata != null) {

                        // decode the url
                        String url = urlMetadata.url.toString();

                        System.out.println("URL for FP:" + queryFP.getUrlHash() + " is:" + url);
                        if (urlMetadata.datumAndMetadataBytes.getLength() == 0) {
                            System.out.println("URL for FP:" + queryFP.getUrlHash() + " had no METADATA!!");
                        } else {

                            // explode metadata
                            CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();

                            metadataReaderStream.reset(urlMetadata.datumAndMetadataBytes.getBytes(),
                                    urlMetadata.datumAndMetadataBytes.getOffset(),
                                    urlMetadata.datumAndMetadataBytes.getLength());
                            metadataObject.readFields(metadataReaderStream);

                            // ok at this point spit out stuff for this url
                            StringBuilder urlInfo = new StringBuilder();

                            urlInfo.append("    FetchStatus:"
                                    + CrawlDatum.getStatusName(metadataObject.getStatus()) + "\n");
                            urlInfo.append("    PageRank:" + metadataObject.getMetadata().getPageRank() + "\n");
                            urlInfo.append(
                                    "    ContentType:" + metadataObject.getMetadata().getContentType() + "\n");
                            urlInfo.append("    ArcFileInfoCount:"
                                    + metadataObject.getMetadata().getArchiveInfo().size());
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) {
                                urlInfo.append(
                                        "    HasLinkDataInfo:" + metadataObject.getMetadata().getLinkDBFileNo()
                                                + ":" + metadataObject.getMetadata().getLinkDBOffset());
                            }
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                                urlInfo.append("    HasINVLinkDataInfo:"
                                        + metadataObject.getMetadata().getInverseDBFileNo() + ":"
                                        + metadataObject.getMetadata().getInverseDBOffset());
                            }
                            System.out.println(urlInfo.toString());

                            // now if inverse link data is present ..
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                                // get it ...
                                System.out.println("Querying for Inlinks for FP:" + queryFP.getUrlHash());
                                FlexBuffer inlinks = slaveIndex.queryInlinksByFP(queryFP,
                                        metadataObject.getMetadata().getInverseDBFileNo(),
                                        metadataObject.getMetadata().getInverseDBOffset());

                                if (inlinks != null) {
                                    System.out.println("Found Inlink Buffer of Size:" + inlinks.getCount());
                                    FileSystem localFS = FileSystem.getLocal(conf);
                                    File testDir = new File("/tmp/dbIndexTest");
                                    File testFile = new File("/tmp/dbIndexTestFile");
                                    localFS.delete(new Path(testDir.getAbsolutePath()), true);
                                    localFS.delete(new Path(testFile.getAbsolutePath()), false);
                                    localFS.mkdirs(new Path(testDir.getAbsolutePath()));

                                    LOG.info("Creating Spill File of Inlinks");
                                    spillLinkDataIntoTempFileIndex(fs, localFS, conf, masterIndex, testDir,
                                            new Path(testFile.getAbsolutePath()), inlinks);
                                    LOG.info("Created Spill File of Inlinks");

                                    LOG.info("Reading Inlinks");
                                    // ok now open it up and dump the first few inlinks from the
                                    // spill file
                                    SequenceFile.Reader reader = new SequenceFile.Reader(localFS,
                                            new Path(testFile.getAbsolutePath()), conf);

                                    TextBytes key = new TextBytes();
                                    TriTextBytesTuple value = new TriTextBytesTuple();
                                    CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
                                    DataInputBuffer inputBuffer = new DataInputBuffer();

                                    try {
                                        int itemCount = 0;

                                        while (reader.next(key, value)) {

                                            if (value.getThirdValue().getLength() != 0) {
                                                inputBuffer.reset(value.getThirdValue().getBytes(), 0,
                                                        value.getThirdValue().getLength());
                                                metadata.readFields(inputBuffer);
                                                System.out.println("INLINK:" + key.toString()
                                                        + " METADATA STATUS:"
                                                        + CrawlDatum.getStatusName(metadata.getStatus()));
                                            } else {
                                                System.out.println("INLINK:" + key.toString() + " NOMETADATA");
                                            }

                                            if (++itemCount == 500) {
                                                break;
                                            }
                                        }
                                    } finally {
                                        reader.close();
                                    }

                                    LOG.info("Done Reding Inlinks");
                                }
                            }
                        }
                    } else {
                        LOG.error("Query for FP:" + queryFP.getUrlHash() + " returned NULL URL");
                    }
                }
            }
        }

    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }
}