Example usage for org.apache.hadoop.io DataInputBuffer DataInputBuffer

Introduction

On this page you can find example usage for org.apache.hadoop.io DataInputBuffer DataInputBuffer.

Prototype

public DataInputBuffer() 

Document

Constructs a new empty buffer.
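
For orientation, the snippet below is a minimal, self-contained sketch (not taken from the examples on this page) of the pattern most of the examples follow: serialize data into a DataOutputBuffer, then reset() an initially empty DataInputBuffer over the resulting byte array and read the values back. Only DataInputBuffer, DataOutputBuffer and Text are real Hadoop classes; the class name and sample values are illustrative.

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class DataInputBufferExample {

    public static void main(String[] args) throws IOException {
        // serialize a Writable and a primitive into an in-memory output buffer
        DataOutputBuffer out = new DataOutputBuffer();
        new Text("hello").write(out);
        out.writeLong(42L);

        // wrap the written bytes with an (initially empty) DataInputBuffer
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), 0, out.getLength());

        // read the values back in the same order they were written
        Text text = new Text();
        text.readFields(in);
        long number = in.readLong();

        System.out.println(text + " " + number); // prints: hello 42
    }
}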

Usage

From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License:Open Source License

private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

    SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
            CrawlEnvironment.getHadoopConfig());

    ValueBytes valueBytes = indexReader.createValueBytes();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataOutputBuffer finalOutputStream = new DataOutputBuffer();
    DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
    URLFP fp = new URLFP();

    try {
        while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {

            keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
            // read fingerprint ...
            fp.readFields(keyBuffer);
            // write hash only
            finalOutputStream.writeLong(fp.getUrlHash());
            uncompressedValueBytes.reset();
            // write value bytes to intermediate buffer ...
            valueBytes.writeUncompressedBytes(uncompressedValueBytes);
            // write out uncompressed length
            WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
            // write out bytes
            finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
        }
        // delete existing ...
        cacheFilePath.delete();
        // compute crc ...
        CRC32 crc = new CRC32();
        crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
        // open final output stream
        DataOutputStream fileOutputStream = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(cacheFilePath)));

        try {
            fileOutputStream.writeLong(crc.getValue());
            fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
            fileOutputStream.flush();
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            fileOutputStream.close();
            fileOutputStream = null;
            cacheFilePath.delete();
            throw e;
        } finally {
            if (fileOutputStream != null) {
                fileOutputStream.close();
            }
        }
    } finally {
        if (indexReader != null) {
            indexReader.close();
        }
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License:Open Source License

private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
        ItemUpdater targetList) throws IOException {

    // ok copy stuff locally if possible ...
    File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
    File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
    File localBloomFilterPath = new File(getLocalDataDir(),
            CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

    // ok copy local first
    if (!localIndexPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
                + localIndexPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localIndexPath.delete();
            throw e;
        }
    }
    if (!localDataPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
                + localDataPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localDataPath.delete();
            throw e;
        }

    }
    if (!localBloomFilterPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
                + localBloomFilterPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localBloomFilterPath.delete();
            throw e;
        }

    }

    // ok open local
    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
            new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

    try {
        URLFP firstIndexKey = null;
        URLFP lastIndexKey = new URLFP();
        LongWritable position = new LongWritable();
        while (indexReader.next(lastIndexKey, position)) {
            if (firstIndexKey == null) {
                try {
                    firstIndexKey = (URLFP) lastIndexKey.clone();
                } catch (CloneNotSupportedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }

        LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
                + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
                + lastIndexKey.getUrlHash());

        URLFP criteriaFirstKey = criteria.first();
        URLFP criteriaLastKey = criteria.last();

        if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
            LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
            LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash()
                    + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:"
                    + criteriaLastKey.getDomainHash() + " URLHash:" + criteriaLastKey.getUrlHash());
            return;
        }
    } finally {
        indexReader.close();
    }

    LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");
    // load bloom filter
    FSDataInputStream bloomFilterStream = localFileSystem
            .open(new Path(localBloomFilterPath.getAbsolutePath()));

    int hitCount = 0;

    try {
        URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

        URLFP fpOut = new URLFP();
        ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
        DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
        ValueBytes valueBytes = null;
        DataInputBuffer valueReader = new DataInputBuffer();
        DataOutputBuffer keyBytes = new DataOutputBuffer();
        DataInputBuffer keyReader = new DataInputBuffer();

        URLFP lastFP = null;

        outerLoop:
        // now iterate each item in the criteria
        for (URLFP targetFP : criteria) {
            // if fingerprint is present in filter ...
            if (filter.isPresent(targetFP)) {
                // check to see if reader is initialized ...
                if (reader == null) {
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
                            CrawlEnvironment.getHadoopConfig());
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    valueBytes = reader.createValueBytes();
                }

                // if last read fingerprint was not null ...
                if (lastFP != null) {
                    // does it match the current item
                    if (lastFP.compareTo(targetFP) == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);
                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + lastFP.getUrlHash()
                                + " File:" + dataFilePath);
                        // if so, null out last fp
                        lastFP = null;
                        // and update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;

                        continue;
                    }
                }

                // ok at this point .. read the next item in the list ...
                lastFP = null;

                while (reader.nextRaw(keyBytes, valueBytes) != -1) {
                    // init reader ...
                    keyReader.reset(keyBytes.getData(), keyBytes.getLength());
                    // read key
                    fpOut.readFields(keyReader);
                    // reset output buffer
                    keyBytes.reset();

                    // LOG.info("LIST:" + listId +" nextRaw Returned DH:" +
                    // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
                    // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());
                    // compare it to target ...
                    int result = fpOut.compareTo(targetFP);
                    // ok does it match .. ?
                    if (result == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);

                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash()
                                + " File:" + dataFilePath);
                        // update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;
                        // and break to outer loop
                        continue outerLoop;
                    } else if (result == 1) {
                        // LOG.info("LIST:" + listId +
                        // " FP Comparison Returned 1. Going to OuterLoop");
                        // update last FP
                        lastFP = fpOut;
                        // continue outer loop
                        continue outerLoop;
                    } else {
                        // otherwise skip
                    }
                }
                // ok if we got here .. we are done reading the sequence file and did
                // not find a trailing match
                LOG.warn("LIST:" + listId
                        + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positive result (DomainHash:"
                        + targetFP.getDomainHash() + " FP:" + targetFP.getUrlHash() + ")");
                // break out of outer loop

                break;
            }
        }
    } finally {
        bloomFilterStream.close();

        if (reader != null) {
            reader.close();
        }

        LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

/**
 * Initialize a new CrawlList object from a given source file of urls
 *
 * @param manager         - reference to the crawl history log manager
 * @param listId          - the unique id to assign to this list
 * @param sourceURLFile   - the file containing the list of urls that we should add to this list
 * @param refreshInterval - the refresh interval for items in this list
 * @throws IOException
 */
public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
        throws IOException {

    _manager = manager;

    _listState = LoadState.REALLY_LOADING;

    // initialize a new list id 
    _listId = listId;

    LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());

    //establish file names 
    initializeListFileNames();

    sourceURLFile.renameTo(_listURLDataFile);

    FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);

    try {

        // set we will use to hold all fingerprints generated 
        TreeSet<URLFP> urlSet = new TreeSet<URLFP>();

        // create temp files ...
        File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));

        // create mergesortspillwriter 
        SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
                new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null,
                false);

        try {

            MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                    CrawlEnvironment.getHadoopConfig(), spillwriter,
                    FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                    new Path(manager.getLocalDataDir().getAbsolutePath()), null,
                    new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

                        DataInputBuffer _key1Buffer = new DataInputBuffer();
                        DataInputBuffer _key2Buffer = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {

                            _key1Buffer.reset(key1Data, key1Offset, key1Length);
                            _key2Buffer.reset(key2Data, key2Offset, key2Length);

                            _key1Buffer.skip(2); // skip version and 1 byte id
                            _key2Buffer.skip(2); // skip version and 1 byte id

                            int domainHash1 = WritableUtils.readVInt(_key1Buffer);
                            int domainHash2 = WritableUtils.readVInt(_key2Buffer);

                            _key1Buffer.skip(1); // skip 1 byte id 
                            _key2Buffer.skip(1); // skip 1 byte id 

                            long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
                            long fingerprint2 = WritableUtils.readVLong(_key2Buffer);

                            int result = ((Integer) domainHash1).compareTo(domainHash2);

                            if (result == 0) {
                                result = ((Long) fingerprint1).compareTo(fingerprint2);
                            }

                            return result;
                        }

                        @Override
                        public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                                ProxyCrawlHistoryItem value2) {
                            return key1.compareTo(key2);
                        }
                    }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

            try {

                LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));

                String line = null;
                int lineNumber = 0;
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
                while ((line = reader.readLine()) != null) {
                    ++lineNumber;
                    if (line.length() != 0 && !line.startsWith("#")) {
                        URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);

                        if (fingerprint != null) {

                            if (!urlSet.contains(fingerprint)) {
                                // and add fingerprint to set 
                                urlSet.add(fingerprint);
                                // initialize item 
                                item.clear();
                                item.setOriginalURL(line);
                                // and spill to merger / sorter .. 
                                merger.spillRecord(fingerprint, item);
                            }
                        } else {
                            LOG.error("*** LIST:" + getListId() + " Invalid URL Encountered at Line:"
                                    + lineNumber + " URL:" + line);
                        }
                    }
                }
                LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
            } finally {
                merger.close();
            }
        } finally {
            if (spillwriter != null)
                spillwriter.close();
        }
        LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
        // generate bloom filter ...  
        _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);

        for (URLFP fingerprint : urlSet) {
            _bloomFilter.add(fingerprint);
        }
        LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
        // serialize it
        FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
        try {
            _bloomFilter.serialize(bloomFilterStream);
        } finally {
            bloomFilterStream.flush();
            bloomFilterStream.close();
        }

        LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
        // now initialize value map and string maps based on output sequence file ... 
        SequenceFile.Reader reader = new SequenceFile.Reader(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

        LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
                + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        // OK, Allocate room for fixed data file upfront 
        DataOutputBuffer valueStream = new DataOutputBuffer(
                urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");

        try {

            //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
            RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");

            try {
                URLFP urlFP = new URLFP();
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();

                // read fingerprints ... 
                while (reader.next(urlFP, item)) {
                    // write out fixed data structure and strings 
                    writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
                }
            } finally {
                //valueStream.flush();
                //valueStream.close();
                stringsStream.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");

        LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
                + " EXPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
            throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength()
                    + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        }
        // initialize temp data buffer variables 
        _tempFixedDataBuffer = valueStream.getData();
        _tempFixedDataBufferSize = valueStream.getLength();

        // update metadata 
        _metadata.setRefreshInterval(refreshInterval);
        _metadata.setUrlCount(urlSet.size());

        // setup version 
        _metadata.setVersion(1);

        // and write to disk 
        writeMetadataToDisk();

        // mark state as loaded ... 
        _listState = LoadState.LOADED;

        LOG.info("*** LIST:" + getListId() + " SYNCING");
        // reconcile with history log
        _manager.syncList(this.getListId(), urlSet, this);
        LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");

        // write metadata to disk again
        writeMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");

        // and finally flush fixed data to disk 
        FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);

        try {
            synchronized (this) {
                int blockSize = 1 << 20;
                long bytesCopied = 0;
                for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
                    int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
                    finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
                    bytesCopied += bytesToCopy;
                }
                // validate bytes copied 
                if (bytesCopied != _tempFixedDataBufferSize) {
                    throw new IOException("Buffer Size:" + _tempFixedDataBufferSize
                            + " Does not Match BytesCopied:" + bytesCopied);
                }

                // ok release the buffer 
                _tempFixedDataBuffer = null;
                _tempFixedDataBufferSize = 0;

                LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
            }

        } finally {
            finalDataStream.flush();
            finalDataStream.close();
        }

        // load sub domain metadata from disk ... 
        loadSubDomainMetadataFromDisk();

    } catch (IOException e) {
        LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
                + CCStringUtils.stringifyException(e));

        _fixedDataFile.delete();
        _variableDataFile.delete();
        _bloomFilterData.delete();

        _listState = LoadState.ERROR;

        throw e;
    } finally {
        urlInputStream.close();
    }

}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException {

    // see if state is cached in memory ...
    boolean loadedFromMemory = false;

    synchronized (this) {
        if (_tempFixedDataBuffer != null) {

            loadedFromMemory = true;

            int low = 0;
            int high = (int) (_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

            OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
            DataInputBuffer inputBuffer = new DataInputBuffer();

            int iterationNumber = 0;

            while (low <= high) {

                ++iterationNumber;

                int mid = low + ((high - low) / 2);

                inputBuffer.reset(_tempFixedDataBuffer, 0, _tempFixedDataBufferSize);
                inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);

                // deserialize 
                itemOut.deserialize(inputBuffer);

                // now compare it against desired hash value ...
                int comparisonResult = itemOut.compareFingerprints(fingerprint);

                if (comparisonResult > 0)
                    high = mid - 1;
                else if (comparisonResult < 0)
                    low = mid + 1;
                else {

                    // cache offset 
                    itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;

                    // LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
                    // and return item 
                    return itemOut;
                }
            }
            //LOG.error("Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");
        }
    }

    if (!loadedFromMemory) {
        //load from disk 

        //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + fingerprint.getUrlHash());

        RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw");

        // allocate buffer upfront 
        byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE];
        DataInputBuffer inputStream = new DataInputBuffer();

        //LOG.info("Opened Data File. Searching for match");
        try {

            int low = 0;
            int high = (int) (file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

            OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();

            int iterationNumber = 0;

            while (low <= high) {

                ++iterationNumber;

                int mid = low + ((high - low) / 2);

                // seek to proper location 
                file.seek(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
                // read the data structure 
                file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length);
                // map location in file 
                //MappedByteBuffer memoryBuffer = file.getChannel().map(MapMode.READ_ONLY,mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE,OnDiskCrawlHistoryItem.ON_DISK_SIZE);
                //DataInputStream  inputStream = new DataInputStream(new ByteBufferInputStream(memoryBuffer));
                inputStream.reset(onDiskItemBuffer, 0, OnDiskCrawlHistoryItem.ON_DISK_SIZE);

                // deserialize 
                itemOut.deserialize(inputStream);

                // memoryBuffer = null;
                //inputStream = null;

                // now compare it against desired hash value ...
                int comparisonResult = itemOut.compareFingerprints(fingerprint);

                if (comparisonResult > 0)
                    high = mid - 1;
                else if (comparisonResult < 0)
                    low = mid + 1;
                else {

                    // cache offset 
                    itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;

                    // LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
                    // and return item 
                    return itemOut;
                }
            }
            //LOG.error("******Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");

            //DEBUG ONLY !
            // dumpFixedDataFile();
        } finally {
            file.close();
        }
    }
    return null;
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

private final int getOffsetForSubDomainData(long domainHash) throws IOException {
    DataInputBuffer inputBuffer = new DataInputBuffer();

    int low = 0;
    int high = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1;

    while (low <= high) {

        int mid = low + ((high - low) / 2);

        inputBuffer.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength());
        inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);

        // deserialize
        long hash = inputBuffer.readLong();

        // now compare it against desired hash value ...
        int comparisonResult = ((Long) hash).compareTo(domainHash);

        if (comparisonResult > 0)
            high = mid - 1;
        else if (comparisonResult < 0)
            low = mid + 1;
        else {
            return inputBuffer.readInt();
        }
    }
    throw new IOException("NOT-FOUND!");
}
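
The method above uses a recurring DataInputBuffer idiom: repeatedly reset() the buffer over the same backing array and skip() ahead to a fixed-size record before reading it. Below is a condensed sketch of that random-access pattern; the 12-byte record layout (a long key followed by an int value) and the linear scan are illustrative simplifications, not part of the original code.

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class FixedRecordLookup {

    // hypothetical record layout: an 8-byte long key followed by a 4-byte int value
    static final int RECORD_SIZE = 12;

    /** Returns the int stored next to the given key, or -1 if the key is absent. */
    static int lookup(DataOutputBuffer table, long key) throws IOException {
        DataInputBuffer in = new DataInputBuffer();
        int recordCount = table.getLength() / RECORD_SIZE;
        for (int i = 0; i < recordCount; i++) {
            // rewind over the whole backing array, then skip straight to record i
            in.reset(table.getData(), 0, table.getLength());
            in.skip((long) i * RECORD_SIZE);
            if (in.readLong() == key) {
                return in.readInt();
            }
        }
        return -1;
    }
}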

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

void resetSubDomainCounts() throws IOException {

    LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

    if (_subDomainMetadataFile.exists()) {

        LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

        try {
            // skip version 
            file.read();
            // read item count 
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            for (int i = 0; i < itemCount; ++i) {

                long orignalPos = file.getFilePointer();
                file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
                inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                // ok reset everything except hashes and first/last url pointers 
                int urlCount = newMetadata.getUrlCount();
                long firstRecordOffset = newMetadata.getFirstRecordOffset();
                long lastRecordOffset = newMetadata.getLastRecordOffset();
                String domainName = newMetadata.getDomainName();
                long domainHash = newMetadata.getDomainHash();

                // reset 
                newMetadata.clear();
                // restore 
                newMetadata.setUrlCount(urlCount);
                newMetadata.setFirstRecordOffset(firstRecordOffset);
                newMetadata.setLastRecordOffset(lastRecordOffset);
                newMetadata.setDomainName(domainName);
                newMetadata.setDomainHash(domainHash);

                // serialize it ... 
                outputBuffer.reset();
                newMetadata.serialize(outputBuffer, new BinaryProtocol());
                // write it back to disk 
                file.seek(orignalPos);
                // and rewrite it ... 
                file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE RESETTING SUBDOMAIN METADATA QUEUE COUNTS");
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

void loadSubDomainMetadataFromDisk() throws IOException {
    LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ...  ");
    if (_subDomainMetadataFile.exists()) {

        LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {
            // skip version 
            file.read();
            // read item count 
            int itemCount = file.readInt();

            LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
            for (int i = 0; i < itemCount; ++i) {

                long orignalPos = file.getFilePointer();
                file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                try {
                    newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                } catch (Exception e) {
                    LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
                            + CCStringUtils.stringifyException(e));
                }
                idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
            }

            // write lookup table 
            _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
            for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
                _offsetLookupTable.writeLong(entry.getKey());
                _offsetLookupTable.writeInt(entry.getValue());
            }
        } finally {
            file.close();
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
    } else {

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

        RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
        RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

        try {

            //ok rebuild top level metadata as well 
            _metadata.clear();

            OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

            int processedCount = 0;
            while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

                long position = fixedDataReader.getFilePointer();

                // store offset in item 
                item._fileOffset = position;
                // load from disk 
                item.deserialize(fixedDataReader);
                try {
                    // seek to string data 
                    stringDataReader.seek(item._stringsOffset);
                    // and skip buffer length 
                    WritableUtils.readVInt(stringDataReader);
                    // and read primary string 
                    String url = stringDataReader.readUTF();

                    // get metadata object for subdomain 
                    CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

                    // increment url count 
                    subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);

                    // increment top level metadata count 
                    _metadata.setUrlCount(_metadata.getUrlCount() + 1);

                    // update top level metadata ..
                    updateMetadata(item, _metadata, 0);

                    // update sub-domain metadata object  from item data
                    updateMetadata(item, subDomainMetadata, 0);

                    ++processedCount;
                } catch (IOException e) {
                    LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
                    LOG.error("Exception:" + CCStringUtils.stringifyException(e));
                    LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                            + stringDataReader.getFilePointer());
                }

                if (processedCount % 10000 == 0) {
                    LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
                }
            }

            // ok commit top level metadata to disk as well 
            writeMetadataToDisk();

        } catch (IOException e) {
            LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
                    + CCStringUtils.stringifyException(e));
            LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
                    + stringDataReader.getFilePointer());
            _queueState = QueueState.QUEUED;
        } finally {
            fixedDataReader.close();
            stringDataReader.close();
        }
        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");

        // write metadata to disk
        writeInitialSubDomainMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
    synchronized (_metadata) {

        ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();

        try {
            synchronized (_subDomainMetadataFile) {
                RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
                DataInputBuffer inputBuffer = new DataInputBuffer();
                byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

                try {
                    // skip version 
                    file.read();
                    // read item count 
                    int itemCount = file.readInt();

                    int i = offset;
                    int end = Math.min(i + count, itemCount);

                    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

                    if (i < itemCount) {

                        file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));

                        CrawlListMetadata newMetadata = new CrawlListMetadata();

                        for (; i < end; ++i) {

                            long orignalPos = file.getFilePointer();
                            file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
                            inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
                            newMetadata.deserialize(inputBuffer, new BinaryProtocol());
                            itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
                        }
                    }
                } finally {
                    file.close();
                }
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
        }
        LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");

        return itemsOut;
    }
}

From source file:org.commoncrawl.service.pagerank.slave.PageRankUtils.java

License:Open Source License

public static void distributeRank(final PRValueMap valueMap, final Path outlinksFile,
        final boolean outlinksIsRemote, File localOutputDir, String remoteOutputDir, int thisNodeIdx,
        int nodeCount, int iterationNumber, final ProgressAndCancelCheckCallback progressCallback)
        throws IOException {

    final Configuration conf = CrawlEnvironment.getHadoopConfig();

    Vector<PRValueOutputStream> outputStreamVector = new Vector<PRValueOutputStream>();

    // allocate a queue ... 
    final LinkedBlockingQueue<OutlinkItem> queue = new LinkedBlockingQueue<OutlinkItem>(20000);

    try {

        // start the loader thread ... 
        Thread loaderThread = new Thread(new Runnable() {

            final BytesWritable key = new BytesWritable();
            final BytesWritable value = new BytesWritable();

            final DataInputBuffer keyStream = new DataInputBuffer();
            final DataInputBuffer valueStream = new DataInputBuffer();

            @Override
            public void run() {
                LOG.info("Opening Outlinks File at:" + outlinksFile);
                SequenceFile.Reader reader = null;
                try {

                    FileSystem fsForOutlinksFile = null;
                    if (outlinksIsRemote) {
                        fsForOutlinksFile = CrawlEnvironment.getDefaultFileSystem();
                    } else {
                        fsForOutlinksFile = FileSystem.getLocal(conf);
                    }

                    long bytesToReadTotal = fsForOutlinksFile.getLength(outlinksFile);

                    reader = new SequenceFile.Reader(fsForOutlinksFile, outlinksFile, conf);
                    OutlinkItem item = new OutlinkItem();
                    int itemCount = 0;
                    boolean isCancelled = false;
                    while (!isCancelled && reader.next(key, value)) {

                        keyStream.reset(key.get(), 0, key.getLength());
                        valueStream.reset(value.get(), 0, value.getLength());

                        //populate item from data 
                        readURLFPFromStream(keyStream, item.targetFingerprint);
                        item.urlCount = readURLFPAndCountFromStream(valueStream, item.sourceFingerprint);

                        try {
                            long blockTimeStart = System.currentTimeMillis();
                            queue.put(item);
                            long blockTimeEnd = System.currentTimeMillis();
                        } catch (InterruptedException e) {
                        }
                        item = new OutlinkItem();

                        if (itemCount++ % 10000 == 0 && progressCallback != null) {

                            float percentComplete = (float) reader.getPosition() / (float) bytesToReadTotal;
                            if (progressCallback.updateProgress(percentComplete)) {
                                LOG.info("Cancel check callback returned true. Cancelling outlink item load");
                                isCancelled = true;
                            }
                        }
                    }
                    item.sourceFingerprint = null;
                    item.targetFingerprint = null;

                    // add empty item 
                    try {
                        if (!isCancelled) {
                            queue.put(item);
                        } else {
                            queue.put(new OutlinkItem(new IOException("Operation Cancelled")));
                        }
                    } catch (InterruptedException e) {
                    }

                } catch (IOException e) {
                    // add error item to queue.
                    try {
                        queue.put(new OutlinkItem(e));
                    } catch (InterruptedException e1) {
                    }
                } finally {
                    if (reader != null)
                        try {
                            reader.close();
                        } catch (IOException e) {
                        }
                }
            }

        });

        loaderThread.start();

        // first things first ... initialize output stream vector
        FileSystem fileSystem = buildDistributionOutputStreamVector(true,
                getOutlinksBaseName(thisNodeIdx, iterationNumber), localOutputDir, remoteOutputDir, thisNodeIdx,
                nodeCount, outputStreamVector);

        try {
            // open outlinks file .
            LOG.info("Iterating Items in Outlinks File and Writing Test Value");

            int itemCount = 0;
            int totalOutlinkCount = 0;
            int iterationOutlinkCount = 0;
            long iterationStart = System.currentTimeMillis();
            long timeStart = iterationStart;

            boolean done = false;

            ArrayList<OutlinkItem> items = new ArrayList<OutlinkItem>();
            // start iterating outlinks 
            while (!done) {

                //OutlinkItem item = null;

                //try {
                long waitTimeStart = System.currentTimeMillis();
                queue.drainTo(items);
                long waitTimeEnd = System.currentTimeMillis();
                //} catch (InterruptedException e) {
                //}

                for (OutlinkItem item : items) {
                    if (item.error != null) {
                        LOG.info(
                                "Loader Thread Returned Error:" + CCStringUtils.stringifyException(item.error));
                        throw item.error;
                    } else if (item.sourceFingerprint == null) {
                        LOG.info("Loader Thread Indicated EOF via empty item");
                        done = true;
                    } else {
                        ++itemCount;

                        /*
                        LOG.info("SourceFP-DomainHash:" + item.sourceFingerprint.getDomainHash() + " URLHash:" + item.sourceFingerprint.getUrlHash() 
                              + " PartitionIdx:" + ((item.sourceFingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.PR_NUMSLAVES) );
                        */

                        // now get pr value for fingerprint (random seek in memory here!!!)
                        float prValue = valueMap.getPRValue(item.sourceFingerprint)
                                / (float) Math.max(item.urlCount, 1);

                        // write value out 
                        int nodeIndex = (item.targetFingerprint.hashCode() & Integer.MAX_VALUE) % nodeCount;
                        outputStreamVector.get(nodeIndex).writePRValue(item.targetFingerprint,
                                item.sourceFingerprint, prValue);

                        if (itemCount % 10000 == 0) {

                            long timeEnd = System.currentTimeMillis();
                            int milliseconds = (int) (timeEnd - iterationStart);

                            LOG.info("Distribute PR for 10000 Items with:" + iterationOutlinkCount
                                    + " Outlinks Took:" + milliseconds + " Milliseconds" + " QueueCount:"
                                    + queue.size());

                            iterationStart = System.currentTimeMillis();
                            totalOutlinkCount += iterationOutlinkCount;
                            iterationOutlinkCount = 0;
                        }

                    }
                }
                items.clear();
            }

            totalOutlinkCount += iterationOutlinkCount;

            LOG.info("Distribute Finished for a total of:" + itemCount + " Items with:" + totalOutlinkCount
                    + " Outlinks Took:" + (System.currentTimeMillis() - timeStart) + " Milliseconds");

            LOG.info("Waiting for Loader Thread to Die");
            try {
                loaderThread.join();
            } catch (InterruptedException e) {
            }
            LOG.info("Loader Thread Died - Moving on...");
        } finally {

            for (PRValueOutputStream info : outputStreamVector) {

                if (info != null) {
                    info.close(false);
                }
            }

            if (fileSystem != null) {
                fileSystem.close();
            }
        }
    } catch (IOException e) {
        LOG.error("Exception caught while distributing outlinks:" + CCStringUtils.stringifyException(e));
        throw e;
    }
}

From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java

License:Open Source License

private static void spillLinkDataIntoTempFileIndex(FileSystem remoteFileSystem, FileSystem localFileSystem,
        Configuration conf, DatabaseIndexV2.MasterDatabaseIndex index, File tempFilePath, Path outputFilePath,
        FlexBuffer linkData) throws IOException {

    SequenceFileSpillWriter<TextBytes, TriTextBytesTuple> outputWriter = new SequenceFileSpillWriter<TextBytes, TriTextBytesTuple>(
            localFileSystem, conf, outputFilePath, TextBytes.class, TriTextBytesTuple.class,
            new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                    PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFilePath)),
            true);

    try {
        // ok create merge sort spill writer ...
        MergeSortSpillWriter<TextBytes, TriTextBytesTuple> merger = new MergeSortSpillWriter<TextBytes, TriTextBytesTuple>(
                conf, outputWriter, localFileSystem, new Path(tempFilePath.getAbsolutePath()), null,
                new RawKeyValueComparator<TextBytes, TriTextBytesTuple>() {

                    DataInputBuffer stream1 = new DataInputBuffer();
                    DataInputBuffer stream2 = new DataInputBuffer();
                    TriTextBytesTuple tuple1 = new TriTextBytesTuple();
                    TriTextBytesTuple tuple2 = new TriTextBytesTuple();

                    @Override
                    public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                            int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                            int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                            throws IOException {

                        stream1.reset(value1Data, value1Offset, value1Length);
                        stream2.reset(value2Data, value2Offset, value2Length);

                        // ok skip url
                        int url1Length = WritableUtils.readVInt(stream1);
                        stream1.skip(url1Length);
                        int url2Length = WritableUtils.readVInt(stream2);
                        stream2.skip(url2Length);
                        // ok now read optimized page rank stuffed in second tuple
                        WritableUtils.readVInt(stream1);
                        WritableUtils.readVInt(stream2);
                        // now read page rank
                        float pageRank1 = stream1.readFloat();
                        float pageRank2 = stream2.readFloat();

                        return (pageRank1 == pageRank2) ? 0 : (pageRank1 < pageRank2) ? -1 : 1;

                    }

                    @Override
                    public int compare(TextBytes key1, TriTextBytesTuple value1, TextBytes key2,
                            TriTextBytesTuple value2) {
                        stream1.reset(value1.getSecondValue().getBytes(), value1.getSecondValue().getLength());
                        stream2.reset(value2.getSecondValue().getBytes(), value2.getSecondValue().getLength());

                        try {
                            float pr1 = stream1.readFloat();
                            float pr2 = stream2.readFloat();

                            return (pr1 == pr2) ? 0 : pr1 < pr2 ? -1 : 1;

                        } catch (IOException e) {
                            LOG.error(CCStringUtils.stringifyException(e));
                            throw new RuntimeException();
                        }
                    }
                }, TextBytes.class, TriTextBytesTuple.class, false, null);

        try {
            long timeStart = System.currentTimeMillis();
            System.out.println(".Running Merger to resolve tuple set");
            index.bulkQueryURLAndMetadataGivenInputStream(remoteFileSystem, conf, tempFilePath, linkData,
                    merger);
            long timeEnd = System.currentTimeMillis();
            LOG.info(".Merged Successfully in:" + (timeEnd - timeStart));
        } finally {
            LOG.info("Closing Merger");
            merger.close();
        }
    } finally {
        LOG.info("Closing Writer");
        outputWriter.close();
    }
}