List of usage examples for the org.apache.hadoop.io.DataInputBuffer constructor
public DataInputBuffer()
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {
  SequenceFile.Reader reader = null;
  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");
  File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

  SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
      CrawlEnvironment.getHadoopConfig());

  ValueBytes valueBytes = indexReader.createValueBytes();
  DataOutputBuffer keyBytes = new DataOutputBuffer();
  DataInputBuffer keyBuffer = new DataInputBuffer();
  DataOutputBuffer finalOutputStream = new DataOutputBuffer();
  DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
  URLFP fp = new URLFP();

  try {
    while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {
      keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
      // read fingerprint ...
      fp.readFields(keyBuffer);
      // write hash only
      finalOutputStream.writeLong(fp.getUrlHash());
      uncompressedValueBytes.reset();
      // write value bytes to intermediate buffer ...
      valueBytes.writeUncompressedBytes(uncompressedValueBytes);
      // write out uncompressed length
      WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
      // write out bytes
      finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
    }
    // delete existing ...
    cacheFilePath.delete();
    // compute crc ...
    CRC32 crc = new CRC32();
    crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
    // open final output stream
    DataOutputStream fileOutputStream = new DataOutputStream(
        new BufferedOutputStream(new FileOutputStream(cacheFilePath)));
    try {
      fileOutputStream.writeLong(crc.getValue());
      fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
      fileOutputStream.flush();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      fileOutputStream.close();
      fileOutputStream = null;
      cacheFilePath.delete();
      throw e;
    } finally {
      if (fileOutputStream != null) {
        fileOutputStream.close();
      }
    }
  } finally {
    if (indexReader != null) {
      indexReader.close();
    }
  }
}
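The core DataInputBuffer idiom in the example above is the zero-copy round trip: serialize into a DataOutputBuffer, then point a DataInputBuffer at the same backing array with reset(data, 0, length) and call readFields(). A minimal, self-contained sketch of that round trip, using a stock LongWritable instead of the CommonCrawl URLFP type (class name is made up for illustration):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;

public class RoundTripSketch {
  public static void main(String[] args) throws IOException {
    // serialize a Writable into a reusable output buffer
    DataOutputBuffer out = new DataOutputBuffer();
    new LongWritable(42L).write(out);

    // wrap the raw bytes with a DataInputBuffer and deserialize;
    // reset() re-points the buffer at the array without copying
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());

    LongWritable value = new LongWritable();
    value.readFields(in);
    System.out.println(value.get()); // 42
  }
}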
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
    ItemUpdater targetList) throws IOException {

  // ok copy stuff locally if possible ...
  File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
  File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
  File localBloomFilterPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

  SequenceFile.Reader reader = null;
  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");
  Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

  // ok copy local first
  if (!localIndexPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
        + localIndexPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localIndexPath.delete();
      throw e;
    }
  }
  if (!localDataPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
        + localDataPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localDataPath.delete();
      throw e;
    }
  }
  if (!localBloomFilterPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
        + localBloomFilterPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localBloomFilterPath.delete();
      throw e;
    }
  }

  // ok open local
  FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

  SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
      new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());
  try {
    URLFP firstIndexKey = null;
    URLFP lastIndexKey = new URLFP();
    LongWritable position = new LongWritable();
    while (indexReader.next(lastIndexKey, position)) {
      if (firstIndexKey == null) {
        try {
          firstIndexKey = (URLFP) lastIndexKey.clone();
        } catch (CloneNotSupportedException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
        + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
        + lastIndexKey.getUrlHash());

    URLFP criteriaFirstKey = criteria.first();
    URLFP criteriaLastKey = criteria.last();

    if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
      LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
      LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash() + " URLHash:"
          + criteriaFirstKey.getUrlHash() + " Last Domain:" + criteriaLastKey.getDomainHash() + " URLHash:"
          + criteriaLastKey.getUrlHash());
      return;
    }
  } finally {
    indexReader.close();
  }

  LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");

  // load bloom filter
  FSDataInputStream bloomFilterStream = localFileSystem.open(new Path(localBloomFilterPath.getAbsolutePath()));

  int hitCount = 0;

  try {
    URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

    URLFP fpOut = new URLFP();
    ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
    DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
    ValueBytes valueBytes = null;
    DataInputBuffer valueReader = new DataInputBuffer();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();

    URLFP lastFP = null;

    outerLoop:
    // now iterate each item in the criteria
    for (URLFP targetFP : criteria) {
      // if fingerprint is present in filter ...
      if (filter.isPresent(targetFP)) {
        // check to see if reader is initialzied ...
        if (reader == null) {
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
              + localDataPath.getAbsolutePath());
          reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
              CrawlEnvironment.getHadoopConfig());
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
              + localDataPath.getAbsolutePath());
          valueBytes = reader.createValueBytes();
        }

        // if last read fingerprint was not null ...
        if (lastFP != null) {
          // does it match the current item
          if (lastFP.compareTo(targetFP) == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + +lastFP.getUrlHash() + " File:"
                + dataFilePath);
            // if so, null out last fp
            lastFP = null;
            // and update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            continue;
          }
        }

        // ok at this point .. read the next item in the list ...
        lastFP = null;

        while (reader.nextRaw(keyBytes, valueBytes) != -1) {

          // init reader ...
          keyReader.reset(keyBytes.getData(), keyBytes.getLength());
          // read key
          fpOut.readFields(keyReader);
          // reset output buffer
          keyBytes.reset();

          // LOG.info("LIST:" + listId +" nextRaw Returned DH:" +
          // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
          // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());

          // compare it to target ...
          int result = fpOut.compareTo(targetFP);
          // ok does it match .. ?
          if (result == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash() + " File:"
                + dataFilePath);
            // update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            // and break to outer loop
            continue outerLoop;
          } else if (result == 1) {
            // LOG.info("LIST:" + listId +
            // " FP Comparison Returned 1. Going to OuterLoop");
            // update last FP
            lastFP = fpOut;
            // continue outer loop
            continue outerLoop;
          } else {
            // otherwise skip
          }
        }
        // ok if we got here .. we are done reading the sequence file and did
        // not find a trailing match
        LOG.warn("LIST:" + listId
            + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positivie result (DomainHash:"
            + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
        // break out of outer loop
        break;
      }
    }
  } finally {
    bloomFilterStream.close();

    if (reader != null) {
      reader.close();
    }

    LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
  }
}
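Most of the DataInputBuffer work above happens around SequenceFile.Reader.nextRaw(): the serialized key lands in a DataOutputBuffer, and a reusable DataInputBuffer is reset over those bytes to deserialize it. A stripped-down sketch of that raw scan with stock Hadoop types; the temp path and record values are made up for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.io.Text;

public class RawScanSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path path = new Path("/tmp/raw-scan-demo.seq"); // hypothetical location

    // write a tiny file so the raw scan below has something to read
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, LongWritable.class, Text.class);
    try {
      writer.append(new LongWritable(1), new Text("one"));
      writer.append(new LongWritable(2), new Text("two"));
    } finally {
      writer.close();
    }

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();
    ValueBytes valueBytes = reader.createValueBytes();
    LongWritable key = new LongWritable();
    try {
      while (reader.nextRaw(keyBytes, valueBytes) != -1) {
        // point the DataInputBuffer at the serialized key and deserialize it
        keyReader.reset(keyBytes.getData(), keyBytes.getLength());
        key.readFields(keyReader);
        System.out.println("key=" + key.get());
        keyBytes.reset(); // reuse the output buffer for the next record
      }
    } finally {
      reader.close();
    }
  }
}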
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/**
 * Initialize a new CrawlList object from a given input stream of urls
 *
 * @param manager - reference to the crawl history log manager
 * @param urlInputStream - the input stream containing the list of urls that we should add to this list ...
 * @throws IOException
 */
public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
    throws IOException {
  _manager = manager;

  _listState = LoadState.REALLY_LOADING;

  // initialize a new list id
  _listId = listId;

  LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());

  // establish file names
  initializeListFileNames();

  sourceURLFile.renameTo(_listURLDataFile);

  FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);

  try {
    // set we will use to hold all fingerprints generated
    TreeSet<URLFP> urlSet = new TreeSet<URLFP>();

    // create temp files ...
    File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));

    // create mergesortspillwriter
    SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
        FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
        new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null, false);

    try {
      MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
          CrawlEnvironment.getHadoopConfig(), spillwriter,
          FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
          new Path(manager.getLocalDataDir().getAbsolutePath()), null,
          new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

            DataInputBuffer _key1Buffer = new DataInputBuffer();
            DataInputBuffer _key2Buffer = new DataInputBuffer();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {

              _key1Buffer.reset(key1Data, key1Offset, key1Length);
              _key2Buffer.reset(key2Data, key2Offset, key2Length);

              _key1Buffer.skip(2); // skip verison, and 1 byte id
              _key2Buffer.skip(2); // skip verison, and 1 byte id

              int domainHash1 = WritableUtils.readVInt(_key1Buffer);
              int domainHash2 = WritableUtils.readVInt(_key2Buffer);

              _key1Buffer.skip(1); // skip 1 byte id
              _key2Buffer.skip(1); // skip 1 byte id

              long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
              long fingerprint2 = WritableUtils.readVLong(_key2Buffer);

              int result = ((Integer) domainHash1).compareTo(domainHash2);

              if (result == 0) {
                result = ((Long) fingerprint1).compareTo(fingerprint2);
              }

              return result;
            }

            @Override
            public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                ProxyCrawlHistoryItem value2) {
              return key1.compareTo(key2);
            }
          }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

      try {
        LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));

        String line = null;
        int lineNumber = 0;
        ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
        while ((line = reader.readLine()) != null) {
          ++lineNumber;
          if (line.length() != 0 && !line.startsWith("#")) {
            URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);

            if (fingerprint != null) {
              if (!urlSet.contains(fingerprint)) {
                // and add fingerprint to set
                urlSet.add(fingerprint);
                // initialize item
                item.clear();
                item.setOriginalURL(line);
                // and spill to merger / sorter ..
                merger.spillRecord(fingerprint, item);
              }
            } else {
              LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:" + lineNumber + " URL"
                  + line);
            }
          }
        }
        LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
      } finally {
        merger.close();
      }
    } finally {
      if (spillwriter != null)
        spillwriter.close();
    }

    LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
    // generate bloom filter ...
    _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);
    for (URLFP fingerprint : urlSet) {
      _bloomFilter.add(fingerprint);
    }

    LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
    // serialize it
    FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
    try {
      _bloomFilter.serialize(bloomFilterStream);
    } finally {
      bloomFilterStream.flush();
      bloomFilterStream.close();
    }

    LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
    // now initialize value map and string maps based on output sequence file ...
    SequenceFile.Reader reader = new SequenceFile.Reader(
        FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
        new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

    LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
        + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    // OK, Allocate room for fixed data file upfront
    DataOutputBuffer valueStream = new DataOutputBuffer(urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
    LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");

    try {
      //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
      RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");
      try {
        URLFP urlFP = new URLFP();
        ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();

        // read fingerprints ...
        while (reader.next(urlFP, item)) {
          // write out fixed data structure and strings
          writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
        }
      } finally {
        //valueStream.flush();
        //valueStream.close();
        stringsStream.close();
      }
    } finally {
      reader.close();
    }

    LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");

    LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
        + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
      throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength() + " != URLSetSize:"
          + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    }

    // initialize temp data buffer variables
    _tempFixedDataBuffer = valueStream.getData();
    _tempFixedDataBufferSize = valueStream.getLength();

    // update metadata
    _metadata.setRefreshInterval(refreshInterval);
    _metadata.setUrlCount(urlSet.size());

    // setup version
    _metadata.setVersion(1);

    // and write to disk
    writeMetadataToDisk();

    // mark state as loaded ...
    _listState = LoadState.LOADED;

    LOG.info("*** LIST:" + getListId() + " SYNCING");
    // reconcile with history log
    _manager.syncList(this.getListId(), urlSet, this);

    LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");

    // write metdata to disk again
    writeMetadataToDisk();

    LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");

    // and finally flush fixed data to disk
    FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);
    try {
      synchronized (this) {
        int blockSize = 1 << 20;
        long bytesCopied = 0;
        for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
          int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
          finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
          bytesCopied += bytesToCopy;
        }
        // validate bytes copied
        if (bytesCopied != _tempFixedDataBufferSize) {
          throw new IOException("Buffer Size:" + _tempFixedDataBufferSize + " Does not Match BytesCopied:"
              + bytesCopied);
        }
        // ok release the buffer
        _tempFixedDataBuffer = null;
        _tempFixedDataBufferSize = 0;

        LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
      }
    } finally {
      finalDataStream.flush();
      finalDataStream.close();
    }

    // load sub domain metadata from disk ...
    loadSubDomainMetadataFromDisk();

  } catch (IOException e) {
    LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
        + CCStringUtils.stringifyException(e));

    _fixedDataFile.delete();
    _variableDataFile.delete();
    _bloomFilterData.delete();

    _listState = LoadState.ERROR;

    throw e;
  } finally {
    urlInputStream.close();
  }
}
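The anonymous RawKeyValueComparator above is the classic reason to keep a couple of DataInputBuffer instances around: it compares records on their serialized bytes without materializing key objects. The same idea, reduced to a stock org.apache.hadoop.io.RawComparator over VLongWritable keys (a deliberate simplification of the URLFP layout above), might look like this sketch:

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.io.WritableUtils;

public class VLongRawComparator implements RawComparator<VLongWritable> {

  private final DataInputBuffer buf1 = new DataInputBuffer();
  private final DataInputBuffer buf2 = new DataInputBuffer();

  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    try {
      // re-point the reusable buffers at each serialized key; no objects are created
      buf1.reset(b1, s1, l1);
      buf2.reset(b2, s2, l2);
      return Long.compare(WritableUtils.readVLong(buf1), WritableUtils.readVLong(buf2));
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public int compare(VLongWritable o1, VLongWritable o2) {
    return Long.compare(o1.get(), o2.get());
  }
}

Reusing the two DataInputBuffer fields across calls is what makes the raw path cheap: each comparison only rewinds the buffers instead of allocating streams.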
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException {

  // see if state is cached in memory ...
  boolean loadedFromMemory = false;

  synchronized (this) {
    if (_tempFixedDataBuffer != null) {

      loadedFromMemory = true;

      int low = 0;
      int high = (int) (_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

      OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();
      DataInputBuffer inputBuffer = new DataInputBuffer();

      int iterationNumber = 0;

      while (low <= high) {

        ++iterationNumber;

        int mid = low + ((high - low) / 2);

        inputBuffer.reset(_tempFixedDataBuffer, 0, _tempFixedDataBufferSize);
        inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);

        // deserialize
        itemOut.deserialize(inputBuffer);

        // now compare it against desired hash value ...
        int comparisonResult = itemOut.compareFingerprints(fingerprint);

        if (comparisonResult > 0)
          high = mid - 1;
        else if (comparisonResult < 0)
          low = mid + 1;
        else {
          // cache offset
          itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
          // LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
          // and return item
          return itemOut;
        }
      }
      //LOG.error("Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");
    }
  }

  if (!loadedFromMemory) {
    //load from disk

    //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + fingerprint.getUrlHash());

    RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw");
    // allocate buffer upfront
    byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE];
    DataInputBuffer inputStream = new DataInputBuffer();

    //LOG.info("Opened Data File. Searching for match");
    try {
      int low = 0;
      int high = (int) (file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1;

      OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem();

      int iterationNumber = 0;

      while (low <= high) {

        ++iterationNumber;

        int mid = low + ((high - low) / 2);

        // seek to proper location
        file.seek(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        // read the data structure
        file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length);

        // map location in file
        //MappedByteBuffer memoryBuffer = file.getChannel().map(MapMode.READ_ONLY,mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE,OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        //DataInputStream inputStream = new DataInputStream(new ByteBufferInputStream(memoryBuffer));
        inputStream.reset(onDiskItemBuffer, 0, OnDiskCrawlHistoryItem.ON_DISK_SIZE);

        // deserialize
        itemOut.deserialize(inputStream);

        // memoryBuffer = null;
        //inputStream = null;

        // now compare it against desired hash value ...
        int comparisonResult = itemOut.compareFingerprints(fingerprint);

        if (comparisonResult > 0)
          high = mid - 1;
        else if (comparisonResult < 0)
          low = mid + 1;
        else {
          // cache offset
          itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE;
          // LOG.info("Found Match. Took:"+ iterationNumber + " iterations");
          // and return item
          return itemOut;
        }
      }
      //LOG.error("******Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations");

      //DEBUG ONLY !
      // dumpFixedDataFile();
    } finally {
      file.close();
    }
  }

  return null;
}
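loadOnDiskItemForURLFP binary-searches fixed-width records by resetting a DataInputBuffer over the whole buffer and then skip()-ing to the record at the midpoint. A compact sketch of that technique over a hypothetical 8-byte record layout (one long per record):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class FixedRecordSearchSketch {
  static final int RECORD_SIZE = 8; // hypothetical layout: one long per record

  // binary search over fixed-width records serialized into a flat buffer
  static int findRecord(byte[] data, int length, long target) throws IOException {
    DataInputBuffer in = new DataInputBuffer();
    int low = 0, high = (length / RECORD_SIZE) - 1;
    while (low <= high) {
      int mid = low + ((high - low) / 2);
      in.reset(data, 0, length);       // rewind to the start of the buffer
      in.skip(mid * RECORD_SIZE);      // jump to the mid record
      long value = in.readLong();      // deserialize just that record
      if (value > target) high = mid - 1;
      else if (value < target) low = mid + 1;
      else return mid;
    }
    return -1;
  }

  public static void main(String[] args) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    for (long v = 10; v <= 100; v += 10) out.writeLong(v); // sorted records
    System.out.println(findRecord(out.getData(), out.getLength(), 40L)); // 3
  }
}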
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private final int getOffsetForSubDomainData(long domainHash) throws IOException {

  DataInputBuffer inputBuffer = new DataInputBuffer();

  int low = 0;
  int high = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1;

  while (low <= high) {

    int mid = low + ((high - low) / 2);

    inputBuffer.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength());
    inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);

    // deserialize
    long hash = inputBuffer.readLong();

    // now compare it against desired hash value ...
    int comparisonResult = ((Long) hash).compareTo(domainHash);

    if (comparisonResult > 0)
      high = mid - 1;
    else if (comparisonResult < 0)
      low = mid + 1;
    else {
      return inputBuffer.readInt();
    }
  }

  throw new IOException("NOT-FOUND!");
}
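Note the two-argument reset(data, length) used here: it also rewinds the read position to zero on every probe, so each skip() is measured from the start of the table, and once the hash matches the int payload is read from the position immediately after it. A tiny illustration of probing one packed (long hash, int offset) entry; the 12-byte entry size mirrors what the code above implies:

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class OffsetTableProbeSketch {
  static final int ENTRY_SIZE = 12; // 8-byte domain hash + 4-byte offset (assumed)

  public static void main(String[] args) throws IOException {
    // a packed table of two entries
    DataOutputBuffer table = new DataOutputBuffer();
    table.writeLong(100L); table.writeInt(0);
    table.writeLong(250L); table.writeInt(64);

    // probe entry #1: reset rewinds to position zero, skip jumps to the entry,
    // and the int payload is read right after the hash
    DataInputBuffer probe = new DataInputBuffer();
    probe.reset(table.getData(), table.getLength());
    probe.skip(1 * ENTRY_SIZE);
    System.out.println("hash=" + probe.readLong() + " offset=" + probe.readInt()); // hash=250 offset=64
  }
}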
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void resetSubDomainCounts() throws IOException {

  LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

  if (_subDomainMetadataFile.exists()) {

    LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    DataInputBuffer inputBuffer = new DataInputBuffer();
    DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

    try {
      // skip version
      file.read();
      // read item count
      int itemCount = file.readInt();

      LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

      CrawlListMetadata newMetadata = new CrawlListMetadata();

      for (int i = 0; i < itemCount; ++i) {

        long orignalPos = file.getFilePointer();
        file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
        inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
        try {
          newMetadata.deserialize(inputBuffer, new BinaryProtocol());
        } catch (Exception e) {
          LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
              + CCStringUtils.stringifyException(e));
        }

        // ok reset everything except hashes and first/last url pointers
        int urlCount = newMetadata.getUrlCount();
        long firstRecordOffset = newMetadata.getFirstRecordOffset();
        long lastRecordOffset = newMetadata.getLastRecordOffset();
        String domainName = newMetadata.getDomainName();
        long domainHash = newMetadata.getDomainHash();

        // reset
        newMetadata.clear();
        // restore
        newMetadata.setUrlCount(urlCount);
        newMetadata.setFirstRecordOffset(firstRecordOffset);
        newMetadata.setLastRecordOffset(lastRecordOffset);
        newMetadata.setDomainName(domainName);
        newMetadata.setDomainHash(domainHash);

        // serialize it ...
        outputBuffer.reset();
        newMetadata.serialize(outputBuffer, new BinaryProtocol());

        // write it back to disk
        file.seek(orignalPos);
        // and rewrite it ...
        file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
      }
    } finally {
      file.close();
    }

    LOG.info("*** LIST:" + getListId() + " DONE RESETTIGN SUBDOMAIN METADATA QUEUE COUNTS");
  }
}
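resetSubDomainCounts shows the read-modify-rewrite pattern: read a fixed-size block from a RandomAccessFile, deserialize it through a DataInputBuffer, re-serialize into a DataOutputBuffer, then seek back and overwrite the record in place. A runnable sketch of the same pattern with a hypothetical 8-byte counter record instead of CrawlListMetadata:

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class InPlaceRewriteSketch {
  static final int RECORD_SIZE = 8; // hypothetical fixed record: a single long counter

  public static void main(String[] args) throws IOException {
    File f = File.createTempFile("records", ".dat");
    RandomAccessFile file = new RandomAccessFile(f, "rw");
    try {
      file.writeLong(7L);
      file.writeLong(9L);
      file.seek(0);

      byte[] block = new byte[RECORD_SIZE];
      DataInputBuffer in = new DataInputBuffer();
      DataOutputBuffer out = new DataOutputBuffer(RECORD_SIZE);

      for (int i = 0; i < 2; i++) {
        long recordPos = file.getFilePointer();
        file.readFully(block, 0, RECORD_SIZE);
        in.reset(block, RECORD_SIZE);   // deserialize the record
        long oldValue = in.readLong();

        out.reset();                    // re-serialize a cleared record
        out.writeLong(0L);

        file.seek(recordPos);           // overwrite it in place
        file.write(out.getData(), 0, RECORD_SIZE);
        System.out.println("record " + i + " was " + oldValue + ", now 0");
      }
    } finally {
      file.close();
    }
  }
}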
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void loadSubDomainMetadataFromDisk() throws IOException {
  LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... ");
  if (_subDomainMetadataFile.exists()) {

    LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    DataInputBuffer inputBuffer = new DataInputBuffer();
    byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

    try {
      // skip version
      file.read();
      // read item count
      int itemCount = file.readInt();

      LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

      CrawlListMetadata newMetadata = new CrawlListMetadata();

      TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
      for (int i = 0; i < itemCount; ++i) {

        long orignalPos = file.getFilePointer();
        file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
        inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
        try {
          newMetadata.deserialize(inputBuffer, new BinaryProtocol());
        } catch (Exception e) {
          LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
              + CCStringUtils.stringifyException(e));
        }
        idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
      }

      // write lookup table
      _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);

      for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
        _offsetLookupTable.writeLong(entry.getKey());
        _offsetLookupTable.writeInt(entry.getValue());
      }
    } finally {
      file.close();
    }
    LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
  } else {

    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

    RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
    RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

    try {
      //ok rebuild top level metadata as well
      _metadata.clear();

      OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

      int processedCount = 0;
      while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

        long position = fixedDataReader.getFilePointer();

        // store offset in item
        item._fileOffset = position;

        // load from disk
        item.deserialize(fixedDataReader);

        try {
          // seek to string data
          stringDataReader.seek(item._stringsOffset);
          // and skip buffer length
          WritableUtils.readVInt(stringDataReader);
          // and read primary string
          String url = stringDataReader.readUTF();

          // get metadata object for subdomain
          CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

          // increment url count
          subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);

          // increment top level metadata count
          _metadata.setUrlCount(_metadata.getUrlCount() + 1);

          // update top level metadata ..
          updateMetadata(item, _metadata, 0);

          // update sub-domain metadata object from item data
          updateMetadata(item, subDomainMetadata, 0);

          ++processedCount;
        } catch (IOException e) {
          LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
          LOG.error("Exception:" + CCStringUtils.stringifyException(e));
          LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
              + stringDataReader.getFilePointer());
        }

        if (processedCount % 10000 == 0) {
          LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
        }
      }

      // ok commit top level metadata to disk as well
      writeMetadataToDisk();

    } catch (IOException e) {
      LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
          + CCStringUtils.stringifyException(e));
      LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
          + stringDataReader.getFilePointer());

      _queueState = QueueState.QUEUED;
    } finally {
      fixedDataReader.close();
      stringDataReader.close();
    }

    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");
    // write metdata to disk
    writeInitialSubDomainMetadataToDisk();
    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
  }
}
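The first branch above packs a sorted TreeMap of (domainHash, file offset) pairs into a DataOutputBuffer that later serves as the in-memory lookup table consumed by getOffsetForSubDomainData. The producer side of that table in isolation, with made-up hashes and offsets:

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.io.DataOutputBuffer;

public class OffsetTableBuildSketch {
  static final int ENTRY_SIZE = 12; // 8-byte hash + 4-byte offset (assumed)

  public static void main(String[] args) throws IOException {
    // hash -> record offset, kept sorted so the packed table can be binary searched later
    TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
    idToOffsetMap.put(900L, 128);
    idToOffsetMap.put(100L, 0);
    idToOffsetMap.put(250L, 64);

    DataOutputBuffer lookupTable = new DataOutputBuffer(idToOffsetMap.size() * ENTRY_SIZE);
    for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
      lookupTable.writeLong(entry.getKey());
      lookupTable.writeInt(entry.getValue());
    }
    System.out.println("table bytes=" + lookupTable.getLength()); // 36
  }
}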
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
  synchronized (_metadata) {

    ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();

    try {
      synchronized (_subDomainMetadataFile) {
        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {
          // skip version
          file.read();
          // read item count
          int itemCount = file.readInt();

          int i = offset;
          int end = Math.min(i + count, itemCount);

          LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

          if (i < itemCount) {

            file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            for (; i < end; ++i) {
              long orignalPos = file.getFilePointer();
              file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
              inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
              newMetadata.deserialize(inputBuffer, new BinaryProtocol());
              itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
            }
          }
        } finally {
          file.close();
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
    LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");

    return itemsOut;
  }
}
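getSubDomainList pages through the same fixed-size records by seeking to headerSize + recordSize * offset and then deserializing each block through a DataInputBuffer. A small sketch of that paging arithmetic with a hypothetical 5-byte header and 8-byte record:

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.hadoop.io.DataInputBuffer;

public class PagedRecordReadSketch {
  static final int HEADER_SIZE = 5;  // 1-byte version + 4-byte item count (assumed layout)
  static final int RECORD_SIZE = 8;  // one long per record (assumed layout)

  public static void main(String[] args) throws IOException {
    File f = File.createTempFile("paged", ".dat");
    RandomAccessFile file = new RandomAccessFile(f, "rw");
    try {
      // write header + 4 records
      file.write(1);
      file.writeInt(4);
      for (long v = 100; v < 104; v++) file.writeLong(v);

      // read a page of 2 records starting at record index 1
      int offset = 1, count = 2;
      file.seek(HEADER_SIZE + (long) RECORD_SIZE * offset);
      byte[] block = new byte[RECORD_SIZE];
      DataInputBuffer in = new DataInputBuffer();
      for (int i = 0; i < count; i++) {
        file.readFully(block, 0, block.length);
        in.reset(block, block.length);
        System.out.println(in.readLong()); // 101, then 102
      }
    } finally {
      file.close();
    }
  }
}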
From source file:org.commoncrawl.service.pagerank.slave.PageRankUtils.java
License:Open Source License
public static void distributeRank(final PRValueMap valueMap, final Path outlinksFile,
    final boolean outlinksIsRemote, File localOutputDir, String remoteOutputDir, int thisNodeIdx, int nodeCount,
    int iterationNumber, final ProgressAndCancelCheckCallback progressCallback) throws IOException {

  final Configuration conf = CrawlEnvironment.getHadoopConfig();

  Vector<PRValueOutputStream> outputStreamVector = new Vector<PRValueOutputStream>();

  // allocate a queue ...
  final LinkedBlockingQueue<OutlinkItem> queue = new LinkedBlockingQueue<OutlinkItem>(20000);

  try {
    // start the loader thread ...
    Thread loaderThread = new Thread(new Runnable() {

      final BytesWritable key = new BytesWritable();
      final BytesWritable value = new BytesWritable();

      final DataInputBuffer keyStream = new DataInputBuffer();
      final DataInputBuffer valueStream = new DataInputBuffer();

      @Override
      public void run() {

        LOG.info("Opening Outlinks File at:" + outlinksFile);
        SequenceFile.Reader reader = null;
        try {

          FileSystem fsForOutlinksFile = null;
          if (outlinksIsRemote) {
            fsForOutlinksFile = CrawlEnvironment.getDefaultFileSystem();
          } else {
            fsForOutlinksFile = FileSystem.getLocal(conf);
          }

          long bytesToReadTotal = fsForOutlinksFile.getLength(outlinksFile);

          reader = new SequenceFile.Reader(fsForOutlinksFile, outlinksFile, conf);
          OutlinkItem item = new OutlinkItem();
          int itemCount = 0;
          boolean isCancelled = false;
          while (!isCancelled && reader.next(key, value)) {

            keyStream.reset(key.get(), 0, key.getLength());
            valueStream.reset(value.get(), 0, value.getLength());

            //populate item from data
            readURLFPFromStream(keyStream, item.targetFingerprint);
            item.urlCount = readURLFPAndCountFromStream(valueStream, item.sourceFingerprint);

            try {
              long blockTimeStart = System.currentTimeMillis();
              queue.put(item);
              long blockTimeEnd = System.currentTimeMillis();
            } catch (InterruptedException e) {
            }
            item = new OutlinkItem();

            if (itemCount++ % 10000 == 0 && progressCallback != null) {

              float percentComplete = (float) reader.getPosition() / (float) bytesToReadTotal;
              if (progressCallback.updateProgress(percentComplete)) {
                LOG.info("Cancel check callback returned true.Cancelling outlink item load");
                isCancelled = true;
              }
            }
          }
          item.sourceFingerprint = null;
          item.targetFingerprint = null;

          // add empty item
          try {
            if (!isCancelled) {
              queue.put(item);
            } else {
              queue.put(new OutlinkItem(new IOException("Operation Cancelled")));
            }
          } catch (InterruptedException e) {
          }
        } catch (IOException e) {
          // add error item to queue.
          try {
            queue.put(new OutlinkItem(e));
          } catch (InterruptedException e1) {
          }
        } finally {
          if (reader != null)
            try {
              reader.close();
            } catch (IOException e) {
            }
        }
      }
    });

    loaderThread.start();

    // first things first ... initialize output stream vector
    FileSystem fileSystem = buildDistributionOutputStreamVector(true,
        getOutlinksBaseName(thisNodeIdx, iterationNumber), localOutputDir, remoteOutputDir, thisNodeIdx,
        nodeCount, outputStreamVector);

    try {
      // open outlinks file .
      LOG.info("Iterating Items in Outlinks File and Writing Test Value");

      int itemCount = 0;
      int totalOutlinkCount = 0;
      int iterationOutlinkCount = 0;
      long iterationStart = System.currentTimeMillis();
      long timeStart = iterationStart;

      boolean done = false;

      ArrayList<OutlinkItem> items = new ArrayList<OutlinkItem>();
      // start iterating outlinks
      while (!done) {

        //OutlinkItem item = null;

        //try {
        long waitTimeStart = System.currentTimeMillis();
        queue.drainTo(items);
        long waitTimeEnd = System.currentTimeMillis();
        //} catch (InterruptedException e) {
        //}

        for (OutlinkItem item : items) {
          if (item.error != null) {
            LOG.info("Loader Thread Returned Error:" + CCStringUtils.stringifyException(item.error));
            throw item.error;
          } else if (item.sourceFingerprint == null) {
            LOG.info("Loader Thread Indicated EOF via emtpy item");
            done = true;
          } else {
            ++itemCount;

            /*
            LOG.info("SourceFP-DomainHash:" + item.sourceFingerprint.getDomainHash() + " URLHash:" +
                item.sourceFingerprint.getUrlHash() + " PartitionIdx:" +
                ((item.sourceFingerprint.hashCode() & Integer.MAX_VALUE) % CrawlEnvironment.PR_NUMSLAVES) );
            */

            // now get pr value for fingerprint (random seek in memory here!!!)
            float prValue = valueMap.getPRValue(item.sourceFingerprint) / (float) Math.max(item.urlCount, 1);

            // write value out
            int nodeIndex = (item.targetFingerprint.hashCode() & Integer.MAX_VALUE) % nodeCount;
            outputStreamVector.get(nodeIndex).writePRValue(item.targetFingerprint, item.sourceFingerprint,
                prValue);

            if (itemCount % 10000 == 0) {

              long timeEnd = System.currentTimeMillis();
              int milliseconds = (int) (timeEnd - iterationStart);

              LOG.info("Distribute PR for 10000 Items with:" + iterationOutlinkCount + " Outlinks Took:"
                  + milliseconds + " Milliseconds" + " QueueCount:" + queue.size());

              iterationStart = System.currentTimeMillis();
              totalOutlinkCount += iterationOutlinkCount;
              iterationOutlinkCount = 0;
            }
          }
        }
        items.clear();
      }

      totalOutlinkCount += iterationOutlinkCount;

      LOG.info("Distribute Finished for a total of:" + itemCount + " Items with:" + totalOutlinkCount
          + " Outlinks Took:" + (System.currentTimeMillis() - timeStart) + " Milliseconds");

      LOG.info("Waiting for Loader Thread to Die");
      try {
        loaderThread.join();
      } catch (InterruptedException e) {
      }
      LOG.info("Loader Thread Died - Moving on...");
    } finally {
      for (PRValueOutputStream info : outputStreamVector) {
        if (info != null) {
          info.close(false);
        }
      }

      if (fileSystem != null) {
        fileSystem.close();
      }
    }
  } catch (IOException e) {
    LOG.error("Exception caught while distributing outlinks:" + CCStringUtils.stringifyException(e));
    throw e;
  }
}
From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java
License:Open Source License
private static void spillLinkDataIntoTempFileIndex(FileSystem remoteFileSystem, FileSystem localFileSystem,
    Configuration conf, DatabaseIndexV2.MasterDatabaseIndex index, File tempFilePath, Path outputFilePath,
    FlexBuffer linkData) throws IOException {

  SequenceFileSpillWriter<TextBytes, TriTextBytesTuple> outputWriter = new SequenceFileSpillWriter<TextBytes, TriTextBytesTuple>(
      localFileSystem, conf, outputFilePath, TextBytes.class, TriTextBytesTuple.class,
      new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
          PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFilePath)),
      true);

  try {
    // ok create merge sort spill writer ...
    MergeSortSpillWriter<TextBytes, TriTextBytesTuple> merger = new MergeSortSpillWriter<TextBytes, TriTextBytesTuple>(
        conf, outputWriter, localFileSystem, new Path(tempFilePath.getAbsolutePath()), null,
        new RawKeyValueComparator<TextBytes, TriTextBytesTuple>() {

          DataInputBuffer stream1 = new DataInputBuffer();
          DataInputBuffer stream2 = new DataInputBuffer();

          TriTextBytesTuple tuple1 = new TriTextBytesTuple();
          TriTextBytesTuple tuple2 = new TriTextBytesTuple();

          @Override
          public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
              int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
              byte[] value2Data, int value2Offset, int value2Length) throws IOException {

            stream1.reset(value1Data, value1Offset, value1Length);
            stream2.reset(value2Data, value2Offset, value2Length);

            // ok skip url
            int url1Length = WritableUtils.readVInt(stream1);
            stream1.skip(url1Length);
            int url2Length = WritableUtils.readVInt(stream2);
            stream2.skip(url2Length);

            // ok now read optimized page rank stuffed in second tuple
            WritableUtils.readVInt(stream1);
            WritableUtils.readVInt(stream2);

            // now read page rank
            float pageRank1 = stream1.readFloat();
            float pageRank2 = stream2.readFloat();

            return (pageRank1 == pageRank2) ? 0 : (pageRank1 < pageRank2) ? -1 : 1;
          }

          @Override
          public int compare(TextBytes key1, TriTextBytesTuple value1, TextBytes key2,
              TriTextBytesTuple value2) {
            stream1.reset(value1.getSecondValue().getBytes(), value1.getSecondValue().getLength());
            stream2.reset(value2.getSecondValue().getBytes(), value2.getSecondValue().getLength());

            try {
              float pr1 = stream1.readFloat();
              float pr2 = stream2.readFloat();

              return (pr1 == pr2) ? 0 : pr1 < pr2 ? -1 : 1;
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              throw new RuntimeException();
            }
          }
        }, TextBytes.class, TriTextBytesTuple.class, false, null);

    try {
      long timeStart = System.currentTimeMillis();
      System.out.println(".Running Merger against to resolve tuple set ");
      index.bulkQueryURLAndMetadataGivenInputStream(remoteFileSystem, conf, tempFilePath, linkData, merger);
      long timeEnd = System.currentTimeMillis();

      LOG.info(".Merged Successfully in:" + (timeEnd - timeStart));

    } finally {
      LOG.info("Closing Merger");
      merger.close();
    }
  } finally {
    LOG.info("Closing Writer");
    outputWriter.close();
  }
}
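Both comparators above lean on DataInputBuffer plus WritableUtils to skip a length-prefixed field and read only the trailing value they care about. A minimal sketch of that skip-then-read pattern over a hypothetical (Text url, float score) layout:

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;

public class SkipFieldSketch {
  public static void main(String[] args) throws IOException {
    // hypothetical value layout: a Text (VInt length + bytes) followed by a float score
    DataOutputBuffer out = new DataOutputBuffer();
    new Text("http://example.com/page").write(out); // Text writes a VInt length prefix
    out.writeFloat(0.85f);

    // skip the variable-length field and read only the score
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());
    int urlLength = WritableUtils.readVInt(in);
    in.skip(urlLength);
    System.out.println("score=" + in.readFloat()); // 0.85
  }
}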