Example usage for org.apache.hadoop.io DataOutputBuffer getLength

List of usage examples for org.apache.hadoop.io DataOutputBuffer getLength

Introduction

On this page you can find example usage for org.apache.hadoop.io DataOutputBuffer getLength.

Prototype

public int getLength() 

Document

Returns the length of the valid data currently in the buffer.
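
Before working through the full examples below, the following minimal sketch (class name and written values are illustrative, not drawn from the examples) shows the typical pairing of getData() with getLength(): the backing array returned by getData() may be larger than the valid data, so getLength() tells you how many bytes are safe to read, copy, or write out.

import java.util.Arrays;

import org.apache.hadoop.io.DataOutputBuffer;

public class GetLengthSketch {
    public static void main(String[] args) throws Exception {
        DataOutputBuffer buffer = new DataOutputBuffer();
        // write some data into the buffer
        buffer.writeUTF("http://example.com/");
        buffer.writeInt(42);
        // getLength() reports the number of valid bytes currently in the buffer;
        // getData() may return a larger backing array, so bound any read by getLength()
        byte[] validBytes = Arrays.copyOf(buffer.getData(), buffer.getLength());
        System.out.println("valid bytes in buffer: " + buffer.getLength());
        System.out.println("copied bytes: " + validBytes.length);
        // reset() discards the contents and sets the length back to zero
        buffer.reset();
        System.out.println("after reset: " + buffer.getLength());
    }
}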

Usage

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

/**
 * Initialize a new CrawlList object from a given source file of urls 
 * @param manager         - reference to the crawl history log manager 
 * @param listId          - the id to assign to this list 
 * @param sourceURLFile   - the file containing the list of urls that we should add to this list ... 
 * @param refreshInterval - the refresh interval to record in the list metadata 
 * @throws IOException      
 */
public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
        throws IOException {

    _manager = manager;

    _listState = LoadState.REALLY_LOADING;

    // initialize a new list id 
    _listId = listId;

    LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());

    //establish file names 
    initializeListFileNames();

    sourceURLFile.renameTo(_listURLDataFile);

    FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);

    try {

        // set we will use to hold all fingerprints generated 
        TreeSet<URLFP> urlSet = new TreeSet<URLFP>();

        // create temp files ...
        File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));

        // create mergesortspillwriter 
        SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
                new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null,
                false);

        try {

            MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
                    CrawlEnvironment.getHadoopConfig(), spillwriter,
                    FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                    new Path(manager.getLocalDataDir().getAbsolutePath()), null,
                    new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

                        DataInputBuffer _key1Buffer = new DataInputBuffer();
                        DataInputBuffer _key2Buffer = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {

                            _key1Buffer.reset(key1Data, key1Offset, key1Length);
                            _key2Buffer.reset(key2Data, key2Offset, key2Length);

                            _key1Buffer.skip(2); // skip version, and 1 byte id 
                            _key2Buffer.skip(2); // skip version, and 1 byte id 

                            int domainHash1 = WritableUtils.readVInt(_key1Buffer);
                            int domainHash2 = WritableUtils.readVInt(_key2Buffer);

                            _key1Buffer.skip(1); // skip 1 byte id 
                            _key2Buffer.skip(1); // skip 1 byte id 

                            long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
                            long fingerprint2 = WritableUtils.readVLong(_key2Buffer);

                            int result = ((Integer) domainHash1).compareTo(domainHash2);

                            if (result == 0) {
                                result = ((Long) fingerprint1).compareTo(fingerprint2);
                            }

                            return result;
                        }

                        @Override
                        public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                                ProxyCrawlHistoryItem value2) {
                            return key1.compareTo(key2);
                        }
                    }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

            try {

                LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));

                String line = null;
                int lineNumber = 0;
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
                while ((line = reader.readLine()) != null) {
                    ++lineNumber;
                    if (line.length() != 0 && !line.startsWith("#")) {
                        URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);

                        if (fingerprint != null) {

                            if (!urlSet.contains(fingerprint)) {
                                // and add fingerprint to set 
                                urlSet.add(fingerprint);
                                // initialize item 
                                item.clear();
                                item.setOriginalURL(line);
                                // and spill to merger / sorter .. 
                                merger.spillRecord(fingerprint, item);
                            }
                        } else {
                            LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:"
                                    + lineNumber + " URL" + line);
                        }
                    }
                }
                LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
            } finally {
                merger.close();
            }
        } finally {
            if (spillwriter != null)
                spillwriter.close();
        }
        LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
        // generate bloom filter ...  
        _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);

        for (URLFP fingerprint : urlSet) {
            _bloomFilter.add(fingerprint);
        }
        LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
        // serialize it
        FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
        try {
            _bloomFilter.serialize(bloomFilterStream);
        } finally {
            bloomFilterStream.flush();
            bloomFilterStream.close();
        }

        LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
        // now initialize value map and string maps based on output sequence file ... 
        SequenceFile.Reader reader = new SequenceFile.Reader(
                FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
                new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

        LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
                + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        // OK, Allocate room for fixed data file upfront 
        DataOutputBuffer valueStream = new DataOutputBuffer(
                urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
        LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");

        try {

            //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
            RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");

            try {
                URLFP urlFP = new URLFP();
                ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();

                // read fingerprints ... 
                while (reader.next(urlFP, item)) {
                    // write out fixed data structure and strings 
                    writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
                }
            } finally {
                //valueStream.flush();
                //valueStream.close();
                stringsStream.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");

        LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
                + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
            throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength()
                    + " != URLSetSize:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
        }
        // initialize temp data buffer variables 
        _tempFixedDataBuffer = valueStream.getData();
        _tempFixedDataBufferSize = valueStream.getLength();

        // update metadata 
        _metadata.setRefreshInterval(refreshInterval);
        _metadata.setUrlCount(urlSet.size());

        // setup version 
        _metadata.setVersion(1);

        // and write to disk 
        writeMetadataToDisk();

        // mark state as loaded ... 
        _listState = LoadState.LOADED;

        LOG.info("*** LIST:" + getListId() + " SYNCING");
        // reconcile with history log
        _manager.syncList(this.getListId(), urlSet, this);
        LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");

        // write metadata to disk again 
        writeMetadataToDisk();

        LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");

        // and finally flush fixed data to disk 
        FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);

        try {
            synchronized (this) {
                int blockSize = 1 << 20;
                long bytesCopied = 0;
                for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
                    int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
                    finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
                    bytesCopied += bytesToCopy;
                }
                // validate bytes copied 
                if (bytesCopied != _tempFixedDataBufferSize) {
                    throw new IOException("Buffer Size:" + _tempFixedDataBufferSize
                            + " Does not Match BytesCopied:" + bytesCopied);
                }

                // ok release the buffer 
                _tempFixedDataBuffer = null;
                _tempFixedDataBufferSize = 0;

                LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
            }

        } finally {
            finalDataStream.flush();
            finalDataStream.close();
        }

        // load sub domain metadata from disk ... 
        loadSubDomainMetadataFromDisk();

    } catch (IOException e) {
        LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
                + CCStringUtils.stringifyException(e));

        _fixedDataFile.delete();
        _variableDataFile.delete();
        _bloomFilterData.delete();

        _listState = LoadState.ERROR;

        throw e;
    } finally {
        urlInputStream.close();
    }

}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

private int calculateStringCRC(ProxyCrawlHistoryItem item, DataOutputBuffer stringBuffer) throws IOException {
    stringBuffer.reset();
    stringBuffer.writeUTF(item.getOriginalURL());
    if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
        stringBuffer.writeUTF(item.getRedirectURL());
    }
    _stringCRC.reset();
    _stringCRC.update(stringBuffer.getData(), 0, stringBuffer.getLength());

    return (int) _stringCRC.getValue();
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

/**
 * serialize metadata to disk 
 * @throws IOException
 */
void writeSubDomainMetadataToDisk(CrawlListMetadata subDomainData) throws IOException {

    DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

    subDomainData.serialize(outputBuffer, new BinaryProtocol());

    if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
        LOG.error("ListMetadata Serialize for List:" + subDomainData.getDomainName() + " > FixedDataSize!!!");
        outputBuffer.reset();
        subDomainData.setDomainName("<<CORRUPT>>");
        subDomainData.serialize(outputBuffer, new BinaryProtocol());
    }

    synchronized (_subDomainMetadataFile) {
        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        try {
            if (subDomainData.getSubDomainDataOffset() == 0) {
                throw new IOException("Data Offset Zero during write!");
            }
            file.seek(subDomainData.getSubDomainDataOffset());
            file.write(outputBuffer.getData(), 0, outputBuffer.getLength());
        } finally {
            file.close();
        }
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlList.java

License:Open Source License

void writeInitialSubDomainMetadataToDisk() throws IOException {

    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");

    try {

        file.writeByte(0); // version
        file.writeInt(_transientSubDomainStats.size());

        ArrayList<CrawlListMetadata> sortedMetadata = new ArrayList<CrawlListMetadata>();
        sortedMetadata.addAll(_transientSubDomainStats.values());
        _transientSubDomainStats = null;
        CrawlListMetadata metadataArray[] = sortedMetadata.toArray(new CrawlListMetadata[0]);
        Arrays.sort(metadataArray, new Comparator<CrawlListMetadata>() {

            @Override
            public int compare(CrawlListMetadata o1, CrawlListMetadata o2) {
                int result = ((Integer) o2.getUrlCount()).compareTo(o1.getUrlCount());
                if (result == 0) {
                    result = o1.getDomainName().compareTo(o2.getDomainName());
                }
                return result;
            }
        });

        DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

        TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();

        for (CrawlListMetadata entry : metadataArray) {
            // reset output buffer 
            outputBuffer.reset();
            // write item to disk 
            entry.serialize(outputBuffer, new BinaryProtocol());

            if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
                LOG.fatal("Metadata Serialization for List:" + getListId() + " SubDomain:"
                        + entry.getDomainName());
                System.out.println("Metadata Serialization for List:" + getListId() + " SubDomain:"
                        + entry.getDomainName());
            }
            // save offset 
            idToOffsetMap.put(entry.getDomainHash(), (int) file.getFilePointer());
            // write out fixed data size 
            file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
        }

        // write lookup table 
        _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);

        for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
            _offsetLookupTable.writeLong(entry.getKey());
            _offsetLookupTable.writeInt(entry.getValue());
        }
    } finally {
        file.close();
    }
    _transientSubDomainStats = null;
}

From source file:org.commoncrawl.service.parser.client.Dispatcher.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CrawlEnvironment.setHadoopConfig(conf);
    String baseURL = "http://unknown.com/";
    if (args.length != 0) {
        baseURL = args[0];
    }
    URL baseURLObj;
    try {
        baseURLObj = new URL(baseURL);
    } catch (MalformedURLException e2) {
        throw new IOException("Invalid Base Link");
    }
    final URL finalBaseURL = (baseURLObj != null) ? baseURLObj : null;
    final DataOutputBuffer headerBuffer = new DataOutputBuffer();
    final DataOutputBuffer contentBuffer = new DataOutputBuffer();

    try {
        ByteStreams.readBytes(new InputSupplier<InputStream>() {

            @Override
            public InputStream getInput() throws IOException {
                return System.in;
            }
        }, new ByteProcessor<Long>() {

            @Override
            public Long getResult() {
                return 0L;
            }

            int currLineCharCount = 0;
            boolean processingHeaders = true;

            @Override
            public boolean processBytes(byte[] buf, int start, int length) throws IOException {

                if (processingHeaders) {
                    int current = start;
                    int end = current + length;
                    while (processingHeaders && current != end) {
                        if (buf[current] != '\r' && buf[current] != '\n') {
                            currLineCharCount++;
                        } else if (buf[current] == '\n') {
                            if (currLineCharCount == 0) {
                                headerBuffer.write(buf, start, current - start + 1);
                                processingHeaders = false;
                            }
                            currLineCharCount = 0;
                        }
                        current++;
                    }
                    if (processingHeaders) {
                        headerBuffer.write(buf, start, length);
                    } else {
                        length -= current - start;
                        start = current;
                    }
                }
                if (!processingHeaders) {
                    contentBuffer.write(buf, start, length);
                }
                return true;
            }
        });

        LOG.info("HEADER LEN:" + headerBuffer.getLength());
        // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
        LOG.info("CONTENT LEN:" + contentBuffer.getLength());
        //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
        // decode header bytes ... 
        String header = "";
        if (headerBuffer.getLength() != 0) {
            try {
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("UTF-8"));
            } catch (Exception e) {
                LOG.warn(CCStringUtils.stringifyException(e));
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("ASCII"));
            }
        }
        final String headersFinal = (header != null) ? header : "";

        LOG.info("Starting Event Loop");
        final EventLoop eventLoop = new EventLoop();
        eventLoop.start();

        try {
            // create fake hosts file ...  
            //String hosts = "10.0.20.101:8072";
            // reader 
            //Reader reader = new StringReader(hosts);
            // dispatcher init 
            LOG.info("initializing Dispatcher");
            final Dispatcher dispatcher = new Dispatcher(eventLoop, "parserNodes");
            LOG.info("Waiting for a few seconds");
            Thread.sleep(5000);
            Thread threads[] = new Thread[TEST_THREAD_COUNT];
            final Semaphore threadWaitSem = new Semaphore(-TEST_THREAD_COUNT - 1);
            // start 100 threads 
            for (int threadIdx = 0; threadIdx < TEST_THREAD_COUNT; ++threadIdx) {
                threads[threadIdx] = new Thread(new Runnable() {

                    @Override
                    public void run() {
                        for (int i = 0; i < ITERATIONS_PER_THREAD; ++i) {
                            // build parse request 
                            ParseRequest request = new ParseRequest();
                            request.setDocId(1);
                            request.setDomainId(1);
                            request.setDocURL(finalBaseURL.toString());
                            request.setDocHeaders(headersFinal);
                            request.setDocContent(
                                    new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
                            //LOG.info("Dispatching parse request");
                            ParseResult result = dispatcher.dispatchRequest(request);
                            LOG.info("TID[" + Thread.currentThread().getId() + "]ReqID[" + i + "]" + " Success:"
                                    + ((result != null) ? result.getParseSuccessful() : false) + " LinkCount:"
                                    + ((result != null) ? result.getExtractedLinks().size() : 0));
                        }
                        LOG.info("Thread:" + Thread.currentThread().getId() + " Exiting");
                        threadWaitSem.release();
                    }

                });
                threads[threadIdx].start();
            }

            LOG.info("Waiting for threads to die");
            threadWaitSem.acquireUninterruptibly();
            LOG.info("All Threads dead.");

        } finally {
            eventLoop.stop();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    } catch (InterruptedException e) {
    }
}

From source file:org.commoncrawl.service.parser.server.ParseWorker.java

License:Open Source License

public static void main(String[] args) throws IOException {
    String baseURL = "http://unknown.com/";
    if (args.length != 0) {
        baseURL = args[0];
    }
    URL baseURLObj;
    try {
        baseURLObj = new URL(baseURL);
    } catch (MalformedURLException e2) {
        throw new IOException("Invalid Base Link");
    }
    final DataOutputBuffer headerBuffer = new DataOutputBuffer();
    final DataOutputBuffer contentBuffer = new DataOutputBuffer();

    try {
        ByteStreams.readBytes(new InputSupplier<InputStream>() {

            @Override
            public InputStream getInput() throws IOException {
                return System.in;
            }
        }, new ByteProcessor<Long>() {

            @Override
            public Long getResult() {
                return 0L;
            }

            int currLineCharCount = 0;
            boolean processingHeaders = true;

            @Override
            public boolean processBytes(byte[] buf, int start, int length) throws IOException {

                if (processingHeaders) {
                    int current = start;
                    int end = current + length;
                    while (processingHeaders && current != end) {
                        if (buf[current] != '\r' && buf[current] != '\n') {
                            currLineCharCount++;
                        } else if (buf[current] == '\n') {
                            if (currLineCharCount == 0) {
                                headerBuffer.write(buf, start, current - start + 1);
                                processingHeaders = false;
                            }
                            currLineCharCount = 0;
                        }
                        current++;
                    }
                    if (processingHeaders) {
                        headerBuffer.write(buf, start, length);
                    } else {
                        length -= current - start;
                        start = current;
                    }
                }
                if (!processingHeaders) {
                    contentBuffer.write(buf, start, length);
                }
                return true;
            }
        });

        //LOG.info("HEADER LEN:" + headerBuffer.getLength());
        // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
        //LOG.info("CONTENT LEN:" + contentBuffer.getLength());
        //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
        // decode header bytes ... 
        String header = "";
        if (headerBuffer.getLength() != 0) {
            try {
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("UTF-8"));
            } catch (Exception e) {
                LOG.warn(CCStringUtils.stringifyException(e));
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("ASCII"));
            }
        }

        //LOG.info("Parsing Document");
        ParseWorker worker = new ParseWorker();
        ParseResult result = new ParseResult();
        worker.parseDocument(result, 0L, 0L, baseURLObj, header,
                new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
        //LOG.info("Parse Result:" + result.getParseSuccessful()); 
        //LOG.info("Parse Data:" + result.toString());  

    } catch (IOException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
    }
    /*
    List<String> lines;
    try {
      lines = IOUtils.readLines(System.in, "UTF-8");
      for (String line : lines){ 
        System.out.println(line);
      }
            
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }*/
}

From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java

License:Open Source License

static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
        long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

    File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
    tempFile.mkdir();

    try {
        // create the final output spill writer ...  
        SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
                outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
                true);

        try {

            MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
                    conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
                    new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);

            try {

                for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
                    // 0. shard domain id to find index file location ... 
                    int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE)
                            % CrawlEnvironment.NUM_DB_SHARDS);
                    // build path to index file 
                    Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                            + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));
                    LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
                            + indexFilePath);
                    // 1. scan domainFP to index file first
                    // 2. given index, scan index->pos file to find scan start position
                    // 3. given scan start position, scan forward until fp match is found.
                    // 4. collect all matching entries and output to a file ? 

                    FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
                    try {
                        TFile.Reader reader = new TFile.Reader(indexDataInputStream,
                                fs.getLength(indexFilePath), conf);
                        try {
                            TFile.Reader.Scanner scanner = reader.createScanner();

                            try {
                                // generate key ... 
                                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                                keyBuffer.writeLong(targetRootDomainFP);
                                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                                    // setup for value scan 
                                    DataInputStream valueStream = scanner.entry().getValueStream();
                                    int dataOffsetOut = -1;
                                    while (valueStream.available() > 0) {
                                        // read entries looking for our specific entry
                                        int shardIdx = valueStream.readInt();
                                        int dataOffset = valueStream.readInt();
                                        if (shardIdx == targetShardId) {
                                            dataOffsetOut = dataOffset;
                                            break;
                                        }
                                    }
                                    LOG.info("Index Search Yielded:" + dataOffsetOut);
                                    if (dataOffsetOut != -1) {
                                        // ok create a data path 
                                        Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId));
                                        Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId) + ".index");
                                        // check to see if index is already loaded ... 
                                        PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                                        synchronized (_shardToIndexMap) {
                                            index = _shardToIndexMap.get(targetShardId);
                                        }
                                        if (index == null) {
                                            LOG.info("Loading Index from Path:" + finalDataIndexPath);
                                            // load index
                                            index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(
                                                    fs, finalDataIndexPath, FlexBuffer.class, TextBytes.class);
                                            // put in cache
                                            synchronized (_shardToIndexMap) {
                                                _shardToIndexMap.put(targetShardId, index);
                                            }
                                        }

                                        LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                                        // ok time to create a reader 
                                        SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,
                                                finalDataPath, conf);

                                        try {
                                            LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                                            index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);

                                            FlexBuffer keyBytes = new FlexBuffer();
                                            URLFPV2 sourceFP = new URLFPV2();
                                            DataInputBuffer keyReader = new DataInputBuffer();
                                            TextBytes urlTxt = new TextBytes();

                                            // ok ready to go ... 
                                            while (dataReader.next(keyBytes, sourceFP)) {
                                                // initialize reader 
                                                keyReader.reset(keyBytes.get(), keyBytes.getOffset(),
                                                        keyBytes.getCount());

                                                long targetFP = keyReader.readLong();

                                                if (targetRootDomainFP == targetFP) {
                                                    finalMerger.spillRecord(keyBytes, sourceFP);
                                                } else {
                                                    LOG.info("FP:" + targetFP + " > TargetFP:"
                                                            + targetRootDomainFP + " Exiting Iteration Loop");
                                                    break;
                                                }
                                            }
                                        } finally {
                                            LOG.info("Closing Reader");
                                            dataReader.close();
                                        }
                                    }
                                }
                            } finally {
                                LOG.info("Closing Scanner");
                                scanner.close();
                            }

                        } finally {
                            LOG.info("Closing TFile Reader");
                            reader.close();
                        }
                    } finally {
                        LOG.info("Closing InputStream");
                        indexDataInputStream.close();
                    }
                }
            } finally {
                finalMerger.close();
            }
        } finally {
            spillwriter.close();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        FileUtils.recursivelyDeleteFile(tempFile);
    }

}

From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java

License:Open Source License

private long runInlinksLocalQuery(DatabaseIndexV2.MasterDatabaseIndex index, FileSystem inputFileSystem,
        Path inlinksInputPath, FileSystem outputFileSystem, Path inlinksDomainIndexPath,
        Path inlinksDetailOutputPath) throws IOException {

    long recordCount = 0L;

    outputFileSystem.delete(inlinksDomainIndexPath);
    outputFileSystem.delete(inlinksDetailOutputPath);

    FSDataInputStream remoteInputStream = inputFileSystem.open(inlinksInputPath);

    try {

        FSDataOutputStream indexOutputStream = outputFileSystem.create(inlinksDomainIndexPath);
        FSDataOutputStream detailOutputStream = outputFileSystem.create(inlinksDetailOutputPath);

        ArrayList<InlinkingDomainInfo> domainList = new ArrayList<InlinkingDomainInfo>();

        try {

            LOG.info("Writing Detail Stream to:" + inlinksDetailOutputPath);
            CompressedURLFPListV2.Reader reader = new CompressedURLFPListV2.Reader(remoteInputStream);

            InlinkingDomainInfo lastDomain = null;

            while (reader.hasNext()) {

                // read the next fingerprint 
                URLFPV2 fingerprint = reader.next();
                // and first see if we have a domain transition 
                if (lastDomain == null || lastDomain.getDomainId() != fingerprint.getDomainHash()) {
                    // remember the domain 
                    lastDomain = new InlinkingDomainInfo();
                    lastDomain.setDomainId(fingerprint.getDomainHash());
                    // add it to the list 
                    domainList.add(lastDomain);
                    // update data position 
                    lastDomain.setUrlDataPos(detailOutputStream.getPos());
                }
                // increment url count for the domain
                lastDomain.setUrlCount(lastDomain.getUrlCount() + 1);

                detailOutputStream.writeLong(fingerprint.getDomainHash());
                detailOutputStream.writeLong(fingerprint.getUrlHash());

                recordCount++;
            }

            LOG.info("Retrieving Domain Metadata for :" + domainList.size() + " Domain Records");
            // ok, now resolve domain names
            for (InlinkingDomainInfo domain : domainList) {
                SubDomainMetadata metadata = index.queryDomainMetadataGivenDomainId(domain.getDomainId());
                if (metadata == null) {
                    LOG.error("*** Failed to Resolve DomainId:" + domain.getDomainId());
                } else {
                    if (metadata.getDomainText().length() == 0) {
                        LOG.error("*** Metadata for Domain Id:" + domain.getDomainId()
                                + " contained NULL Name Value.");
                        domain.setDomainName("_ERROR:BAD RECORD");
                    } else {
                        domain.setDomainName(metadata.getDomainText());
                    }
                    //LOG.info("***Found Domain:" + domain.getDomainName() + " urlCount:" + domain.getUrlCount());
                }
            }

            LOG.info("Sorting Domain List of Size:" + domainList.size());
            // ok sort by domain name 
            Collections.sort(domainList);

            LOG.info("Building In Memory Index");

            // ok write out domain info
            DataOutputBuffer indexHeaderBuffer = new DataOutputBuffer();
            DataOutputBuffer indexDataBuffer = new DataOutputBuffer();

            LOG.info("***Writing Domain List Size:" + domainList.size());
            indexHeaderBuffer.writeInt(domainList.size());

            // ok iterate and write to both buffers  
            for (InlinkingDomainInfo domain : domainList) {
                indexHeaderBuffer.writeInt(indexDataBuffer.getLength());
                domain.write(indexDataBuffer);
            }

            LOG.info("Writing Index to:" + inlinksDomainIndexPath + " IndexHeaderLength:"
                    + indexHeaderBuffer.getLength() + " IndexDataLength:" + indexDataBuffer.getLength());
            // ok now flush both buffers to disk
            indexOutputStream.write(indexHeaderBuffer.getData(), 0, indexHeaderBuffer.getLength());
            indexOutputStream.write(indexDataBuffer.getData(), 0, indexDataBuffer.getLength());
        } finally {
            indexOutputStream.flush();
            indexOutputStream.close();
            detailOutputStream.flush();
            detailOutputStream.close();
        }
    } finally {
        remoteInputStream.close();
    }
    return recordCount;
}

From source file:org.commoncrawl.util.CrawlLogSplitter.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file:org.commoncrawl.util.GZIPUtils.java

License:Apache License

/**
 * Returns a gunzipped copy of the input array, truncated to
 * <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been
 * truncated or corrupted, a best-effort attempt is made to unzip as much as
 * possible. If no data can be extracted <code>null</code> is returned.
 */
public static final UnzipResult unzipBestEffort(byte[] in, int offset, int sizeIn, int sizeLimit) {

    try {
        // decompress using GZIPInputStream
        DataOutputBuffer outStream = new DataOutputBuffer(EXPECTED_COMPRESSION_RATIO * in.length);

        boolean truncated = false;

        GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(in, offset, sizeIn));

        byte[] buf = new byte[BUF_SIZE];
        int written = 0;
        while (true) {
            try {
                int size = inStream.read(buf);
                if (size <= 0)
                    break;
                if ((written + size) > sizeLimit) {
                    outStream.write(buf, 0, sizeLimit - written);
                    truncated = true;
                    break;
                }
                outStream.write(buf, 0, size);
                written += size;
            } catch (Exception e) {
                break;
            }
        }
        try {
            outStream.close();
        } catch (IOException e) {
        }

        return new UnzipResult(outStream.getData(), 0, outStream.getLength(), truncated);

    } catch (IOException e) {
        return null;
    } catch (OutOfMemoryError e) {
        LOG.fatal(CCStringUtils.stringifyException(e));
        return null;
    }
}