List of usage examples for org.apache.hadoop.io DataOutputBuffer DataOutputBuffer
public DataOutputBuffer()
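DataOutputBuffer is a reusable, in-memory DataOutput: the backing byte array is exposed through getData() and getLength(), and reset() rewinds the write position so the buffer can be reused without reallocation. Before the real-world examples below, here is a minimal illustrative sketch of the typical write/read round trip (the class and variable names are ours, not taken from any of the source files listed):

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class DataOutputBufferExample {
  public static void main(String[] args) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    // write some primitives into the in-memory buffer
    out.writeInt(42);
    out.writeUTF("hello");

    // getData() returns the backing array; only the first getLength() bytes are valid
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());
    System.out.println(in.readInt()); // 42
    System.out.println(in.readUTF()); // hello

    // reset() rewinds the write position so the same buffer can be reused
    out.reset();
  }
}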
From source file:org.apache.tez.runtime.library.common.TestValuesIterator.java
License:Apache License
/**
 * create inmemory segments
 *
 * @return
 * @throws IOException
 */
public List<TezMerger.Segment> createInMemStreams() throws IOException {
  int numberOfStreams = Math.max(2, rnd.nextInt(10));
  LOG.info("No of streams : " + numberOfStreams);

  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(keyClass);
  Serializer valueSerializer = serializationFactory.getSerializer(valClass);

  LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
  InputContext context = createTezInputContext();
  MergeManager mergeManager = new MergeManager(conf, fs, localDirAllocator, context, null, null, null, null,
      null, 1024 * 1024 * 10, null, false, -1);

  DataOutputBuffer keyBuf = new DataOutputBuffer();
  DataOutputBuffer valBuf = new DataOutputBuffer();
  DataInputBuffer keyIn = new DataInputBuffer();
  DataInputBuffer valIn = new DataInputBuffer();

  keySerializer.open(keyBuf);
  valueSerializer.open(valBuf);

  List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
  for (int i = 0; i < numberOfStreams; i++) {
    BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
    InMemoryWriter writer = new InMemoryWriter(bout);
    Map<Writable, Writable> data = createData();
    // write data
    for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
      keySerializer.serialize(entry.getKey());
      valueSerializer.serialize(entry.getValue());
      keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
      valIn.reset(valBuf.getData(), 0, valBuf.getLength());
      writer.append(keyIn, valIn);
      originalData.put(entry.getKey(), entry.getValue());
      keyBuf.reset();
      valBuf.reset();
      keyIn.reset();
      valIn.reset();
    }
    IFile.Reader reader = new InMemoryReader(mergeManager, null, bout.getBuffer(), 0, bout.getBuffer().length);
    segments.add(new TezMerger.Segment(reader, true));
    data.clear();
    writer.close();
  }
  return segments;
}
From source file:org.apache.tez.runtime.library.output.TestOnFileSortedOutput.java
License:Apache License
private OutputContext createTezOutputContext() throws IOException {
  String[] workingDirs = { workingDir.toString() };
  UserPayload payLoad = TezUtils.createUserPayloadFromConf(conf);
  DataOutputBuffer serviceProviderMetaData = new DataOutputBuffer();
  serviceProviderMetaData.writeInt(PORT);

  TezCounters counters = new TezCounters();

  OutputContext context = mock(OutputContext.class);
  doReturn(counters).when(context).getCounters();
  doReturn(workingDirs).when(context).getWorkDirs();
  doReturn(payLoad).when(context).getUserPayload();
  doReturn(5 * 1024 * 1024L).when(context).getTotalMemoryAvailableToTask();
  doReturn(UniqueID).when(context).getUniqueIdentifier();
  doReturn("v1").when(context).getDestinationVertexName();
  doReturn(ByteBuffer.wrap(serviceProviderMetaData.getData())).when(context)
      .getServiceProviderMetaData(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID);
  doAnswer(new Answer() {
    @Override
    public Object answer(InvocationOnMock invocation) throws Throwable {
      long requestedSize = (Long) invocation.getArguments()[0];
      MemoryUpdateCallbackHandler callback = (MemoryUpdateCallbackHandler) invocation.getArguments()[1];
      callback.memoryAssigned(requestedSize);
      return null;
    }
  }).when(context).requestInitialMemory(anyLong(), any(MemoryUpdateCallback.class));

  ExecutionContext executionContext = mock(ExecutionContext.class);
  doReturn(HOST).when(executionContext).getHostName();
  doReturn(executionContext).when(context).getExecutionContext();

  return context;
}
From source file:org.apache.twill.internal.yarn.YarnUtils.java
License:Apache License
/**
 * Encodes the given {@link Credentials} as bytes.
 */
public static ByteBuffer encodeCredentials(Credentials credentials) {
  try {
    DataOutputBuffer out = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(out);
    return ByteBuffer.wrap(out.getData(), 0, out.getLength());
  } catch (IOException e) {
    // Shouldn't throw
    LOG.error("Failed to encode Credentials.", e);
    throw Throwables.propagate(e);
  }
}
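For reference, the reverse operation can be done with org.apache.hadoop.io.DataInputByteBuffer and Credentials.readTokenStorageStream(). A minimal sketch, assuming the ByteBuffer was produced by encodeCredentials() above (decodeCredentials is an illustrative name, not a method of the class shown):

public static Credentials decodeCredentials(ByteBuffer buffer) throws IOException {
  // wrap the encoded bytes in a DataInputStream view; duplicate() leaves the caller's position untouched
  DataInputByteBuffer in = new DataInputByteBuffer();
  in.reset(buffer.duplicate());
  Credentials credentials = new Credentials();
  credentials.readTokenStorageStream(in);
  return credentials;
}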
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java
License:Open Source License
private static void compareKeys(RawComparator<TextBytes> comparator, TextBytes key1, TextBytes key2,
    int expectedResult) {
  long nanoStart = System.nanoTime();
  Assert.assertEquals(comparator.compare(key1, key2), expectedResult);
  long nanoEnd = System.nanoTime();
  System.out.println("Object Comparison Took:" + (nanoEnd - nanoStart));

  DataOutputBuffer outputBuffer1 = new DataOutputBuffer();
  DataOutputBuffer outputBuffer2 = new DataOutputBuffer();

  try {
    key1.write(outputBuffer1);
    key2.write(outputBuffer2);

    nanoStart = System.nanoTime();
    Assert.assertEquals(comparator.compare(outputBuffer1.getData(), 0, outputBuffer1.getLength(),
        outputBuffer2.getData(), 0, outputBuffer2.getLength()), expectedResult);
    nanoEnd = System.nanoTime();
    System.out.println("Raw Comparison Took:" + (nanoEnd - nanoStart));

    int offset1 = outputBuffer1.getLength();
    int offset2 = outputBuffer2.getLength();

    key1.write(outputBuffer1);
    key2.write(outputBuffer2);

    Assert.assertEquals(
        comparator.compare(outputBuffer1.getData(), offset1, outputBuffer1.getLength() - offset1,
            outputBuffer2.getData(), offset2, outputBuffer2.getLength() - offset2),
        expectedResult);

    if (comparator instanceof LinkKeyComparator) {
      DataInputBuffer inputStream1 = new DataInputBuffer();
      DataInputBuffer inputStream2 = new DataInputBuffer();

      inputStream1.reset(outputBuffer1.getData(), outputBuffer1.getLength());
      inputStream2.reset(outputBuffer2.getData(), outputBuffer2.getLength());

      CrawlDBKey cdbkey1 = new CrawlDBKey();
      CrawlDBKey cdbkey2 = new CrawlDBKey();

      cdbkey1.readFields(inputStream1);
      cdbkey2.readFields(inputStream2);

      CrawlDBKeyComparator altComparator = new CrawlDBKeyComparator();

      System.out.println("*Comparing Using CrawlDBKey Comparator");
      nanoStart = System.nanoTime();
      Assert.assertEquals(altComparator.compare(cdbkey1, cdbkey2), expectedResult);
      nanoEnd = System.nanoTime();
      System.out.println("Typed Comparison Took:" + (nanoEnd - nanoStart));
    }
  } catch (IOException e) {
    e.printStackTrace();
    throw new RuntimeException(e);
  }
}
From source file:org.commoncrawl.service.crawler.CrawlerEngine.java
License:Open Source License
FlexBuffer getActiveHostListAsBuffer() throws IOException {
  if (_crawlActive && _httpCrawlQueue != null) {
    DataOutputBuffer outputBuffer = new DataOutputBuffer();
    Set<Integer> ipAddressSet = _httpCrawlQueue.getActiveHostIPs();
    WritableUtils.writeVInt(outputBuffer, ipAddressSet.size());
    for (int hostIP : ipAddressSet) {
      WritableUtils.writeVInt(outputBuffer, hostIP);
    }
    return new FlexBuffer(outputBuffer.getData(), 0, outputBuffer.getLength());
  }
  return null;
}
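The VInt-encoded host list written above can be read back with the matching WritableUtils.readVInt() calls. A minimal sketch, assuming the caller has the raw bytes, offset, and length out of the FlexBuffer (readActiveHostIPs is an illustrative name, not part of CrawlerEngine):

static Set<Integer> readActiveHostIPs(byte[] data, int offset, int length) throws IOException {
  DataInputBuffer in = new DataInputBuffer();
  in.reset(data, offset, length);
  // first VInt is the element count, followed by one VInt per host IP
  int count = WritableUtils.readVInt(in);
  Set<Integer> hostIPs = new HashSet<Integer>();
  for (int i = 0; i < count; i++) {
    hostIPs.add(WritableUtils.readVInt(in));
  }
  return hostIPs;
}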
From source file:org.commoncrawl.service.crawler.CrawlLog.java
License:Open Source License
private void flushLog(final FlushCompletionCallback completionCallback) {
  if (Environment.detailLogEnabled())
    LOG.info("LOG_FLUSH:Collecting Entries....");

  // set flush in progress indicator ...
  setFlushInProgress(true);

  // and collect buffers in async thread context (thus not requiring synchronization)
  final LinkedList<CrawlSegmentLog.LogItemBuffer> collector = new LinkedList<CrawlSegmentLog.LogItemBuffer>();

  // flush robots log
  _robotsSegment.flushLog(collector);

  // walk segments collecting log items ....
  for (CrawlSegmentLog logger : _loggers.values()) {
    // flush any log items into the collector
    logger.flushLog(collector);
  }

  if (Environment.detailLogEnabled())
    LOG.info("LOG_FLUSH:Collection Returned " + collector.size() + " Buffers");

  // walk collector list identifying the list of unique segment ids
  final Set<Long> packedSegmentIdSet = new HashSet<Long>();

  int urlItemCount = 0;

  for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
    if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
      packedSegmentIdSet.add(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
    }
    urlItemCount += buffer.getItemCount();
  }

  if (Environment.detailLogEnabled())
    LOG.info("LOG_FLUSH:There are " + urlItemCount + " Items in Flush Buffer Associated With "
        + packedSegmentIdSet.size() + " Segments");

  final File crawlLogFile = getActivePath(_rootDirectory);

  // now check to see if there is anything to do ...
  if (collector.size() != 0) {

    if (Environment.detailLogEnabled())
      LOG.info("LOG_FLUSH: Collector Size is NOT Zero... Starting Log Flusher Thread");

    // ok ... time to spawn a thread to do the blocking flush io
    _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop, new Callable<Boolean>() {

      public Boolean call() throws Exception {

        if (Environment.detailLogEnabled())
          LOG.info("LOG_FLUSH: Log Flusher Thread Started");

        long startTime = System.currentTimeMillis();

        Map<Long, DataOutputStream> streamsMapByPackedId = new HashMap<Long, DataOutputStream>();
        Map<Long, Integer> recordCountsByPackedId = new HashMap<Long, Integer>();

        long crawlLogRecordCount = 0;

        // open the actual crawler log file ...
        final DataOutputStream crawlLogStream = new DataOutputStream(new FileOutputStream(crawlLogFile, true));

        try {
          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Log Flusher Thread Opening Streams for Segments in Buffer");

          // now open a set of file descriptors related to the identified segments
          for (long packedSegmentId : packedSegmentIdSet) {
            // construct the unique filename for the given log file...
            File activeSegmentLog = CrawlSegmentLog.buildActivePath(_rootDirectory,
                getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId));
            // initialize the segment log ...
            CrawlSegmentLog.initializeLogFile(activeSegmentLog);
            // initialize record counts per stream ...
            recordCountsByPackedId.put(packedSegmentId, CrawlSegmentLog.readerHeader(activeSegmentLog));
            // and open an output stream for the specified log file ...
            streamsMapByPackedId.put(packedSegmentId,
                new DataOutputStream(new FileOutputStream(activeSegmentLog, true)));
          }

          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Log Flusher Thread Walking Items in Buffer");

          // initialize a total item count variable
          int totalItemCount = 0;

          // crawl history stream
          DataOutputBuffer historyStream = new DataOutputBuffer();

          // and now walk log buffers ...
          for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
            if (Environment.detailLogEnabled())
              LOG.info("LOG_FLUSH: Log Flusher Thread Writing " + buffer.getItemCount()
                  + " Entries for Segment:" + buffer.getSegmentId());

            // output stream
            DataOutputStream segmentLogStream = null;

            if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
              // update segment count first ...
              recordCountsByPackedId.put(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()),
                  recordCountsByPackedId.get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()))
                      + buffer.getItemCount());
              // get output stream associated with segment id
              segmentLogStream = streamsMapByPackedId
                  .get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
            }

            // and our local record counter ...
            crawlLogRecordCount += buffer.getItemCount();

            // and next do the actual disk flush ...
            totalItemCount += buffer.flushToDisk(totalItemCount,
                new CrawlSegmentLog.LogItemBuffer.CrawlURLWriter() {

                  SyncedCrawlURLLogWriter syncedLogWriter = new SyncedCrawlURLLogWriter();

                  public void writeItem(CrawlURL url) throws IOException {
                    // log it
                    logCrawlLogWrite(url, url.getContentSize());
                    // write it
                    syncedLogWriter.writeItem(crawlLogStream, url);
                  }

                  public void writeItemCount(int entryCount) throws IOException {
                  }

                }, segmentLogStream, historyStream);
          }

          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Log Flusher Finished Writing Entries To Disk");

          collector.clear();
        } catch (IOException e) {
          LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
          throw e;
        } finally {
          if (crawlLogStream != null) {
            crawlLogStream.flush();
            crawlLogStream.close();
          }
          for (DataOutputStream stream : streamsMapByPackedId.values()) {
            if (stream != null) {
              stream.flush();
              stream.close();
            }
          }
        }

        // at this point... update the crawl log header ...
        try {
          if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Updating Log File Headers");

          // update the log file header
          updateLogFileHeader(crawlLogFile, _header, crawlLogRecordCount);

          // and update each completion log header ...
          for (long packedSegmentId : recordCountsByPackedId.keySet()) {
            File activeSegmentLogPath = CrawlSegmentLog.buildActivePath(_rootDirectory,
                getListIdFromLogId(packedSegmentId), getSegmentIdFromLogId(packedSegmentId));
            CrawlSegmentLog.writeHeader(activeSegmentLogPath, recordCountsByPackedId.get(packedSegmentId));
          }
        } catch (IOException e) {
          LOG.error("Critical Exception during Crawl Log Flush:" + CCStringUtils.stringifyException(e));
          throw e;
        } finally {
        }

        long endTime = System.currentTimeMillis();

        _flushTimeAVG.addSample((double) endTime - startTime);
        _flushTimeSmoothed.addSample((double) endTime - startTime);
        _lastFlushTime = endTime - startTime;

        LOG.info("LOG_FLUSH: Log Flusher Flushed Successfully");

        return true;
      }

    }, new CompletionCallback<Boolean>() {

      public void taskComplete(Boolean updateResult) {
        setFlushInProgress(false);
        if (completionCallback != null) {
          completionCallback.flushComplete();
        }
      }

      public void taskFailed(Exception e) {
        setFlushInProgress(false);
        if (completionCallback != null) {
          completionCallback.flushFailed(e);
        }
        // all failures are critical in this particular task ...
        LOG.fatal("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));
        // no matter ... it is time to CORE the server ...
        throw new RuntimeException("CRITICAL FAILURE: Crawl Log FLUSH Threw Exception:"
            + CCStringUtils.stringifyException(e));
      }
    }));
  } else {
    setFlushInProgress(false);
    if (completionCallback != null) {
      completionCallback.flushComplete();
    }
  }
}
From source file:org.commoncrawl.service.crawler.SegmentLoader.java
License:Open Source License
@SuppressWarnings("unchecked") public static CrawlSegmentFPMap loadCrawlSegmentFPInfo(int listId, int segmentId, String crawlerName, CancelOperationCallback cancelCallback) throws IOException { CrawlSegmentFPMap fpMap = new CrawlSegmentFPMap(); WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost"); // construct hdfs path to segment ... Path hdfsPath;/* w ww . jav a 2 s .c om*/ if (segmentId != -1) hdfsPath = new Path( CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/"); else hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"); Path workUnitDetailPath = new Path(hdfsPath, crawlerName); SequenceFile.Reader reader = null; try { FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem(); reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig()); LongWritable hostFP = new LongWritable(); CrawlSegmentHost segmentHost = new CrawlSegmentHost(); DataOutputBuffer outputBuffer = new DataOutputBuffer(); int segmentUrlCount = 0; while (reader.next(hostFP, segmentHost) && cancelCallback.cancelOperation() == false) { // and update url count ... segmentUrlCount += segmentHost.getUrlTargets().size(); // set the url vector to the appropriate size ... for (CrawlSegmentURL url : segmentHost.getUrlTargets()) { WritableUtils.writeVLong(outputBuffer, segmentHost.getHostFP()); WritableUtils.writeVLong(outputBuffer, url.getUrlFP()); } } outputBuffer.flush(); // ok set the urlfp stream fpMap.setURLFPBuffer(segmentUrlCount, outputBuffer.getData(), outputBuffer.getLength()); // now initialize the if (cancelCallback.cancelOperation()) { return null; } else { return fpMap; } } finally { if (reader != null) reader.close(); } }
From source file:org.commoncrawl.service.listcrawler.CacheWriterThread.java
License:Open Source License
@Override
public void run() {
  boolean shutdown = false;

  while (!shutdown) {
    try {
      final CacheWriteRequest request = _writeRequestQueue.take();

      switch (request._requestType) {

      case ExitThreadRequest: {
        // shutdown condition ...
        CacheManager.LOG.info("Disk Writer Thread Received Shutdown. Exiting!");
        shutdown = true;
      }
        break;

      case WriteRequest: {
        long timeStart = System.currentTimeMillis();

        try {
          // reset crc calculator (single thread so no worries on synchronization)
          _crc32Out.reset();

          // figure out if we need to compress the item ...
          if ((request._item.getFlags() & CacheItem.Flags.Flag_IsCompressed) == 0
              && request._item.getContent().getCount() != 0) {
            LOG.info("Incoming Cache Request Content for:" + request._item.getUrl()
                + " is not compressed. Compressing...");
            ByteStream compressedBytesOut = new ByteStream(request._item.getContent().getCount());
            ThriftyGZIPOutputStream gzipOutputStream = new ThriftyGZIPOutputStream(compressedBytesOut);
            gzipOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
                request._item.getContent().getCount());
            gzipOutputStream.finish();
            LOG.info("Finished Compressing Incoming Content for:" + request._item.getUrl() + " BytesIn:"
                + request._item.getContent().getCount() + " BytesOut:" + compressedBytesOut.size());
            // replace buffer
            request._item
                .setContent(new FlexBuffer(compressedBytesOut.getBuffer(), 0, compressedBytesOut.size()));
            request._item.setFlags((request._item.getFlags() | CacheItem.Flags.Flag_IsCompressed));
          }

          // create streams ...
          ByteStream bufferOutputStream = new ByteStream(8192);
          CheckedOutputStream checkedStream = new CheckedOutputStream(bufferOutputStream, _crc32Out);
          DataOutputStream dataOutputStream = new DataOutputStream(checkedStream);

          // remember if this item has content ...
          boolean hasContent = request._item.isFieldDirty(CacheItem.Field_CONTENT);
          // now mark the content field as clean, so that it will not be serialized in our current
          // serialization attempt ...
          request._item.setFieldClean(CacheItem.Field_CONTENT);
          // and go ahead and write out the data to the intermediate buffer while also computing partial checksum
          request._item.write(dataOutputStream);
          request._item.setFieldDirty(CacheItem.Field_CONTENT);

          // ok, now ... write out file header ...
          CacheItemHeader itemHeader = new CacheItemHeader(_manager.getLocalLogSyncBytes());
          itemHeader._status = CacheItemHeader.STATUS_ALIVE;
          itemHeader._lastAccessTime = System.currentTimeMillis();
          itemHeader._fingerprint = request._itemFingerprint;
          // compute total length ...
          // first the header bytes in the cacheItem
          itemHeader._dataLength = bufferOutputStream.size();
          // next the content length (encoded - as in size + bytes) ...
          itemHeader._dataLength += 4 + request._item.getContent().getCount();
          // lastly the crc value itself ...
          itemHeader._dataLength += 8;

          // open the log file ...
          DataOutputBuffer logStream = new DataOutputBuffer();

          // ok, go ahead and write the header
          itemHeader.writeHeader(logStream);
          // ok now write out the item data minus content...
          logStream.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());

          // now create a checked stream for the content ...
          CheckedOutputStream checkedStream2 = new CheckedOutputStream(logStream, checkedStream.getChecksum());

          dataOutputStream = new DataOutputStream(checkedStream2);

          // content size
          dataOutputStream.writeInt(request._item.getContent().getCount());
          // now write out the content (via checked stream so that we can calc checksum on content)
          dataOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
              request._item.getContent().getCount());
          // ok ... lastly write out the checksum bytes ...
          dataOutputStream.writeLong(checkedStream2.getChecksum().getValue());
          // and FINALLY, write out the total item bytes (so that we can seek in reverse to read the last
          // request from the log)
          logStream.writeInt(CacheItemHeader.SIZE + itemHeader._dataLength);

          // ok flush everything to the memory stream
          dataOutputStream.flush();

          // ok - time to acquire the log semaphore
          // LOG.info("Acquiring Local Log Semaphore");
          _manager.getLocalLogAccessSemaphore().acquireUninterruptibly();

          try {
            // now time to acquire the write semaphore ...
            _manager.getLocalLogWriteAccessSemaphore().acquireUninterruptibly();

            // get the current file position
            long recordOffset = _manager.getLocalLogFilePos();

            try {
              long ioTimeStart = System.currentTimeMillis();

              RandomAccessFile logFile = new RandomAccessFile(_manager.getActiveLogFilePath(), "rw");

              try {
                // seek to our known record offset
                logFile.seek(recordOffset);
                // write out the data
                logFile.write(logStream.getData(), 0, logStream.getLength());
              } finally {
                logFile.close();
              }

              // now we need to update the file header
              _manager.updateLogFileHeader(_manager.getActiveLogFilePath(), 1,
                  CacheItemHeader.SIZE + itemHeader._dataLength + 4 /* trailing bytes */);

              CacheManager.LOG.info("#### Wrote Cache Item in:" + (System.currentTimeMillis() - timeStart)
                  + " iotime:" + (System.currentTimeMillis() - ioTimeStart) + " QueueSize:"
                  + _writeRequestQueue.size());
            } finally {
              // release write semaphore quickly
              _manager.getLocalLogWriteAccessSemaphore().release();
            }

            // now inform the manager of the completed request ...
            _manager.writeRequestComplete(request, recordOffset);
          } finally {
            // LOG.info("Releasing Local Log Semaphore");
            _manager.getLocalLogAccessSemaphore().release();
          }
        } catch (IOException e) {
          CacheManager.LOG.error(
              "### FUC# BATMAN! - GONNA LOSE THIS REQUEST!!!!:" + CCStringUtils.stringifyException(e));
          _manager.writeRequestFailed(request, e);
        }
      }
        break;
      }
    } catch (InterruptedException e) {
    }
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {
  SequenceFile.Reader reader = null;

  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");

  File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

  SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
      CrawlEnvironment.getHadoopConfig());

  ValueBytes valueBytes = indexReader.createValueBytes();
  DataOutputBuffer keyBytes = new DataOutputBuffer();
  DataInputBuffer keyBuffer = new DataInputBuffer();
  DataOutputBuffer finalOutputStream = new DataOutputBuffer();
  DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
  URLFP fp = new URLFP();

  try {
    while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {
      keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
      // read fingerprint ...
      fp.readFields(keyBuffer);
      // write hash only
      finalOutputStream.writeLong(fp.getUrlHash());
      uncompressedValueBytes.reset();
      // write value bytes to intermediate buffer ...
      valueBytes.writeUncompressedBytes(uncompressedValueBytes);
      // write out uncompressed length
      WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
      // write out bytes
      finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
    }
    // delete existing ...
    cacheFilePath.delete();

    // compute crc ...
    CRC32 crc = new CRC32();
    crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());

    // open final output stream
    DataOutputStream fileOutputStream = new DataOutputStream(
        new BufferedOutputStream(new FileOutputStream(cacheFilePath)));

    try {
      fileOutputStream.writeLong(crc.getValue());
      fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
      fileOutputStream.flush();
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      fileOutputStream.close();
      fileOutputStream = null;
      cacheFilePath.delete();
      throw e;
    } finally {
      if (fileOutputStream != null) {
        fileOutputStream.close();
      }
    }
  } finally {
    if (indexReader != null) {
      indexReader.close();
    }
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
    ItemUpdater targetList) throws IOException {

  // ok copy stuff locally if possible ...
  File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
  File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
  File localBloomFilterPath = new File(getLocalDataDir(),
      CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

  SequenceFile.Reader reader = null;

  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");
  Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

  // ok copy local first
  if (!localIndexPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
        + localIndexPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localIndexPath.delete();
      throw e;
    }
  }
  if (!localDataPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
        + localDataPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localDataPath.delete();
      throw e;
    }
  }
  if (!localBloomFilterPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
        + localBloomFilterPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localBloomFilterPath.delete();
      throw e;
    }
  }

  // ok open local
  FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

  SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
      new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

  try {
    URLFP firstIndexKey = null;
    URLFP lastIndexKey = new URLFP();
    LongWritable position = new LongWritable();
    while (indexReader.next(lastIndexKey, position)) {
      if (firstIndexKey == null) {
        try {
          firstIndexKey = (URLFP) lastIndexKey.clone();
        } catch (CloneNotSupportedException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
        + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
        + lastIndexKey.getUrlHash());

    URLFP criteriaFirstKey = criteria.first();
    URLFP criteriaLastKey = criteria.last();

    if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
      LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
      LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash()
          + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:" + criteriaLastKey.getDomainHash()
          + " URLHash:" + criteriaLastKey.getUrlHash());
      return;
    }
  } finally {
    indexReader.close();
  }

  LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");

  // load bloom filter
  FSDataInputStream bloomFilterStream = localFileSystem.open(new Path(localBloomFilterPath.getAbsolutePath()));

  int hitCount = 0;

  try {
    URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

    URLFP fpOut = new URLFP();
    ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
    DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
    ValueBytes valueBytes = null;
    DataInputBuffer valueReader = new DataInputBuffer();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();

    URLFP lastFP = null;

    outerLoop:
    // now iterate each item in the criteria
    for (URLFP targetFP : criteria) {
      // if fingerprint is present in filter ...
      if (filter.isPresent(targetFP)) {
        // check to see if reader is initialized ...
        if (reader == null) {
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
              + localDataPath.getAbsolutePath());
          reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
              CrawlEnvironment.getHadoopConfig());
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
              + localDataPath.getAbsolutePath());
          valueBytes = reader.createValueBytes();
        }

        // if last read fingerprint was not null ...
        if (lastFP != null) {
          // does it match the current item
          if (lastFP.compareTo(targetFP) == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + lastFP.getUrlHash() + " File:"
                + dataFilePath);
            // if so, null out last fp
            lastFP = null;
            // and update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            continue;
          }
        }

        // ok at this point .. read the next item in the list ...
        lastFP = null;

        while (reader.nextRaw(keyBytes, valueBytes) != -1) {
          // init reader ...
          keyReader.reset(keyBytes.getData(), keyBytes.getLength());
          // read key
          fpOut.readFields(keyReader);
          // reset output buffer
          keyBytes.reset();

          // LOG.info("LIST:" + listId + " nextRaw Returned DH:" + fpOut.getDomainHash() + " UH:"
          //     + fpOut.getUrlHash() + " TDH:" + targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());

          // compare it to target ...
          int result = fpOut.compareTo(targetFP);
          // ok does it match .. ?
          if (result == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash() + " File:"
                + dataFilePath);
            // update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            // and break to outer loop
            continue outerLoop;
          } else if (result == 1) {
            // LOG.info("LIST:" + listId + " FP Comparison Returned 1. Going to OuterLoop");
            // update last FP
            lastFP = fpOut;
            // continue outer loop
            continue outerLoop;
          } else {
            // otherwise skip
          }
        }

        // ok if we got here .. we are done reading the sequence file and did not find a trailing match
        LOG.warn("LIST:" + listId
            + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positive result (DomainHash:"
            + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
        // break out of outer loop
        break;
      }
    }
  } finally {
    bloomFilterStream.close();

    if (reader != null) {
      reader.close();
    }

    LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
  }
}