Example usage for org.apache.hadoop.io DataOutputBuffer DataOutputBuffer

Introduction

This page collects example usages of the no-argument constructor of org.apache.hadoop.io.DataOutputBuffer, DataOutputBuffer().

Prototype

public DataOutputBuffer() 

Document

Constructs a new empty buffer.
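
As a quick orientation before the project examples, here is a minimal round-trip sketch (not taken from any of the projects below) showing the constructor together with the calls that recur throughout this page: write data into a new DataOutputBuffer, hand getData()/getLength() to a DataInputBuffer to read it back, and reset() the buffer for reuse. The class name DataOutputBufferRoundTrip is made up for illustration.

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class DataOutputBufferRoundTrip {
    public static void main(String[] args) throws IOException {
        // construct a new empty buffer and serialize a Writable plus a primitive into it
        DataOutputBuffer out = new DataOutputBuffer();
        new Text("hello").write(out);
        out.writeLong(42L);

        // getData() exposes the backing array; only the first getLength() bytes are valid
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), 0, out.getLength());

        Text key = new Text();
        key.readFields(in);
        long value = in.readLong();
        System.out.println(key + " / " + value);

        // reset() rewinds the write position so the buffer can be reused without reallocating
        out.reset();
    }
}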

Usage

From source file:org.apache.tez.runtime.library.common.TestValuesIterator.java

License:Apache License

/**
 * Creates in-memory segments.
 *
 * @return list of in-memory segments
 * @throws IOException
 */
public List<TezMerger.Segment> createInMemStreams() throws IOException {
    int numberOfStreams = Math.max(2, rnd.nextInt(10));
    LOG.info("No of streams : " + numberOfStreams);

    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer keySerializer = serializationFactory.getSerializer(keyClass);
    Serializer valueSerializer = serializationFactory.getSerializer(valClass);

    LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
    InputContext context = createTezInputContext();
    MergeManager mergeManager = new MergeManager(conf, fs, localDirAllocator, context, null, null, null, null,
            null, 1024 * 1024 * 10, null, false, -1);

    DataOutputBuffer keyBuf = new DataOutputBuffer();
    DataOutputBuffer valBuf = new DataOutputBuffer();
    DataInputBuffer keyIn = new DataInputBuffer();
    DataInputBuffer valIn = new DataInputBuffer();
    keySerializer.open(keyBuf);
    valueSerializer.open(valBuf);

    List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
    for (int i = 0; i < numberOfStreams; i++) {
        BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
        InMemoryWriter writer = new InMemoryWriter(bout);
        Map<Writable, Writable> data = createData();
        //write data
        for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
            keySerializer.serialize(entry.getKey());
            valueSerializer.serialize(entry.getValue());
            keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
            valIn.reset(valBuf.getData(), 0, valBuf.getLength());
            writer.append(keyIn, valIn);
            originalData.put(entry.getKey(), entry.getValue());
            keyBuf.reset();
            valBuf.reset();
            keyIn.reset();
            valIn.reset();
        }
        IFile.Reader reader = new InMemoryReader(mergeManager, null, bout.getBuffer(), 0,
                bout.getBuffer().length);
        segments.add(new TezMerger.Segment(reader, true));

        data.clear();
        writer.close();
    }
    return segments;
}

From source file:org.apache.tez.runtime.library.output.TestOnFileSortedOutput.java

License:Apache License

private OutputContext createTezOutputContext() throws IOException {
    String[] workingDirs = { workingDir.toString() };
    UserPayload payLoad = TezUtils.createUserPayloadFromConf(conf);
    DataOutputBuffer serviceProviderMetaData = new DataOutputBuffer();
    serviceProviderMetaData.writeInt(PORT);

    TezCounters counters = new TezCounters();

    OutputContext context = mock(OutputContext.class);
    doReturn(counters).when(context).getCounters();
    doReturn(workingDirs).when(context).getWorkDirs();
    doReturn(payLoad).when(context).getUserPayload();
    doReturn(5 * 1024 * 1024l).when(context).getTotalMemoryAvailableToTask();
    doReturn(UniqueID).when(context).getUniqueIdentifier();
    doReturn("v1").when(context).getDestinationVertexName();
    doReturn(ByteBuffer.wrap(serviceProviderMetaData.getData())).when(context)
            .getServiceProviderMetaData(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID);
    doAnswer(new Answer() {
        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            long requestedSize = (Long) invocation.getArguments()[0];
            MemoryUpdateCallbackHandler callback = (MemoryUpdateCallbackHandler) invocation.getArguments()[1];
            callback.memoryAssigned(requestedSize);
            return null;
        }
    }).when(context).requestInitialMemory(anyLong(), any(MemoryUpdateCallback.class));
    ExecutionContext ExecutionContext = mock(ExecutionContext.class);
    doReturn(HOST).when(ExecutionContext).getHostName();
    doReturn(ExecutionContext).when(context).getExecutionContext();
    return context;
}

From source file:org.apache.twill.internal.yarn.YarnUtils.java

License:Apache License

/**
 * Encodes the given {@link Credentials} as bytes.
 */
public static ByteBuffer encodeCredentials(Credentials credentials) {
    try {
        DataOutputBuffer out = new DataOutputBuffer();
        credentials.writeTokenStorageToStream(out);
        return ByteBuffer.wrap(out.getData(), 0, out.getLength());
    } catch (IOException e) {
        // Shouldn't throw
        LOG.error("Failed to encode Credentials.", e);
        throw Throwables.propagate(e);
    }
}
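
The inverse direction is usually just as short: wrap the encoded bytes in a DataInputBuffer (a DataInputStream subclass) and hand it to Credentials#readTokenStorageStream. The method below is a hedged sketch of that pattern rather than a copy of the Twill source, and it assumes a heap-backed ByteBuffer such as the one produced by encodeCredentials above (imports assumed: org.apache.hadoop.security.Credentials, org.apache.hadoop.io.DataInputBuffer, java.nio.ByteBuffer).

public static Credentials decodeCredentials(ByteBuffer buffer) throws IOException {
    Credentials credentials = new Credentials();
    DataInputBuffer in = new DataInputBuffer();
    // point the input buffer at the encoded bytes without copying them
    in.reset(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
    credentials.readTokenStorageStream(in);
    return credentials;
}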

From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java

License:Open Source License

private static void compareKeys(RawComparator<TextBytes> comparator, TextBytes key1, TextBytes key2,
        int expectedResult) {
    long nanoStart = System.nanoTime();
    Assert.assertEquals(comparator.compare(key1, key2), expectedResult);
    long nanoEnd = System.nanoTime();
    System.out.println("Object Comparison Took:" + (nanoEnd - nanoStart));
    DataOutputBuffer outputBuffer1 = new DataOutputBuffer();
    DataOutputBuffer outputBuffer2 = new DataOutputBuffer();
    try {
        key1.write(outputBuffer1);
        key2.write(outputBuffer2);
        nanoStart = System.nanoTime();
        Assert.assertEquals(comparator.compare(outputBuffer1.getData(), 0, outputBuffer1.getLength(),
                outputBuffer2.getData(), 0, outputBuffer2.getLength()), expectedResult);
        nanoEnd = System.nanoTime();
        System.out.println("Raw Comparison Took:" + (nanoEnd - nanoStart));
        int offset1 = outputBuffer1.getLength();
        int offset2 = outputBuffer2.getLength();
        key1.write(outputBuffer1);
        key2.write(outputBuffer2);
        Assert.assertEquals(
                comparator.compare(outputBuffer1.getData(), offset1, outputBuffer1.getLength() - offset1,
                        outputBuffer2.getData(), offset2, outputBuffer2.getLength() - offset2),
                expectedResult);

        if (comparator instanceof LinkKeyComparator) {
            DataInputBuffer inputStream1 = new DataInputBuffer();
            DataInputBuffer inputStream2 = new DataInputBuffer();

            inputStream1.reset(outputBuffer1.getData(), outputBuffer1.getLength());
            inputStream2.reset(outputBuffer2.getData(), outputBuffer2.getLength());

            CrawlDBKey cdbkey1 = new CrawlDBKey();
            CrawlDBKey cdbkey2 = new CrawlDBKey();

            cdbkey1.readFields(inputStream1);
            cdbkey2.readFields(inputStream2);

            CrawlDBKeyComparator altComparator = new CrawlDBKeyComparator();
            System.out.println("*Comparing Using CrawlDBKey Comparator");
            nanoStart = System.nanoTime();
            Assert.assertEquals(altComparator.compare(cdbkey1, cdbkey2), expectedResult);
            nanoEnd = System.nanoTime();
            System.out.println("Typed Comparison Took:" + (nanoEnd - nanoStart));

        }

    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}

From source file:org.commoncrawl.service.crawler.CrawlerEngine.java

License:Open Source License

FlexBuffer getActiveHostListAsBuffer() throws IOException {
    if (_crawlActive && _httpCrawlQueue != null) {

        DataOutputBuffer outputBuffer = new DataOutputBuffer();

        Set<Integer> ipAddressSet = _httpCrawlQueue.getActiveHostIPs();

        WritableUtils.writeVInt(outputBuffer, ipAddressSet.size());

        for (int hostIP : ipAddressSet) {
            WritableUtils.writeVInt(outputBuffer, hostIP);
        }

        return new FlexBuffer(outputBuffer.getData(), 0, outputBuffer.getLength());
    }
    return null;
}
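
For symmetry, the vint-framed IP list written above can be decoded by pointing a DataInputBuffer at the same bytes and reading with WritableUtils.readVInt. The helper below is a hypothetical sketch for illustration, not part of CrawlerEngine (imports assumed: org.apache.hadoop.io.DataInputBuffer, org.apache.hadoop.io.WritableUtils, java.util.HashSet, java.util.Set).

Set<Integer> parseActiveHostList(byte[] data, int offset, int length) throws IOException {
    DataInputBuffer in = new DataInputBuffer();
    in.reset(data, offset, length);
    // the writer emitted the element count first, then one vint per host IP
    int count = WritableUtils.readVInt(in);
    Set<Integer> hostIPs = new HashSet<Integer>(count);
    for (int i = 0; i < count; i++) {
        hostIPs.add(WritableUtils.readVInt(in));
    }
    return hostIPs;
}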

From source file:org.commoncrawl.service.crawler.CrawlLog.java

License:Open Source License

private void flushLog(final FlushCompletionCallback completionCallback) {
    if (Environment.detailLogEnabled())
        LOG.info("LOG_FLUSH:Collecting Entries....");
    // set flush in progress indicator ...
    setFlushInProgress(true);
    // and collect buffers in async thread context (thus not requiring
    // synchronization)
    final LinkedList<CrawlSegmentLog.LogItemBuffer> collector = new LinkedList<CrawlSegmentLog.LogItemBuffer>();
    // flush robots log
    _robotsSegment.flushLog(collector);
    // walk segments collecting log items ....
    for (CrawlSegmentLog logger : _loggers.values()) {
        // flush any log items into the collector
        logger.flushLog(collector);
    }
    if (Environment.detailLogEnabled())
        LOG.info("LOG_FLUSH:Collection Returned " + collector.size() + " Buffers");

    // walk collector list identifying the list of unique segment ids
    final Set<Long> packedSegmentIdSet = new HashSet<Long>();

    int urlItemCount = 0;

    for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
        if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
            packedSegmentIdSet.add(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
        }
        urlItemCount += buffer.getItemCount();
    }

    if (Environment.detailLogEnabled())
        LOG.info("LOG_FLUSH:There are  " + urlItemCount + " Items in Flush Buffer Associated With "
                + packedSegmentIdSet.size() + " Segments");

    final File crawlLogFile = getActivePath(_rootDirectory);

    // now check to see if there is anything to do ...
    if (collector.size() != 0) {
        if (Environment.detailLogEnabled())
            LOG.info("LOG_FLUSH: Collector Size is NOT Zero... Starting Log Flusher Thread");
        // ok ... time to spawn a thread to do the blocking flush io
        _threadPool.submit(new ConcurrentTask<Boolean>(_eventLoop,

                new Callable<Boolean>() {

                    public Boolean call() throws Exception {

                        if (Environment.detailLogEnabled())
                            LOG.info("LOG_FLUSH: Log Flusher Thread Started");
                        long startTime = System.currentTimeMillis();

                        Map<Long, DataOutputStream> streamsMapByPackedId = new HashMap<Long, DataOutputStream>();
                        Map<Long, Integer> recordCountsByPackedId = new HashMap<Long, Integer>();

                        long crawlLogRecordCount = 0;

                        // open the actual crawler log file ...
                        final DataOutputStream crawlLogStream = new DataOutputStream(
                                new FileOutputStream(crawlLogFile, true));

                        try {
                            if (Environment.detailLogEnabled())
                                LOG.info(
                                        "LOG_FLUSH: Log Flusher Thread Opening Streams for Segments in Buffer");
                            // now open a set of file descriptors related to the identified
                            // segments
                            for (long packedSegmentId : packedSegmentIdSet) {
                                // construct the unique filename for the given log file...
                                File activeSegmentLog = CrawlSegmentLog.buildActivePath(_rootDirectory,
                                        getListIdFromLogId(packedSegmentId),
                                        getSegmentIdFromLogId(packedSegmentId));
                                // initialize the segment log ...
                                CrawlSegmentLog.initializeLogFile(activeSegmentLog);
                                // initialize record counts per stream ...
                                recordCountsByPackedId.put(packedSegmentId,
                                        CrawlSegmentLog.readerHeader(activeSegmentLog));
                                // and open an output stream for the specified log file ...
                                streamsMapByPackedId.put(packedSegmentId,
                                        new DataOutputStream(new FileOutputStream(activeSegmentLog, true)));
                            }

                            if (Environment.detailLogEnabled())
                                LOG.info("LOG_FLUSH: Log Flusher Thread Walking Items in Buffer");

                            // initialize a total item count variable
                            int totalItemCount = 0;

                            // crawl history stream
                            DataOutputBuffer historyStream = new DataOutputBuffer();

                            // and now walk log buffers ...
                            for (CrawlSegmentLog.LogItemBuffer buffer : collector) {
                                if (Environment.detailLogEnabled())
                                    LOG.info("LOG_FLUSH: Log Flusher Thread Writing " + buffer.getItemCount()
                                            + " Entries for Segment:" + buffer.getSegmentId());

                                // output stream
                                DataOutputStream segmentLogStream = null;

                                if (buffer.getListId() != -1 && buffer.getSegmentId() != -1) {
                                    // update segment count first ...
                                    recordCountsByPackedId.put(
                                            makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()),
                                            recordCountsByPackedId.get(
                                                    makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()))
                                                    + buffer.getItemCount());
                                    // get output stream associated with segment id
                                    segmentLogStream = streamsMapByPackedId
                                            .get(makeSegmentLogId(buffer.getListId(), buffer.getSegmentId()));
                                }

                                // and our local record counter ...
                                crawlLogRecordCount += buffer.getItemCount();

                                // and next do the actual disk flush ...
                                totalItemCount += buffer.flushToDisk(totalItemCount,

                                        new CrawlSegmentLog.LogItemBuffer.CrawlURLWriter() {

                                            SyncedCrawlURLLogWriter syncedLogWriter = new SyncedCrawlURLLogWriter();

                                            public void writeItem(CrawlURL url) throws IOException {
                                                // log it
                                                logCrawlLogWrite(url, url.getContentSize());
                                                // write it
                                                syncedLogWriter.writeItem(crawlLogStream, url);
                                            }

                                            public void writeItemCount(int entryCount) throws IOException {
                                            }

                                        }, segmentLogStream, historyStream);
                            }

                            if (Environment.detailLogEnabled())
                                LOG.info("LOG_FLUSH: Log Flusher Finished Writing Entries To Disk");
                            collector.clear();

                        } catch (IOException e) {
                            LOG.error("Critical Exception during Crawl Log Flush:"
                                    + CCStringUtils.stringifyException(e));
                            throw e;
                        } finally {
                            if (crawlLogStream != null) {
                                crawlLogStream.flush();
                                crawlLogStream.close();
                            }

                            for (DataOutputStream stream : streamsMapByPackedId.values()) {
                                if (stream != null)
                                    stream.flush();
                                stream.close();
                            }
                        }
                        // at this point... update the crawl log header ...
                        try {
                            if (Environment.detailLogEnabled())
                                LOG.info("LOG_FLUSH: Updating Log File Headers");
                            // update the log file header
                            updateLogFileHeader(crawlLogFile, _header, crawlLogRecordCount);
                            // and update each completion log header ...
                            for (long packedSegmentId : recordCountsByPackedId.keySet()) {
                                File activeSegmentLogPath = CrawlSegmentLog.buildActivePath(_rootDirectory,
                                        getListIdFromLogId(packedSegmentId),
                                        getSegmentIdFromLogId(packedSegmentId));
                                CrawlSegmentLog.writeHeader(activeSegmentLogPath,
                                        recordCountsByPackedId.get(packedSegmentId));
                            }
                        } catch (IOException e) {
                            LOG.error("Criticial Exception during Crawl Log Fluhs:"
                                    + CCStringUtils.stringifyException(e));
                            throw e;
                        } finally {

                        }

                        long endTime = System.currentTimeMillis();

                        _flushTimeAVG.addSample((double) endTime - startTime);
                        _flushTimeSmoothed.addSample((double) endTime - startTime);
                        _lastFlushTime = endTime - startTime;

                        LOG.info("LOG_FLUSH: Log Flusher Flushed Successfully");
                        return true;
                    }
                },

                new CompletionCallback<Boolean>() {

                    public void taskComplete(Boolean updateResult) {
                        setFlushInProgress(false);
                        if (completionCallback != null) {
                            completionCallback.flushComplete();
                        }
                    }

                    public void taskFailed(Exception e) {

                        setFlushInProgress(false);

                        if (completionCallback != null) {
                            completionCallback.flushFailed(e);
                        }

                        // all failures are critical in this particular task ...
                        LOG.fatal("Crawl Log FLUSH Threw Exception:" + CCStringUtils.stringifyException(e));

                        // no matter ... it is time to CORE the server ...
                        throw new RuntimeException("CRITICAL FAILURE: Crawl Log FLUSH Threw Exception:"
                                + CCStringUtils.stringifyException(e));

                    }
                }));
    } else {
        setFlushInProgress(false);
        if (completionCallback != null) {
            completionCallback.flushComplete();
        }
    }
}

From source file:org.commoncrawl.service.crawler.SegmentLoader.java

License:Open Source License

@SuppressWarnings("unchecked")
public static CrawlSegmentFPMap loadCrawlSegmentFPInfo(int listId, int segmentId, String crawlerName,
        CancelOperationCallback cancelCallback) throws IOException {

    CrawlSegmentFPMap fpMap = new CrawlSegmentFPMap();

    WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost");

    // construct hdfs path to segment ... 
    Path hdfsPath;
    if (segmentId != -1)
        hdfsPath = new Path(
                CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/");
    else
        hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/");

    Path workUnitDetailPath = new Path(hdfsPath, crawlerName);

    SequenceFile.Reader reader = null;

    try {
        FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();
        reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig());

        LongWritable hostFP = new LongWritable();
        CrawlSegmentHost segmentHost = new CrawlSegmentHost();

        DataOutputBuffer outputBuffer = new DataOutputBuffer();

        int segmentUrlCount = 0;
        while (reader.next(hostFP, segmentHost) && cancelCallback.cancelOperation() == false) {
            // and update url count ... 
            segmentUrlCount += segmentHost.getUrlTargets().size();

            // set the url vector to the appropriate size ... 
            for (CrawlSegmentURL url : segmentHost.getUrlTargets()) {

                WritableUtils.writeVLong(outputBuffer, segmentHost.getHostFP());
                WritableUtils.writeVLong(outputBuffer, url.getUrlFP());
            }
        }
        outputBuffer.flush();
        // ok set the urlfp stream 
        fpMap.setURLFPBuffer(segmentUrlCount, outputBuffer.getData(), outputBuffer.getLength());
        // now initialize the 

        if (cancelCallback.cancelOperation()) {
            return null;
        } else {
            return fpMap;
        }
    } finally {
        if (reader != null)
            reader.close();
    }
}

From source file:org.commoncrawl.service.listcrawler.CacheWriterThread.java

License:Open Source License

@Override
public void run() {

    boolean shutdown = false;

    while (!shutdown) {
        try {
            final CacheWriteRequest request = _writeRequestQueue.take();

            switch (request._requestType) {

            case ExitThreadRequest: {
                // shutdown condition ... 
                CacheManager.LOG.info("Disk Writer Thread Received Shutdown. Exiting!");
                shutdown = true;
            }
                break;

            case WriteRequest: {

                long timeStart = System.currentTimeMillis();

                try {
                    // reset crc calculator (single thread so no worries on synchronization)
                    _crc32Out.reset();

                    // figure out if we need to compress the item ... 
                    if ((request._item.getFlags() & CacheItem.Flags.Flag_IsCompressed) == 0
                            && request._item.getContent().getCount() != 0) {
                        LOG.info("Incoming Cache Request Content for:" + request._item.getUrl()
                                + " is not compressed. Compressing...");
                        ByteStream compressedBytesOut = new ByteStream(request._item.getContent().getCount());
                        ThriftyGZIPOutputStream gzipOutputStream = new ThriftyGZIPOutputStream(
                                compressedBytesOut);
                        gzipOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
                                request._item.getContent().getCount());
                        gzipOutputStream.finish();
                        LOG.info("Finished Compressing Incoming Content for:" + request._item.getUrl()
                                + " BytesIn:" + request._item.getContent().getCount() + " BytesOut:"
                                + compressedBytesOut.size());
                        // replace buffer

                        request._item.setContent(
                                new FlexBuffer(compressedBytesOut.getBuffer(), 0, compressedBytesOut.size()));
                        request._item.setFlags((request._item.getFlags() | CacheItem.Flags.Flag_IsCompressed));
                    }

                    // create streams ...
                    ByteStream bufferOutputStream = new ByteStream(8192);

                    CheckedOutputStream checkedStream = new CheckedOutputStream(bufferOutputStream, _crc32Out);
                    DataOutputStream dataOutputStream = new DataOutputStream(checkedStream);

                    // remember if this item has content ... 
                    boolean hasContent = request._item.isFieldDirty(CacheItem.Field_CONTENT);
                    // now mark the content field as clean, so that it will not be serialized in our current serialization attempt ... 
                    request._item.setFieldClean(CacheItem.Field_CONTENT);
                    // and go ahead and write out the data to the intermediate buffer while also computing partial checksum 
                    request._item.write(dataOutputStream);

                    request._item.setFieldDirty(CacheItem.Field_CONTENT);

                    // ok, now ... write out file header ... 
                    CacheItemHeader itemHeader = new CacheItemHeader(_manager.getLocalLogSyncBytes());

                    itemHeader._status = CacheItemHeader.STATUS_ALIVE;
                    itemHeader._lastAccessTime = System.currentTimeMillis();
                    itemHeader._fingerprint = request._itemFingerprint;
                    // compute total length ... 

                    // first the header bytes in the cacheItem 
                    itemHeader._dataLength = bufferOutputStream.size();
                    // next the content length (encoded - as in size + bytes) ... 
                    itemHeader._dataLength += 4 + request._item.getContent().getCount();
                    // lastly the crc value itself ... 
                    itemHeader._dataLength += 8;
                    // open the log file ... 
                    DataOutputBuffer logStream = new DataOutputBuffer();

                    // ok, go ahead and write the header 
                    itemHeader.writeHeader(logStream);
                    // ok now write out the item data minus content... 
                    logStream.write(bufferOutputStream.getBuffer(), 0, bufferOutputStream.size());
                    // now create a checked stream for the content ... 
                    CheckedOutputStream checkedStream2 = new CheckedOutputStream(logStream,
                            checkedStream.getChecksum());

                    dataOutputStream = new DataOutputStream(checkedStream2);

                    // content size 
                    dataOutputStream.writeInt(request._item.getContent().getCount());
                    // now write out the content (via checked stream so that we can calc checksum on content)
                    dataOutputStream.write(request._item.getContent().getReadOnlyBytes(), 0,
                            request._item.getContent().getCount());
                    // ok ... lastly write out the checksum bytes ... 
                    dataOutputStream.writeLong(checkedStream2.getChecksum().getValue());
                    // and FINALLY, write out the total item bytes (so that we can seek in reverse to read the last request log)
                    logStream.writeInt(CacheItemHeader.SIZE + itemHeader._dataLength);

                    // ok flush everything to the memory stream 
                    dataOutputStream.flush();

                    //ok - time to acquire the log semaphore 
                    //LOG.info("Acquiring Local Log Semaphore");
                    _manager.getLocalLogAccessSemaphore().acquireUninterruptibly();

                    try {

                        // now time to acquire the write semaphore ... 
                        _manager.getLocalLogWriteAccessSemaphore().acquireUninterruptibly();

                        // get the current file position 
                        long recordOffset = _manager.getLocalLogFilePos();

                        try {

                            long ioTimeStart = System.currentTimeMillis();

                            RandomAccessFile logFile = new RandomAccessFile(_manager.getActiveLogFilePath(),
                                    "rw");

                            try {
                                // seek to our known record offset 
                                logFile.seek(recordOffset);
                                // write out the data
                                logFile.write(logStream.getData(), 0, logStream.getLength());
                            } finally {
                                logFile.close();
                            }
                            // now we need to update the file header 
                            _manager.updateLogFileHeader(_manager.getActiveLogFilePath(), 1,
                                    CacheItemHeader.SIZE + itemHeader._dataLength + 4 /*trailing bytes*/);

                            CacheManager.LOG
                                    .info("#### Wrote Cache Item in:" + (System.currentTimeMillis() - timeStart)
                                            + " iotime:" + (System.currentTimeMillis() - ioTimeStart)
                                            + " QueueSize:" + _writeRequestQueue.size());

                        } finally {
                            // release write semaphore quickly 
                            _manager.getLocalLogWriteAccessSemaphore().release();
                        }

                        // now inform the manager of the completed request ... 
                        _manager.writeRequestComplete(request, recordOffset);
                    } finally {
                        //LOG.info("Releasing Local Log Semaphore");
                        _manager.getLocalLogAccessSemaphore().release();
                    }
                } catch (IOException e) {
                    CacheManager.LOG.error("### FUC# BATMAN! - GONNA LOSE THIS REQUEST!!!!:"
                            + CCStringUtils.stringifyException(e));
                    _manager.writeRequestFailed(request, e);
                }
            }
                break;
            }
        } catch (InterruptedException e) {

        }
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License:Open Source License

private void cacheCrawlHistoryLog(File localCacheDir, long timestamp) throws IOException {

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    File cacheFilePath = new File(localCacheDir, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);

    SequenceFile.Reader indexReader = new SequenceFile.Reader(_remoteFileSystem, dataFilePath,
            CrawlEnvironment.getHadoopConfig());

    ValueBytes valueBytes = indexReader.createValueBytes();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataOutputBuffer finalOutputStream = new DataOutputBuffer();
    DataOutputBuffer uncompressedValueBytes = new DataOutputBuffer();
    URLFP fp = new URLFP();

    try {
        while (indexReader.nextRaw(keyBytes, valueBytes) != -1) {

            keyBuffer.reset(keyBytes.getData(), 0, keyBytes.getLength());
            // read fingerprint ...
            fp.readFields(keyBuffer);
            // write hash only
            finalOutputStream.writeLong(fp.getUrlHash());
            uncompressedValueBytes.reset();
            // write value bytes to intermediate buffer ...
            valueBytes.writeUncompressedBytes(uncompressedValueBytes);
            // write out uncompressed length
            WritableUtils.writeVInt(finalOutputStream, uncompressedValueBytes.getLength());
            // write out bytes
            finalOutputStream.write(uncompressedValueBytes.getData(), 0, uncompressedValueBytes.getLength());
        }
        // delete existing ...
        cacheFilePath.delete();
        // compute crc ...
        CRC32 crc = new CRC32();
        crc.update(finalOutputStream.getData(), 0, finalOutputStream.getLength());
        // open final output stream
        DataOutputStream fileOutputStream = new DataOutputStream(
                new BufferedOutputStream(new FileOutputStream(cacheFilePath)));

        try {
            fileOutputStream.writeLong(crc.getValue());
            fileOutputStream.write(finalOutputStream.getData(), 0, finalOutputStream.getLength());
            fileOutputStream.flush();
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            fileOutputStream.close();
            fileOutputStream = null;
            cacheFilePath.delete();
            throw e;
        } finally {
            if (fileOutputStream != null) {
                fileOutputStream.close();
            }
        }
    } finally {
        if (indexReader != null) {
            indexReader.close();
        }
    }
}

From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java

License:Open Source License

private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
        ItemUpdater targetList) throws IOException {

    // ok copy stuff locally if possible ...
    File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
    File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
    File localBloomFilterPath = new File(getLocalDataDir(),
            CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

    SequenceFile.Reader reader = null;
    Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
    Path indexFilePath = new Path(mapFilePath, "index");
    Path dataFilePath = new Path(mapFilePath, "data");
    Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

    // ok copy local first
    if (!localIndexPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
                + localIndexPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localIndexPath.delete();
            throw e;
        }
    }
    if (!localDataPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
                + localDataPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localDataPath.delete();
            throw e;
        }

    }
    if (!localBloomFilterPath.exists()) {
        LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
                + localBloomFilterPath.getAbsolutePath());
        try {
            _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            localBloomFilterPath.delete();
            throw e;
        }

    }

    // ok open local
    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
            new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

    try {
        URLFP firstIndexKey = null;
        URLFP lastIndexKey = new URLFP();
        LongWritable position = new LongWritable();
        while (indexReader.next(lastIndexKey, position)) {
            if (firstIndexKey == null) {
                try {
                    firstIndexKey = (URLFP) lastIndexKey.clone();
                } catch (CloneNotSupportedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }

        LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
                + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
                + lastIndexKey.getUrlHash());

        URLFP criteriaFirstKey = criteria.first();
        URLFP criteriaLastKey = criteria.last();

        if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
            LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
            LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash()
                    + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:"
                    + criteriaLastKey.getDomainHash() + " URLHash:" + criteriaLastKey.getUrlHash());
            return;
        }
    } finally {
        indexReader.close();
    }

    LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");
    // load bloom filter
    FSDataInputStream bloomFilterStream = localFileSystem
            .open(new Path(localBloomFilterPath.getAbsolutePath()));

    int hitCount = 0;

    try {
        URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

        URLFP fpOut = new URLFP();
        ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
        DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
        ValueBytes valueBytes = null;
        DataInputBuffer valueReader = new DataInputBuffer();
        DataOutputBuffer keyBytes = new DataOutputBuffer();
        DataInputBuffer keyReader = new DataInputBuffer();

        URLFP lastFP = null;

        outerLoop:
        // now iterate each item in the criteria
        for (URLFP targetFP : criteria) {
            // if fingerprint is present in filter ...
            if (filter.isPresent(targetFP)) {
                // check to see if reader is initialized ...
                if (reader == null) {
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
                            CrawlEnvironment.getHadoopConfig());
                    LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
                            + localDataPath.getAbsolutePath());
                    valueBytes = reader.createValueBytes();
                }

                // if last read fingerprint was not null ...
                if (lastFP != null) {
                    // does it match the current item
                    if (lastFP.compareTo(targetFP) == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);
                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + +lastFP.getUrlHash()
                                + " File:" + dataFilePath);
                        // if so, null out last fp
                        lastFP = null;
                        // and update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;

                        continue;
                    }
                }

                // ok at this point .. read the next item in the list ...
                lastFP = null;

                while (reader.nextRaw(keyBytes, valueBytes) != -1) {
                    // init reader ...
                    keyReader.reset(keyBytes.getData(), keyBytes.getLength());
                    // read key
                    fpOut.readFields(keyReader);
                    // reset output buffer
                    keyBytes.reset();

                    // LOG.info("LIST:" + listId +" nextRaw Returned DH:" +
                    // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
                    // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());
                    // compare it to target ...
                    int result = fpOut.compareTo(targetFP);
                    // ok does it match .. ?
                    if (result == 0) {
                        // decompress value bytes ...
                        valueBytesUncompressed.reset();
                        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
                        // init valueReader
                        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
                        itemOut.readFields(valueReader);

                        LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash()
                                + " File:" + dataFilePath);
                        // update item state ...
                        targetList.updateItemState(targetFP, itemOut);

                        hitCount++;
                        // and break to outer loop
                        continue outerLoop;
                    } else if (result == 1) {
                        // LOG.info("LIST:" + listId +
                        // " FP Comparison Returned 1. Going to OuterLoop");
                        // update last FP
                        lastFP = fpOut;
                        // continue outer loop
                        continue outerLoop;
                    } else {
                        // otherwise skip
                    }
                }
                // ok if we got here .. we are done reading the sequence file and did
                // not find a trailing match
                LOG.warn("LIST:" + listId
                        + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positivie result (DomainHash:"
                        + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
                // break out of outer loop

                break;
            }
        }
    } finally {
        bloomFilterStream.close();

        if (reader != null) {
            reader.close();
        }

        LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
    }
}