Example usage for org.apache.hadoop.io DataOutputBuffer DataOutputBuffer

Introduction

On this page you can find example usage for org.apache.hadoop.io DataOutputBuffer DataOutputBuffer.

Prototype

public DataOutputBuffer() 

Document

Constructs a new empty buffer.
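
Before the full examples under Usage, here is a minimal sketch (not taken from the sources listed below; the class name is illustrative) showing the empty constructor together with getData(), getLength(), and reset():

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class DataOutputBufferExample {
    public static void main(String[] args) throws IOException {
        // construct a new empty, growable in-memory buffer
        DataOutputBuffer out = new DataOutputBuffer();
        out.writeInt(42);      // DataOutputBuffer implements DataOutput
        out.writeUTF("hello");

        // getData() returns the backing array; only the first getLength() bytes are valid
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), 0, out.getLength());
        System.out.println(in.readInt());  // prints 42
        System.out.println(in.readUTF());  // prints hello

        // rewind for reuse without reallocating the backing array
        out.reset();
    }
}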

Usage

From source file:org.commoncrawl.service.parser.client.Dispatcher.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CrawlEnvironment.setHadoopConfig(conf);
    String baseURL = "http://unknown.com/";
    if (args.length != 0) {
        baseURL = args[0];
    }
    URL baseURLObj;
    try {
        baseURLObj = new URL(baseURL);
    } catch (MalformedURLException e2) {
        throw new IOException("Invalid Base Link");
    }
    final URL finalBaseURL = baseURLObj; // capture for use in the anonymous classes below
    final DataOutputBuffer headerBuffer = new DataOutputBuffer();
    final DataOutputBuffer contentBuffer = new DataOutputBuffer();

    try {
        ByteStreams.readBytes(new InputSupplier<InputStream>() {

            @Override
            public InputStream getInput() throws IOException {
                return System.in;
            }
        }, new ByteProcessor<Long>() {

            @Override
            public Long getResult() {
                return 0L;
            }

            int currLineCharCount = 0;
            boolean processingHeaders = true;

            @Override
            public boolean processBytes(byte[] buf, int start, int length) throws IOException {

                if (processingHeaders) {
                    int current = start;
                    int end = current + length;
                    while (processingHeaders && current != end) {
                        if (buf[current] != '\r' && buf[current] != '\n') {
                            currLineCharCount++;
                        } else if (buf[current] == '\n') {
                            if (currLineCharCount == 0) {
                                headerBuffer.write(buf, start, current - start + 1);
                                processingHeaders = false;
                            }
                            currLineCharCount = 0;
                        }
                        current++;
                    }
                    if (processingHeaders) {
                        headerBuffer.write(buf, start, length);
                    } else {
                        length -= current - start;
                        start = current;
                    }
                }
                if (!processingHeaders) {
                    contentBuffer.write(buf, start, length);
                }
                return true;
            }
        });

        LOG.info("HEADER LEN:" + headerBuffer.getLength());
        // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
        LOG.info("CONTENT LEN:" + contentBuffer.getLength());
        //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
        // decode header bytes ... 
        String header = "";
        if (headerBuffer.getLength() != 0) {
            try {
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("UTF-8"));
            } catch (Exception e) {
                LOG.warn(CCStringUtils.stringifyException(e));
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("ASCII"));
            }
        }
        final String headersFinal = header; // capture for use in the worker threads

        LOG.info("Starting Event Loop");
        final EventLoop eventLoop = new EventLoop();
        eventLoop.start();

        try {
            // create fake hosts file ...  
            //String hosts = "10.0.20.101:8072";
            // reader 
            //Reader reader = new StringReader(hosts);
            // dispatcher init 
            LOG.info("initializing Dispatcher");
            final Dispatcher dispatcher = new Dispatcher(eventLoop, "parserNodes");
            LOG.info("Waiting for a few seconds");
            Thread.sleep(5000);
            Thread[] threads = new Thread[TEST_THREAD_COUNT];
            // initialize with -(TEST_THREAD_COUNT - 1) permits so the final acquire
            // succeeds only after all TEST_THREAD_COUNT threads have released
            final Semaphore threadWaitSem = new Semaphore(-(TEST_THREAD_COUNT - 1));
            // start TEST_THREAD_COUNT worker threads
            for (int threadIdx = 0; threadIdx < TEST_THREAD_COUNT; ++threadIdx) {
                threads[threadIdx] = new Thread(new Runnable() {

                    @Override
                    public void run() {
                        for (int i = 0; i < ITERATIONS_PER_THREAD; ++i) {
                            // build parse request 
                            ParseRequest request = new ParseRequest();
                            request.setDocId(1);
                            request.setDomainId(1);
                            request.setDocURL(finalBaseURL.toString());
                            request.setDocHeaders(headersFinal);
                            request.setDocContent(
                                    new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
                            //LOG.info("Dispatching parse request");
                            ParseResult result = dispatcher.dispatchRequest(request);
                            LOG.info("TID[" + Thread.currentThread().getId() + "]ReqID[" + i + "]" + " Success:"
                                    + ((result != null) ? result.getParseSuccessful() : false) + " LinkCount:"
                                    + ((result != null) ? result.getExtractedLinks().size() : 0));
                        }
                        LOG.info("Thread:" + Thread.currentThread().getId() + " Exiting");
                        threadWaitSem.release();
                    }

                });
                threads[threadIdx].start();
            }

            LOG.info("Waiting for threads to die");
            threadWaitSem.acquireUninterruptibly();
            LOG.info("All Threads dead.");

        } finally {
            eventLoop.stop();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    } catch (InterruptedException e) {
        // interrupted while waiting; exit quietly
    }
}

From source file:org.commoncrawl.service.parser.server.ParseWorker.java

License:Open Source License

public static void main(String[] args) throws IOException {
    String baseURL = "http://unknown.com/";
    if (args.length != 0) {
        baseURL = args[0];
    }
    URL baseURLObj;
    try {
        baseURLObj = new URL(baseURL);
    } catch (MalformedURLException e2) {
        throw new IOException("Invalid Base Link");
    }
    final DataOutputBuffer headerBuffer = new DataOutputBuffer();
    final DataOutputBuffer contentBuffer = new DataOutputBuffer();

    try {
        ByteStreams.readBytes(new InputSupplier<InputStream>() {

            @Override
            public InputStream getInput() throws IOException {
                return System.in;
            }
        }, new ByteProcessor<Long>() {

            @Override
            public Long getResult() {
                return 0L;
            }

            int currLineCharCount = 0;
            boolean processingHeaders = true;

            @Override
            public boolean processBytes(byte[] buf, int start, int length) throws IOException {

                if (processingHeaders) {
                    int current = start;
                    int end = current + length;
                    while (processingHeaders && current != end) {
                        if (buf[current] != '\r' && buf[current] != '\n') {
                            currLineCharCount++;
                        } else if (buf[current] == '\n') {
                            if (currLineCharCount == 0) {
                                headerBuffer.write(buf, start, current - start + 1);
                                processingHeaders = false;
                            }
                            currLineCharCount = 0;
                        }
                        current++;
                    }
                    if (processingHeaders) {
                        headerBuffer.write(buf, start, length);
                    } else {
                        length -= current - start;
                        start = current;
                    }
                }
                if (!processingHeaders) {
                    contentBuffer.write(buf, start, length);
                }
                return true;
            }
        });

        //LOG.info("HEADER LEN:" + headerBuffer.getLength());
        // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
        //LOG.info("CONTENT LEN:" + contentBuffer.getLength());
        //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
        // decode header bytes ... 
        String header = "";
        if (headerBuffer.getLength() != 0) {
            try {
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("UTF-8"));
            } catch (Exception e) {
                LOG.warn(CCStringUtils.stringifyException(e));
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("ASCII"));
            }
        }

        //LOG.info("Parsing Document");
        ParseWorker worker = new ParseWorker();
        ParseResult result = new ParseResult();
        worker.parseDocument(result, 0L, 0L, baseURLObj, header,
                new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
        //LOG.info("Parse Result:" + result.getParseSuccessful()); 
        //LOG.info("Parse Data:" + result.toString());  

    } catch (IOException e1) {
        e1.printStackTrace();
    }
}

From source file:org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex.java

License:Open Source License

public void seekReaderToItemAtIndex(SequenceFile.Reader reader, long desiredIndexPos) throws IOException {
    IndexItem indexItem = findIndexDataPosForItemIndex(desiredIndexPos);
    if (indexItem == null) {
        throw new IOException("Invalid Index Position:" + desiredIndexPos);
    }

    //LOG.info("Seeking to appropriate position in file");
    long timeStart = System.currentTimeMillis();
    reader.seek(indexItem._offsetValue);
    //LOG.info("Seek Took:" + (System.currentTimeMillis() - timeStart));

    // override write(DataInput, int) so nextRawKey() skips the key bytes
    // instead of copying them into the buffer
    DataOutputBuffer skipBuffer = new DataOutputBuffer() {
        @Override
        public void write(DataInput in, int length) throws IOException {
            in.skipBytes(length);
        }
    };

    timeStart = System.currentTimeMillis();

    int skipCount = 0;

    ValueBytes skipValue = reader.createValueBytes();

    long currentIndexPos = indexItem._indexValue;
    while (currentIndexPos < desiredIndexPos) {

        reader.nextRawKey(skipBuffer);
        reader.nextRawValue(skipValue);
        ++skipCount;
        ++currentIndexPos;
    }

    //LOG.info("Skip of:" + skipCount +" Values took:" + (System.currentTimeMillis() - timeStart));

}

From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java

License:Open Source License

static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
        long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

    File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
    tempFile.mkdir();

    try {
        // create the final output spill writer ...  
        SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
                outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
                true);

        try {

            MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
                    conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
                    new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);

            try {

                for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
                    // 0. shard domain id to find index file location ... 
                    int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE)
                            % CrawlEnvironment.NUM_DB_SHARDS);
                    // build path to index file 
                    Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                            + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));
                    LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
                            + indexFilePath);
                    // 1. scan domainFP to index file first
                    // 2. given index, scan index->pos file to find scan start position
                    // 3. given scan start position, scan forward until fp match is found.
                    // 4. collect all matching entries and output to a file ? 

                    FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
                    try {
                        TFile.Reader reader = new TFile.Reader(indexDataInputStream,
                                fs.getLength(indexFilePath), conf);
                        try {
                            TFile.Reader.Scanner scanner = reader.createScanner();

                            try {
                                // generate key ... 
                                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                                keyBuffer.writeLong(targetRootDomainFP);
                                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                                    // setup for value scan 
                                    DataInputStream valueStream = scanner.entry().getValueStream();
                                    int dataOffsetOut = -1;
                                    while (valueStream.available() > 0) {
                                        // read entries looking for our specific entry
                                        int shardIdx = valueStream.readInt();
                                        int dataOffset = valueStream.readInt();
                                        if (shardIdx == targetShardId) {
                                            dataOffsetOut = dataOffset;
                                            break;
                                        }
                                    }
                                    LOG.info("Index Search Yielded:" + dataOffsetOut);
                                    if (dataOffsetOut != -1) {
                                        // ok create a data path 
                                        Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId));
                                        Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId) + ".index");
                                        // check to see if index is already loaded ... 
                                        PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                                        synchronized (_shardToIndexMap) {
                                            index = _shardToIndexMap.get(targetShardId);
                                        }
                                        if (index == null) {
                                            LOG.info("Loading Index from Path:" + finalDataIndexPath);
                                            // load index
                                            index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(
                                                    fs, finalDataIndexPath, FlexBuffer.class, TextBytes.class);
                                            // put in cache
                                            synchronized (_shardToIndexMap) {
                                                _shardToIndexMap.put(targetShardId, index);
                                            }
                                        }

                                        LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                                        // ok time to create a reader 
                                        SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,
                                                finalDataPath, conf);

                                        try {
                                            LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                                            index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);

                                            FlexBuffer keyBytes = new FlexBuffer();
                                            URLFPV2 sourceFP = new URLFPV2();
                                            DataInputBuffer keyReader = new DataInputBuffer();
                                            TextBytes urlTxt = new TextBytes();

                                            // ok read to go ... 
                                            while (dataReader.next(keyBytes, sourceFP)) {
                                                // initialize reader 
                                                keyReader.reset(keyBytes.get(), keyBytes.getOffset(),
                                                        keyBytes.getCount());

                                                long targetFP = keyReader.readLong();

                                                if (targetRootDomainFP == targetFP) {
                                                    finalMerger.spillRecord(keyBytes, sourceFP);
                                                } else {
                                                    LOG.info("FP:" + targetFP + " > TargetFP:"
                                                            + targetRootDomainFP + " Exiting Iteration Loop");
                                                    break;
                                                }
                                            }
                                        } finally {
                                            LOG.info("Closing Reader");
                                            dataReader.close();
                                        }
                                    }
                                }
                            } finally {
                                LOG.info("Closing Scanner");
                                scanner.close();
                            }

                        } finally {
                            LOG.info("Closing TFile Reader");
                            reader.close();
                        }
                    } finally {
                        LOG.info("Closing InputStream");
                        indexDataInputStream.close();
                    }
                }
            } finally {
                finalMerger.close();
            }
        } finally {
            spillwriter.close();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        FileUtils.recursivelyDeleteFile(tempFile);
    }

}

From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java

License:Open Source License

private long runInlinksLocalQuery(DatabaseIndexV2.MasterDatabaseIndex index, FileSystem inputFileSystem,
        Path inlinksInputPath, FileSystem outputFileSystem, Path inlinksDomainIndexPath,
        Path inlinksDetailOutputPath) throws IOException {

    long recordCount = 0L;

    outputFileSystem.delete(inlinksDomainIndexPath);
    outputFileSystem.delete(inlinksDetailOutputPath);

    FSDataInputStream remoteInputStream = inputFileSystem.open(inlinksInputPath);

    try {

        FSDataOutputStream indexOutputStream = outputFileSystem.create(inlinksDomainIndexPath);
        FSDataOutputStream detailOutputStream = outputFileSystem.create(inlinksDetailOutputPath);

        ArrayList<InlinkingDomainInfo> domainList = new ArrayList<InlinkingDomainInfo>();

        try {

            LOG.info("Writing Detail Stream to:" + inlinksDetailOutputPath);
            CompressedURLFPListV2.Reader reader = new CompressedURLFPListV2.Reader(remoteInputStream);

            InlinkingDomainInfo lastDomain = null;

            while (reader.hasNext()) {

                // read the next fingerprint
                URLFPV2 fingerprint = reader.next();
                // and first see if we have a domain transition 
                if (lastDomain == null || lastDomain.getDomainId() != fingerprint.getDomainHash()) {
                    // remember the domain 
                    lastDomain = new InlinkingDomainInfo();
                    lastDomain.setDomainId(fingerprint.getDomainHash());
                    // add it to the list 
                    domainList.add(lastDomain);
                    // update data position
                    lastDomain.setUrlDataPos(detailOutputStream.getPos());
                }
                // increment url count for the domain
                lastDomain.setUrlCount(lastDomain.getUrlCount() + 1);

                detailOutputStream.writeLong(fingerprint.getDomainHash());
                detailOutputStream.writeLong(fingerprint.getUrlHash());

                recordCount++;
            }

            LOG.info("Retrieving Domain Metadata for :" + domainList.size() + " Domain Records");
            // ok, now resolve domain names
            for (InlinkingDomainInfo domain : domainList) {
                SubDomainMetadata metadata = index.queryDomainMetadataGivenDomainId(domain.getDomainId());
                if (metadata == null) {
                    LOG.error("*** Failed to Resolve DomainId:" + domain.getDomainId());
                } else {
                    if (metadata.getDomainText().length() == 0) {
                        LOG.error("*** Metadata for Domain Id:" + domain.getDomainId()
                                + " contained NULL Name Value.");
                        domain.setDomainName("_ERROR:BAD RECORD");
                    } else {
                        domain.setDomainName(metadata.getDomainText());
                    }
                    //LOG.info("***Found Domain:" + domain.getDomainName() + " urlCount:" + domain.getUrlCount());
                }
            }

            LOG.info("Sorting Domain List of Size:" + domainList.size());
            // ok sort by domain name 
            Collections.sort(domainList);

            LOG.info("Building In Memory Index");

            // ok write out domain info
            DataOutputBuffer indexHeaderBuffer = new DataOutputBuffer();
            DataOutputBuffer indexDataBuffer = new DataOutputBuffer();

            LOG.info("***Writing Domain List Size:" + domainList.size());
            indexHeaderBuffer.writeInt(domainList.size());

            // ok iterate and write to both buffers  
            for (InlinkingDomainInfo domain : domainList) {
                indexHeaderBuffer.writeInt(indexDataBuffer.getLength());
                domain.write(indexDataBuffer);
            }

            LOG.info("Writing Index to:" + inlinksDomainIndexPath + " IndexHeaderLength:"
                    + indexHeaderBuffer.getLength() + " IndexDataLength:" + indexDataBuffer.getLength());
            // ok now flush both buffers to disk
            indexOutputStream.write(indexHeaderBuffer.getData(), 0, indexHeaderBuffer.getLength());
            indexOutputStream.write(indexDataBuffer.getData(), 0, indexDataBuffer.getLength());
        } finally {
            indexOutputStream.flush();
            indexOutputStream.close();
            detailOutputStream.flush();
            detailOutputStream.close();
        }
    } finally {
        remoteInputStream.close();
    }
    return recordCount;
}

From source file:org.commoncrawl.util.CrawlLogSplitter.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file:org.commoncrawl.util.JoinByTextSortByTagMapper.java

License:Open Source License

public static void main(String[] args) throws IOException {
    DataOutputBuffer outputBuffer = new DataOutputBuffer();
    TextBytes textKey = new TextBytes("test");
    TextBytes tagValue = new TextBytes("tag");
    TextBytes textOut = new TextBytes();

    makeCompositeKey(outputBuffer, textKey, tagValue, textOut);
    System.out.println("CompositeKey:" + textOut.toString());

    TextBytes textKeyOut = new TextBytes();
    TextBytes tagValueOut = new TextBytes();

    getKeyFromCompositeKey(textOut, textKeyOut);
    getTagFromCompositeKey(textOut, tagValueOut);

    Assert.assertTrue(textKey.compareTo(textKeyOut) == 0);
    Assert.assertTrue(tagValue.compareTo(tagValueOut) == 0);
}

From source file:org.commoncrawl.util.MultiFileMergeUtils.java

License:Open Source License

static void scanToItemThenDisplayNext(FileSystem fs, Path path, Configuration conf, URLFPV2 targetItem)
        throws IOException {
    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    ValueBytes valueBytes = reader.createValueBytes();

    int i = 0;
    while (reader.nextRawKey(rawKey) != -1) {
        URLFPV2 keyObject = new URLFPV2();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        keyObject.readFields(keyDataStream);
        rawKey.reset();
        reader.nextRawValue(valueBytes);

        if (keyObject.compareTo(targetItem) == 0) {

            reader.nextRawKey(rawKey);
            URLFPV2 nextKeyObject = new URLFPV2();
            keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
            nextKeyObject.readFields(keyDataStream);
            LOG.info("Target Domain:" + targetItem.getDomainHash() + " FP:" + targetItem.getUrlHash()
                    + " NextDomain:" + nextKeyObject.getDomainHash() + " NextHash:"
                    + nextKeyObject.getUrlHash());
            break;
        }
    }
    reader.close();
}

From source file:org.commoncrawl.util.MultiFileMergeUtils.java

License:Open Source License

static void addFirstNFPItemsToSet(FileSystem fs, Path path, Configuration conf, Set<URLFPV2> outputSet,
        int nItems) throws IOException {
    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    ValueBytes valueBytes = reader.createValueBytes();

    int i = 0;
    while (reader.nextRawKey(rawKey) != -1) {
        URLFPV2 keyObject = new URLFPV2();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        keyObject.readFields(keyDataStream);
        outputSet.add(keyObject);
        rawKey.reset();
        reader.nextRawValue(valueBytes);

        if (++i == nItems) {
            break;
        }
    }
    reader.close();
}

From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java

License:Apache License

/**
 * Test basic reader functionality by creating a mock ARC file in memory,
 * then reading it back and validating the contents.
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory 
        writeFirstRecord(os, "test", timestamp);
        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }

            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file 
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

            // force single-byte reads to exercise the reader's incremental parsing path
            public synchronized int read(byte b[], int off, int len) {
                len = 1;
                return super.read(b, off, len);
            }

            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;
        Text key = new Text();
        BytesWritable value = new BytesWritable();

        // iterate and validate stuff ... 
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            TestRecord testRecord = records.get(index++);
            // get test key bytes as utf-8 bytes ... 
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
            // invalid characters with '?', which would break our test case, since it deliberately uses invalid
            // characters to form the key)
            Assert.assertTrue(
                    compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
            // this terminator. We search for this byte pattern to locate the start of content, then compare it
            // against the source ...
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            indexofHeaderTerminator += 4;
            // read headers ... 
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
                    Charset.forName("UTF-8"));
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }

            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                    indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}