Example usage for org.apache.hadoop.io DataOutputBuffer DataOutputBuffer

Introduction

On this page you can find example usage for org.apache.hadoop.io DataOutputBuffer DataOutputBuffer.

Prototype

public DataOutputBuffer() 

Document

Constructs a new empty buffer.
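
Before the full examples under Usage, here is a minimal sketch (not taken from the sources listed below; the class name is illustrative) showing the empty constructor together with getData(), getLength(), and reset():

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class DataOutputBufferExample {
    public static void main(String[] args) throws IOException {
        // construct a new empty, growable in-memory buffer
        DataOutputBuffer out = new DataOutputBuffer();
        out.writeInt(42);      // DataOutputBuffer implements DataOutput
        out.writeUTF("hello");

        // getData() returns the backing array; only the first getLength() bytes are valid
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), 0, out.getLength());
        System.out.println(in.readInt());  // prints 42
        System.out.println(in.readUTF());  // prints hello

        // rewind for reuse without reallocating the backing array
        out.reset();
    }
}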

Usage

From source file:org.commoncrawl.service.parser.client.Dispatcher.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CrawlEnvironment.setHadoopConfig(conf);
    String baseURL = "http://unknown.com/";
    if (args.length != 0) {
        baseURL = args[0];
    }
    URL baseURLObj;
    try {
        baseURLObj = new URL(baseURL);
    } catch (MalformedURLException e2) {
        throw new IOException("Invalid Base Link");
    }
    final URL finalBaseURL = baseURLObj; // capture for use in the anonymous classes below
    final DataOutputBuffer headerBuffer = new DataOutputBuffer();
    final DataOutputBuffer contentBuffer = new DataOutputBuffer();

    try {
        ByteStreams.readBytes(new InputSupplier<InputStream>() {

            @Override
            public InputStream getInput() throws IOException {
                return System.in;
            }
        }, new ByteProcessor<Long>() {

            @Override
            public Long getResult() {
                return 0L;
            }

            int currLineCharCount = 0;
            boolean processingHeaders = true;

            @Override
            public boolean processBytes(byte[] buf, int start, int length) throws IOException {

                if (processingHeaders) {
                    int current = start;
                    int end = current + length;
                    while (processingHeaders && current != end) {
                        if (buf[current] != '\r' && buf[current] != '\n') {
                            currLineCharCount++;
                        } else if (buf[current] == '\n') {
                            if (currLineCharCount == 0) {
                                headerBuffer.write(buf, start, current - start + 1);
                                processingHeaders = false;
                            }
                            currLineCharCount = 0;
                        }
                        current++;
                    }
                    if (processingHeaders) {
                        headerBuffer.write(buf, start, length);
                    } else {
                        length -= current - start;
                        start = current;
                    }
                }
                if (!processingHeaders) {
                    contentBuffer.write(buf, start, length);
                }
                return true;
            }
        });

        LOG.info("HEADER LEN:" + headerBuffer.getLength());
        // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
        LOG.info("CONTENT LEN:" + contentBuffer.getLength());
        //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
        // decode header bytes ... 
        String header = "";
        if (headerBuffer.getLength() != 0) {
            try {
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("UTF-8"));
            } catch (Exception e) {
                LOG.warn(CCStringUtils.stringifyException(e));
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("ASCII"));
            }
        }
        final String headersFinal = header; // capture for use in the worker threads

        LOG.info("Starting Event Loop");
        final EventLoop eventLoop = new EventLoop();
        eventLoop.start();

        try {
            // create fake hosts file ...  
            //String hosts = "10.0.20.101:8072";
            // reader 
            //Reader reader = new StringReader(hosts);
            // dispatcher init 
            LOG.info("initializing Dispatcher");
            final Dispatcher dispatcher = new Dispatcher(eventLoop, "parserNodes");
            LOG.info("Waiting for a few seconds");
            Thread.sleep(5000);
            Thread[] threads = new Thread[TEST_THREAD_COUNT];
            // initialize with -(TEST_THREAD_COUNT - 1) permits so the final acquire
            // succeeds only after all TEST_THREAD_COUNT threads have released
            final Semaphore threadWaitSem = new Semaphore(-(TEST_THREAD_COUNT - 1));
            // start TEST_THREAD_COUNT worker threads
            for (int threadIdx = 0; threadIdx < TEST_THREAD_COUNT; ++threadIdx) {
                threads[threadIdx] = new Thread(new Runnable() {

                    @Override
                    public void run() {
                        for (int i = 0; i < ITERATIONS_PER_THREAD; ++i) {
                            // build parse request 
                            ParseRequest request = new ParseRequest();
                            request.setDocId(1);
                            request.setDomainId(1);
                            request.setDocURL(finalBaseURL.toString());
                            request.setDocHeaders(headersFinal);
                            request.setDocContent(
                                    new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
                            //LOG.info("Dispatching parse request");
                            ParseResult result = dispatcher.dispatchRequest(request);
                            LOG.info("TID[" + Thread.currentThread().getId() + "]ReqID[" + i + "]" + " Success:"
                                    + ((result != null) ? result.getParseSuccessful() : false) + " LinkCount:"
                                    + ((result != null) ? result.getExtractedLinks().size() : 0));
                        }
                        LOG.info("Thread:" + Thread.currentThread().getId() + " Exiting");
                        threadWaitSem.release();
                    }

                });
                threads[threadIdx].start();
            }

            LOG.info("Waiting for threads to die");
            threadWaitSem.acquireUninterruptibly();
            LOG.info("All Threads dead.");

        } finally {
            eventLoop.stop();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    } catch (InterruptedException e) {
        // interrupted while waiting; exit quietly
    }
}

From source file:org.commoncrawl.service.parser.server.ParseWorker.java

License:Open Source License

public static void main(String[] args) throws IOException {
    String baseURL = "http://unknown.com/";
    if (args.length != 0) {
        baseURL = args[0];
    }
    URL baseURLObj;
    try {
        baseURLObj = new URL(baseURL);
    } catch (MalformedURLException e2) {
        throw new IOException("Invalid Base Link");
    }
    final DataOutputBuffer headerBuffer = new DataOutputBuffer();
    final DataOutputBuffer contentBuffer = new DataOutputBuffer();

    try {
        ByteStreams.readBytes(new InputSupplier<InputStream>() {

            @Override
            public InputStream getInput() throws IOException {
                return System.in;
            }
        }, new ByteProcessor<Long>() {

            @Override
            public Long getResult() {
                return 0L;
            }

            int currLineCharCount = 0;
            boolean processingHeaders = true;

            @Override
            public boolean processBytes(byte[] buf, int start, int length) throws IOException {

                if (processingHeaders) {
                    int current = start;
                    int end = current + length;
                    while (processingHeaders && current != end) {
                        if (buf[current] != '\r' && buf[current] != '\n') {
                            currLineCharCount++;
                        } else if (buf[current] == '\n') {
                            if (currLineCharCount == 0) {
                                headerBuffer.write(buf, start, current - start + 1);
                                processingHeaders = false;
                            }
                            currLineCharCount = 0;
                        }
                        current++;
                    }
                    if (processingHeaders) {
                        headerBuffer.write(buf, start, length);
                    } else {
                        length -= current - start;
                        start = current;
                    }
                }
                if (!processingHeaders) {
                    contentBuffer.write(buf, start, length);
                }
                return true;
            }
        });

        //LOG.info("HEADER LEN:" + headerBuffer.getLength());
        // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
        //LOG.info("CONTENT LEN:" + contentBuffer.getLength());
        //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
        // decode header bytes ... 
        String header = "";
        if (headerBuffer.getLength() != 0) {
            try {
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("UTF-8"));
            } catch (Exception e) {
                LOG.warn(CCStringUtils.stringifyException(e));
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("ASCII"));
            }
        }

        //LOG.info("Parsing Document");
        ParseWorker worker = new ParseWorker();
        ParseResult result = new ParseResult();
        worker.parseDocument(result, 0L, 0L, baseURLObj, header,
                new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
        //LOG.info("Parse Result:" + result.getParseSuccessful()); 
        //LOG.info("Parse Data:" + result.toString());  

    } catch (IOException e1) {
        e1.printStackTrace();
    }
}

From source file:org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex.java

License:Open Source License

public void seekReaderToItemAtIndex(SequenceFile.Reader reader, long desiredIndexPos) throws IOException {
    IndexItem indexItem = findIndexDataPosForItemIndex(desiredIndexPos);
    if (indexItem == null) {
        throw new IOException("Invalid Index Position:" + desiredIndexPos);
    }

    //LOG.info("Seeking to appropriate position in file");
    long timeStart = System.currentTimeMillis();
    reader.seek(indexItem._offsetValue);
    //LOG.info("Seek Took:" + (System.currentTimeMillis() - timeStart));

    // override write(DataInput, int) so nextRawKey() skips the key bytes
    // instead of copying them into the buffer
    DataOutputBuffer skipBuffer = new DataOutputBuffer() {
        @Override
        public void write(DataInput in, int length) throws IOException {
            in.skipBytes(length);
        }
    };

    timeStart = System.currentTimeMillis();

    int skipCount = 0;

    ValueBytes skipValue = reader.createValueBytes();

    long currentIndexPos = indexItem._indexValue;
    while (currentIndexPos < desiredIndexPos) {

        reader.nextRawKey(skipBuffer);
        reader.nextRawValue(skipValue);
        ++skipCount;
        ++currentIndexPos;
    }

    //LOG.info("Skip of:" + skipCount +" Values took:" + (System.currentTimeMillis() - timeStart));

}

From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java

License:Open Source License

static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
        long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

    File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
    tempFile.mkdir();

    try {
        // create the final output spill writer ...  
        SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
                outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
                true);

        try {

            MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
                    conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
                    new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);

            try {

                for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
                    // 0. shard domain id to find index file location ... 
                    int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE)
                            % CrawlEnvironment.NUM_DB_SHARDS);
                    // build path to index file 
                    Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                            + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));
                    LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
                            + indexFilePath);
                    // 1. scan domainFP to index file first
                    // 2. given index, scan index->pos file to find scan start position
                    // 3. given scan start position, scan forward until fp match is found.
                    // 4. collect all matching entries and output to a file ? 

                    FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
                    try {
                        TFile.Reader reader = new TFile.Reader(indexDataInputStream,
                                fs.getLength(indexFilePath), conf);
                        try {
                            TFile.Reader.Scanner scanner = reader.createScanner();

                            try {
                                // generate key ... 
                                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                                keyBuffer.writeLong(targetRootDomainFP);
                                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                                    // setup for value scan 
                                    DataInputStream valueStream = scanner.entry().getValueStream();
                                    int dataOffsetOut = -1;
                                    while (valueStream.available() > 0) {
                                        // read entries looking for our specific entry
                                        int shardIdx = valueStream.readInt();
                                        int dataOffset = valueStream.readInt();
                                        if (shardIdx == targetShardId) {
                                            dataOffsetOut = dataOffset;
                                            break;
                                        }
                                    }
                                    LOG.info("Index Search Yielded:" + dataOffsetOut);
                                    if (dataOffsetOut != -1) {
                                        // ok create a data path 
                                        Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId));
                                        Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId) + ".index");
                                        // check to see if index is already loaded ... 
                                        PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                                        synchronized (_shardToIndexMap) {
                                            index = _shardToIndexMap.get(targetShardId);
                                        }
                                        if (index == null) {
                                            LOG.info("Loading Index from Path:" + finalDataIndexPath);
                                            // load index
                                            index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(
                                                    fs, finalDataIndexPath, FlexBuffer.class, TextBytes.class);
                                            // put in cache
                                            synchronized (_shardToIndexMap) {
                                                _shardToIndexMap.put(targetShardId, index);
                                            }
                                        }

                                        LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                                        // ok time to create a reader 
                                        SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,
                                                finalDataPath, conf);

                                        try {
                                            LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                                            index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);

                                            FlexBuffer keyBytes = new FlexBuffer();
                                            URLFPV2 sourceFP = new URLFPV2();
                                            DataInputBuffer keyReader = new DataInputBuffer();
                                            TextBytes urlTxt = new TextBytes();

                                            // ok read to go ... 
                                            while (dataReader.next(keyBytes, sourceFP)) {
                                                // initialize reader 
                                                keyReader.reset(keyBytes.get(), keyBytes.getOffset(),
                                                        keyBytes.getCount());

                                                long targetFP = keyReader.readLong();

                                                if (targetRootDomainFP == targetFP) {
                                                    finalMerger.spillRecord(keyBytes, sourceFP);
                                                } else {
                                                    LOG.info("FP:" + targetFP + " > TargetFP:"
                                                            + targetRootDomainFP + " Exiting Iteration Loop");
                                                    break;
                                                }
                                            }
                                        } finally {
                                            LOG.info("Closing Reader");
                                            dataReader.close();
                                        }
                                    }
                                }
                            } finally {
                                LOG.info("Closing Scanner");
                                scanner.close();
                            }

                        } finally {
                            LOG.info("Closing TFile Reader");
                            reader.close();
                        }
                    } finally {
                        LOG.info("Closing InputStream");
                        indexDataInputStream.close();
                    }
                }
            } finally {
                finalMerger.close();
            }
        } finally {
            spillwriter.close();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        FileUtils.recursivelyDeleteFile(tempFile);
    }

}

From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java

License:Open Source License

private long runInlinksLocalQuery(DatabaseIndexV2.MasterDatabaseIndex index, FileSystem inputFileSystem,
        Path inlinksInputPath, FileSystem outputFileSystem, Path inlinksDomainIndexPath,
        Path inlinksDetailOutputPath) throws IOException {

    long recordCount = 0L;

    outputFileSystem.delete(inlinksDomainIndexPath);
    outputFileSystem.delete(inlinksDetailOutputPath);

    FSDataInputStream remoteInputStream = inputFileSystem.open(inlinksInputPath);

    try {

        FSDataOutputStream indexOutputStream = outputFileSystem.create(inlinksDomainIndexPath);
        FSDataOutputStream detailOutputStream = outputFileSystem.create(inlinksDetailOutputPath);

        ArrayList<InlinkingDomainInfo> domainList = new ArrayList<InlinkingDomainInfo>();

        try {

            LOG.info("Writing Detail Stream to:" + inlinksDetailOutputPath);
            CompressedURLFPListV2.Reader reader = new CompressedURLFPListV2.Reader(remoteInputStream);

            InlinkingDomainInfo lastDomain = null;

            while (reader.hasNext()) {

                // read the next fingerprint
                URLFPV2 fingerprint = reader.next();
                // and first see if we have a domain transition 
                if (lastDomain == null || lastDomain.getDomainId() != fingerprint.getDomainHash()) {
                    // remember the domain 
                    lastDomain = new InlinkingDomainInfo();
                    lastDomain.setDomainId(fingerprint.getDomainHash());
                    // add it to the list 
                    domainList.add(lastDomain);
                    // update data position
                    lastDomain.setUrlDataPos(detailOutputStream.getPos());
                }
                // increment url count for the domain
                lastDomain.setUrlCount(lastDomain.getUrlCount() + 1);

                detailOutputStream.writeLong(fingerprint.getDomainHash());
                detailOutputStream.writeLong(fingerprint.getUrlHash());

                recordCount++;
            }

            LOG.info("Retrieving Domain Metadata for :" + domainList.size() + " Domain Records");
            // ok, now resolve domain names
            for (InlinkingDomainInfo domain : domainList) {
                SubDomainMetadata metadata = index.queryDomainMetadataGivenDomainId(domain.getDomainId());
                if (metadata == null) {
                    LOG.error("*** Failed to Resolve DomainId:" + domain.getDomainId());
                } else {
                    if (metadata.getDomainText().length() == 0) {
                        LOG.error("*** Metadata for Domain Id:" + domain.getDomainId()
                                + " contained NULL Name Value.");
                        domain.setDomainName("_ERROR:BAD RECORD");
                    } else {
                        domain.setDomainName(metadata.getDomainText());
                    }
                    //LOG.info("***Found Domain:" + domain.getDomainName() + " urlCount:" + domain.getUrlCount());
                }
            }

            LOG.info("Sorting Domain List of Size:" + domainList.size());
            // ok sort by domain name 
            Collections.sort(domainList);

            LOG.info("Building In Memory Index");

            // ok write out domain info
            DataOutputBuffer indexHeaderBuffer = new DataOutputBuffer();
            DataOutputBuffer indexDataBuffer = new DataOutputBuffer();

            LOG.info("***Writing Domain List Size:" + domainList.size());
            indexHeaderBuffer.writeInt(domainList.size());

            // ok iterate and write to both buffers  
            for (InlinkingDomainInfo domain : domainList) {
                indexHeaderBuffer.writeInt(indexDataBuffer.getLength());
                domain.write(indexDataBuffer);
            }

            LOG.info("Writing Index to:" + inlinksDomainIndexPath + " IndexHeaderLength:"
                    + indexHeaderBuffer.getLength() + " IndexDataLength:" + indexDataBuffer.getLength());
            // ok now flush both buffers to disk
            indexOutputStream.write(indexHeaderBuffer.getData(), 0, indexHeaderBuffer.getLength());
            indexOutputStream.write(indexDataBuffer.getData(), 0, indexDataBuffer.getLength());
        } finally {
            indexOutputStream.flush();
            indexOutputStream.close();
            detailOutputStream.flush();
            detailOutputStream.close();
        }
    } finally {
        remoteInputStream.close();
    }
    return recordCount;
}

From source file:org.commoncrawl.util.CrawlLogSplitter.java

License:Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file:org.commoncrawl.util.JoinByTextSortByTagMapper.java

License:Open Source License

public static void main(String[] args) throws IOException {
    DataOutputBuffer outputBuffer = new DataOutputBuffer();
    TextBytes textKey = new TextBytes("test");
    TextBytes tagValue = new TextBytes("tag");
    TextBytes textOut = new TextBytes();

    makeCompositeKey(outputBuffer, textKey, tagValue, textOut);
    System.out.println("CompositeKey:" + textOut.toString());

    TextBytes textKeyOut = new TextBytes();
    TextBytes tagValueOut = new TextBytes();

    getKeyFromCompositeKey(textOut, textKeyOut);
    getTagFromCompositeKey(textOut, tagValueOut);

    Assert.assertTrue(textKey.compareTo(textKeyOut) == 0);
    Assert.assertTrue(tagValue.compareTo(tagValueOut) == 0);
}

From source file:org.commoncrawl.util.MultiFileMergeUtils.java

License:Open Source License

static void scanToItemThenDisplayNext(FileSystem fs, Path path, Configuration conf, URLFPV2 targetItem)
        throws IOException {
    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    ValueBytes valueBytes = reader.createValueBytes();

    int i = 0;
    while (reader.nextRawKey(rawKey) != -1) {
        URLFPV2 keyObject = new URLFPV2();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        keyObject.readFields(keyDataStream);
        rawKey.reset();
        reader.nextRawValue(valueBytes);

        if (keyObject.compareTo(targetItem) == 0) {

            reader.nextRawKey(rawKey);
            URLFPV2 nextKeyObject = new URLFPV2();
            keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
            nextKeyObject.readFields(keyDataStream);
            LOG.info("Target Domain:" + targetItem.getDomainHash() + " FP:" + targetItem.getUrlHash()
                    + " NextDomain:" + nextKeyObject.getDomainHash() + " NextHash:"
                    + nextKeyObject.getUrlHash());
            break;
        }
    }
    reader.close();
}

From source file:org.commoncrawl.util.MultiFileMergeUtils.java

License:Open Source License

static void addFirstNFPItemsToSet(FileSystem fs, Path path, Configuration conf, Set<URLFPV2> outputSet,
        int nItems) throws IOException {
    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    ValueBytes valueBytes = reader.createValueBytes();

    int i = 0;
    while (reader.nextRawKey(rawKey) != -1) {
        URLFPV2 keyObject = new URLFPV2();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        keyObject.readFields(keyDataStream);
        outputSet.add(keyObject);
        rawKey.reset();
        reader.nextRawValue(valueBytes);

        if (++i == nItems) {
            break;
        }
    }
    reader.close();
}

From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java

License:Apache License

/**
 * Test basic reader functionality by creating a mock ARC file in memory,
 * then reading it back and validating the contents.
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory 
        writeFirstRecord(os, "test", timestamp);
        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }

            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file 
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

            // force single-byte reads to exercise the reader's incremental parsing path
            public synchronized int read(byte b[], int off, int len) {
                len = 1;
                return super.read(b, off, len);
            }

            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;
        Text key = new Text();
        BytesWritable value = new BytesWritable();

        // iterate and validate stuff ... 
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            TestRecord testRecord = records.get(index++);
            // get test key bytes as utf-8 bytes ... 
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
            // invalid characters with '?', which would break our test case, since it deliberately uses invalid
            // characters to form the key)
            Assert.assertTrue(
                    compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes represent the header (encoded in UTF-8), terminated by \r\n\r\n; the content follows
            // this terminator. We search for this byte pattern to locate the start of content, then compare it
            // against the source ...
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            indexofHeaderTerminator += 4;
            // read headers ... 
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
                    Charset.forName("UTF-8"));
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }

            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                    indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}