List of usage examples for the org.apache.hadoop.io.DataOutputBuffer constructor
public DataOutputBuffer()
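DataOutputBuffer is Hadoop's reusable, in-memory DataOutputStream: callers write primitives or raw bytes into it, hand the backing array to consumers via getData()/getLength(), and call reset() to reuse the allocation. Before the real-world examples below, here is a minimal sketch of the round trip with DataInputBuffer; the class and method names are the real Hadoop API, while the values written are purely illustrative:

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class DataOutputBufferSketch {
  public static void main(String[] args) throws Exception {
    DataOutputBuffer out = new DataOutputBuffer();
    out.writeLong(42L);      // any DataOutput method works
    out.writeUTF("hello");

    // wrap the backing array for reading; always pair getData() with getLength(),
    // since the underlying array is usually larger than the valid byte count
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());
    System.out.println(in.readLong()); // 42
    System.out.println(in.readUTF());  // hello

    out.reset(); // reuse the buffer without reallocating
  }
}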
From source file:org.commoncrawl.service.parser.client.Dispatcher.java
License:Open Source License
public static void main(String[] args) throws IOException {
  Configuration conf = new Configuration();
  CrawlEnvironment.setHadoopConfig(conf);

  String baseURL = "http://unknown.com/";
  if (args.length != 0) {
    baseURL = args[0];
  }

  URL baseURLObj;
  try {
    baseURLObj = new URL(baseURL);
  } catch (MalformedURLException e2) {
    throw new IOException("Invalid Base Link");
  }
  final URL finalBaseURL = baseURLObj;

  final DataOutputBuffer headerBuffer = new DataOutputBuffer();
  final DataOutputBuffer contentBuffer = new DataOutputBuffer();

  try {
    // split stdin into HTTP headers and content; headers end at the first empty line
    ByteStreams.readBytes(new InputSupplier<InputStream>() {
      @Override
      public InputStream getInput() throws IOException {
        return System.in;
      }
    }, new ByteProcessor<Long>() {

      int currLineCharCount = 0;
      boolean processingHeaders = true;

      @Override
      public Long getResult() {
        return 0L;
      }

      @Override
      public boolean processBytes(byte[] buf, int start, int length) throws IOException {
        if (processingHeaders) {
          int current = start;
          int end = current + length;
          while (processingHeaders && current != end) {
            if (buf[current] != '\r' && buf[current] != '\n') {
              currLineCharCount++;
            } else if (buf[current] == '\n') {
              // an empty line terminates the header block
              if (currLineCharCount == 0) {
                headerBuffer.write(buf, start, current - start + 1);
                processingHeaders = false;
              }
              currLineCharCount = 0;
            }
            current++;
          }
          if (processingHeaders) {
            headerBuffer.write(buf, start, length);
          } else {
            length -= current - start;
            start = current;
          }
        }
        if (!processingHeaders) {
          contentBuffer.write(buf, start, length);
        }
        return true;
      }
    });

    LOG.info("HEADER LEN:" + headerBuffer.getLength());
    LOG.info("CONTENT LEN:" + contentBuffer.getLength());

    // decode header bytes, falling back to ASCII if UTF-8 decoding fails ...
    String header = "";
    if (headerBuffer.getLength() != 0) {
      try {
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("UTF-8"));
      } catch (Exception e) {
        LOG.warn(CCStringUtils.stringifyException(e));
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("ASCII"));
      }
    }
    final String headersFinal = (header != null) ? header : "";

    LOG.info("Starting Event Loop");
    final EventLoop eventLoop = new EventLoop();
    eventLoop.start();

    try {
      // dispatcher init
      LOG.info("initializing Dispatcher");
      final Dispatcher dispatcher = new Dispatcher(eventLoop, "parserNodes");
      LOG.info("Waiting for a few seconds");
      Thread.sleep(5000);

      Thread threads[] = new Thread[TEST_THREAD_COUNT];
      // negative initial permits: the acquire below unblocks only once every worker
      // thread has called release(). The minified source read "-TEST_THREAD_COUNT - 1",
      // which could never unblock, so the parenthesized form is almost certainly the intent.
      final Semaphore threadWaitSem = new Semaphore(-(TEST_THREAD_COUNT - 1));

      // start the test threads
      for (int threadIdx = 0; threadIdx < TEST_THREAD_COUNT; ++threadIdx) {
        threads[threadIdx] = new Thread(new Runnable() {
          @Override
          public void run() {
            for (int i = 0; i < ITERATIONS_PER_THREAD; ++i) {
              // build parse request
              ParseRequest request = new ParseRequest();
              request.setDocId(1);
              request.setDomainId(1);
              request.setDocURL(finalBaseURL.toString());
              request.setDocHeaders(headersFinal);
              request.setDocContent(new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));

              ParseResult result = dispatcher.dispatchRequest(request);

              LOG.info("TID[" + Thread.currentThread().getId() + "]ReqID[" + i + "]"
                  + " Success:" + ((result != null) ? result.getParseSuccessful() : false)
                  + " LinkCount:" + ((result != null) ? result.getExtractedLinks().size() : 0));
            }
            LOG.info("Thread:" + Thread.currentThread().getId() + " Exiting");
            threadWaitSem.release();
          }
        });
        threads[threadIdx].start();
      }

      LOG.info("Waiting for threads to die");
      threadWaitSem.acquireUninterruptibly();
      LOG.info("All Threads dead.");
    } finally {
      eventLoop.stop();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  } catch (InterruptedException e) {
  }
}
From source file:org.commoncrawl.service.parser.server.ParseWorker.java
License:Open Source License
public static void main(String[] args) throws IOException {
  String baseURL = "http://unknown.com/";
  if (args.length != 0) {
    baseURL = args[0];
  }

  URL baseURLObj;
  try {
    baseURLObj = new URL(baseURL);
  } catch (MalformedURLException e2) {
    throw new IOException("Invalid Base Link");
  }

  final DataOutputBuffer headerBuffer = new DataOutputBuffer();
  final DataOutputBuffer contentBuffer = new DataOutputBuffer();

  try {
    // split stdin into HTTP headers and content; headers end at the first empty line
    ByteStreams.readBytes(new InputSupplier<InputStream>() {
      @Override
      public InputStream getInput() throws IOException {
        return System.in;
      }
    }, new ByteProcessor<Long>() {

      int currLineCharCount = 0;
      boolean processingHeaders = true;

      @Override
      public Long getResult() {
        return 0L;
      }

      @Override
      public boolean processBytes(byte[] buf, int start, int length) throws IOException {
        if (processingHeaders) {
          int current = start;
          int end = current + length;
          while (processingHeaders && current != end) {
            if (buf[current] != '\r' && buf[current] != '\n') {
              currLineCharCount++;
            } else if (buf[current] == '\n') {
              // an empty line terminates the header block
              if (currLineCharCount == 0) {
                headerBuffer.write(buf, start, current - start + 1);
                processingHeaders = false;
              }
              currLineCharCount = 0;
            }
            current++;
          }
          if (processingHeaders) {
            headerBuffer.write(buf, start, length);
          } else {
            length -= current - start;
            start = current;
          }
        }
        if (!processingHeaders) {
          contentBuffer.write(buf, start, length);
        }
        return true;
      }
    });

    // decode header bytes, falling back to ASCII if UTF-8 decoding fails ...
    String header = "";
    if (headerBuffer.getLength() != 0) {
      try {
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("UTF-8"));
      } catch (Exception e) {
        LOG.warn(CCStringUtils.stringifyException(e));
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("ASCII"));
      }
    }

    // parse the document
    ParseWorker worker = new ParseWorker();
    ParseResult result = new ParseResult();
    worker.parseDocument(result, 0L, 0L, baseURLObj, header,
        new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
  } catch (IOException e1) {
    e1.printStackTrace();
  }
}
From source file:org.commoncrawl.service.queryserver.index.PositionBasedSequenceFileIndex.java
License:Open Source License
public void seekReaderToItemAtIndex(SequenceFile.Reader reader, long desiredIndexPos) throws IOException {
  IndexItem indexItem = findIndexDataPosForItemIndex(desiredIndexPos);
  if (indexItem == null) {
    throw new IOException("Invalid Index Position:" + desiredIndexPos);
  }

  // seek to the closest indexed position in the file
  reader.seek(indexItem._offsetValue);

  // a DataOutputBuffer that discards incoming key bytes instead of copying them
  DataOutputBuffer skipBuffer = new DataOutputBuffer() {
    @Override
    public void write(DataInput in, int length) throws IOException {
      in.skipBytes(length);
    }
  };

  // skip forward record by record until we reach the desired index position
  ValueBytes skipValue = reader.createValueBytes();
  long currentIndexPos = indexItem._indexValue;
  while (currentIndexPos < desiredIndexPos) {
    reader.nextRawKey(skipBuffer);
    reader.nextRawValue(skipValue);
    ++currentIndexPos;
  }
}
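The anonymous subclass above is the detail that makes the skip loop cheap: SequenceFile.Reader.nextRawKey() delivers key bytes through DataOutputBuffer.write(DataInput, int), so overriding that one method to call skipBytes() lets the reader advance record by record without ever copying key data into memory.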
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
    long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

  File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
  tempFile.mkdir();

  try {
    // create the final output spill writer ...
    SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
        outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
        true);
    try {
      MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
          conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
          new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);
      try {
        for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
          // 0. shard domain id to find index file location ...
          int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS);
          // build path to index file
          Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase3Data/part-"
              + NUMBER_FORMAT.format(indexShardId));
          LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
              + indexFilePath);

          // 1. scan domainFP to index file first
          // 2. given index, scan index->pos file to find scan start position
          // 3. given scan start position, scan forward until fp match is found.
          // 4. collect all matching entries and output to a file ?
          FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
          try {
            TFile.Reader reader = new TFile.Reader(indexDataInputStream, fs.getLength(indexFilePath), conf);
            try {
              TFile.Reader.Scanner scanner = reader.createScanner();
              try {
                // generate key ...
                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                keyBuffer.writeLong(targetRootDomainFP);
                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                  // setup for value scan
                  DataInputStream valueStream = scanner.entry().getValueStream();
                  int dataOffsetOut = -1;
                  while (valueStream.available() > 0) {
                    // read entries looking for our specific entry
                    int shardIdx = valueStream.readInt();
                    int dataOffset = valueStream.readInt();
                    if (shardIdx == targetShardId) {
                      dataOffsetOut = dataOffset;
                      break;
                    }
                  }
                  LOG.info("Index Search Yielded:" + dataOffsetOut);

                  if (dataOffsetOut != -1) {
                    // ok create a data path
                    Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId));
                    Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId) + ".index");

                    // check to see if index is already loaded ...
                    PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                    synchronized (_shardToIndexMap) {
                      index = _shardToIndexMap.get(targetShardId);
                    }
                    if (index == null) {
                      LOG.info("Loading Index from Path:" + finalDataIndexPath);
                      // load index
                      index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs, finalDataIndexPath,
                          FlexBuffer.class, TextBytes.class);
                      // put in cache
                      synchronized (_shardToIndexMap) {
                        _shardToIndexMap.put(targetShardId, index);
                      }
                    }

                    LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                    // ok time to create a reader
                    SequenceFile.Reader dataReader = new SequenceFile.Reader(fs, finalDataPath, conf);
                    try {
                      LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                      index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);

                      FlexBuffer keyBytes = new FlexBuffer();
                      URLFPV2 sourceFP = new URLFPV2();
                      DataInputBuffer keyReader = new DataInputBuffer();
                      TextBytes urlTxt = new TextBytes();

                      // ok, ready to go ...
                      while (dataReader.next(keyBytes, sourceFP)) {
                        // initialize reader
                        keyReader.reset(keyBytes.get(), keyBytes.getOffset(), keyBytes.getCount());
                        long targetFP = keyReader.readLong();
                        if (targetRootDomainFP == targetFP) {
                          finalMerger.spillRecord(keyBytes, sourceFP);
                        } else {
                          LOG.info("FP:" + targetFP + " > TargetFP:" + targetRootDomainFP
                              + " Exiting Iteration Loop");
                          break;
                        }
                      }
                    } finally {
                      LOG.info("Closing Reader");
                      dataReader.close();
                    }
                  }
                }
              } finally {
                LOG.info("Closing Scanner");
                scanner.close();
              }
            } finally {
              LOG.info("Closing TFile Reader");
              reader.close();
            }
          } finally {
            LOG.info("Closing InputStream");
            indexDataInputStream.close();
          }
        }
      } finally {
        finalMerger.close();
      }
    } finally {
      spillwriter.close();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
    FileUtils.recursivelyDeleteFile(tempFile);
  }
}
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private long runInlinksLocalQuery(DatabaseIndexV2.MasterDatabaseIndex index, FileSystem inputFileSystem,
    Path inlinksInputPath, FileSystem outputFileSystem, Path inlinksDomainIndexPath,
    Path inlinksDetailOutputPath) throws IOException {

  long recordCount = 0L;

  outputFileSystem.delete(inlinksDomainIndexPath);
  outputFileSystem.delete(inlinksDetailOutputPath);

  FSDataInputStream remoteInputStream = inputFileSystem.open(inlinksInputPath);
  try {
    FSDataOutputStream indexOutputStream = outputFileSystem.create(inlinksDomainIndexPath);
    FSDataOutputStream detailOutputStream = outputFileSystem.create(inlinksDetailOutputPath);

    ArrayList<InlinkingDomainInfo> domainList = new ArrayList<InlinkingDomainInfo>();

    try {
      LOG.info("Writing Detail Stream to:" + inlinksDetailOutputPath);
      CompressedURLFPListV2.Reader reader = new CompressedURLFPListV2.Reader(remoteInputStream);

      InlinkingDomainInfo lastDomain = null;

      while (reader.hasNext()) {
        // read the next fingerprint
        URLFPV2 fingerprint = reader.next();
        // and first see if we have a domain transition
        if (lastDomain == null || lastDomain.getDomainId() != fingerprint.getDomainHash()) {
          // remember the domain
          lastDomain = new InlinkingDomainInfo();
          lastDomain.setDomainId(fingerprint.getDomainHash());
          // add it to the list
          domainList.add(lastDomain);
          // update data position
          lastDomain.setUrlDataPos(detailOutputStream.getPos());
        }
        // increment url count for the domain
        lastDomain.setUrlCount(lastDomain.getUrlCount() + 1);

        detailOutputStream.writeLong(fingerprint.getDomainHash());
        detailOutputStream.writeLong(fingerprint.getUrlHash());

        recordCount++;
      }

      LOG.info("Retrieving Domain Metadata for :" + domainList.size() + " Domain Records");
      // ok, now resolve domain names
      for (InlinkingDomainInfo domain : domainList) {
        SubDomainMetadata metadata = index.queryDomainMetadataGivenDomainId(domain.getDomainId());
        if (metadata == null) {
          LOG.error("*** Failed to Resolve DomainId:" + domain.getDomainId());
        } else {
          if (metadata.getDomainText().length() == 0) {
            LOG.error("*** Metadata for Domain Id:" + domain.getDomainId() + " contained NULL Name Value.");
            domain.setDomainName("_ERROR:BAD RECORD");
          } else {
            domain.setDomainName(metadata.getDomainText());
          }
        }
      }

      LOG.info("Sorting Domain List of Size:" + domainList.size());
      // ok sort by domain name
      Collections.sort(domainList);

      LOG.info("Building In Memory Index");
      // ok write out domain info
      DataOutputBuffer indexHeaderBuffer = new DataOutputBuffer();
      DataOutputBuffer indexDataBuffer = new DataOutputBuffer();

      LOG.info("***Writing Domain List Size:" + domainList.size());
      indexHeaderBuffer.writeInt(domainList.size());
      // ok iterate and write to both buffers
      for (InlinkingDomainInfo domain : domainList) {
        indexHeaderBuffer.writeInt(indexDataBuffer.getLength());
        domain.write(indexDataBuffer);
      }

      LOG.info("Writing Index to:" + inlinksDomainIndexPath + " IndexHeaderLength:"
          + indexHeaderBuffer.getLength() + " IndexDataLength:" + indexDataBuffer.getLength());
      // ok now flush both buffers to disk
      indexOutputStream.write(indexHeaderBuffer.getData(), 0, indexHeaderBuffer.getLength());
      indexOutputStream.write(indexDataBuffer.getData(), 0, indexDataBuffer.getLength());
    } finally {
      indexOutputStream.flush();
      indexOutputStream.close();
      detailOutputStream.flush();
      detailOutputStream.close();
    }
  } finally {
    remoteInputStream.close();
  }
  return recordCount;
}
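For context, the index file written above is a count, a table of per-record offsets, and then the serialized records. A hypothetical reader for that layout might look like the following sketch; the variable names are invented here, and it assumes the whole index file has been read into indexBytes and that InlinkingDomainInfo implements Writable with a readFields() matching its write():

// hypothetical: read back the header/data layout produced above
DataInputBuffer header = new DataInputBuffer();
header.reset(indexBytes, 0, indexBytes.length);
int count = header.readInt();
int[] offsets = new int[count];
for (int i = 0; i < count; ++i) {
  offsets[i] = header.readInt(); // offset of record i within the data section
}
int dataStart = 4 + 4 * count;   // data section begins right after the header

// random access to record k:
int k = 0;
DataInputBuffer record = new DataInputBuffer();
record.reset(indexBytes, dataStart + offsets[k], indexBytes.length - (dataStart + offsets[k]));
InlinkingDomainInfo info = new InlinkingDomainInfo();
info.readFields(record);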
From source file:org.commoncrawl.util.CrawlLogSplitter.java
License:Open Source License
public static void main(String[] args) throws IOException {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
  for (FileStatus candidate : arcFiles) {
    if (candidate.getLen() > SPLIT_SIZE) {
      candidateList.add(candidate.getPath());
    }
  }

  LOG.info("Found:" + candidateList.size() + " oversized candidates");

  Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

  while (candidateList.size() != 0) {
    Path candidateName = candidateList.first();
    candidateList.remove(candidateName);

    LOG.info("Processing Candidate:" + candidateName);
    long fileSize = fs.getFileStatus(candidateName).getLen();
    // get crawl log filename components
    ArrayList<Path> splitItems = new ArrayList<Path>();
    int index = 0;

    Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);
    LOG.info("Initial Output Path is:" + outputPart);
    fs.delete(outputPart, false);

    // create reader
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
    ValueBytes sourceVB = reader.createValueBytes();
    DataOutputBuffer sourceKeyData = new DataOutputBuffer();
    try {
      // ok create temp file
      SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
          CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
      // add to split items array
      splitItems.add(outputPart);
      try {
        long recordsWritten = 0;
        while (reader.nextRawKey(sourceKeyData) != -1) {
          reader.nextRawValue(sourceVB);
          long lengthPreWrite = activeWriter.getLength();
          activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
          if (++recordsWritten % 10000 == 0) {
            LOG.info("Wrote 10000 records");
          }
          long lengthPostWrite = activeWriter.getLength();
          if (lengthPostWrite != lengthPreWrite) {
            if (lengthPostWrite >= IDEAL_SIZE) {
              LOG.info("Hit Split Point. Flushing File:" + outputPart);
              activeWriter.close();
              outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(),
                  ++index);
              LOG.info("Creating New File:" + outputPart);
              activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class, CrawlURL.class,
                  CompressionType.BLOCK, new SnappyCodec());
              splitItems.add(outputPart);
            }
          }
          sourceKeyData.reset();
        }
      } finally {
        activeWriter.close();
      }
    } finally {
      reader.close();
    }

    LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");

    for (Path splitItem : splitItems) {
      Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
      LOG.info("Moving:" + splitItem + " to:" + destPath);
      fs.rename(splitItem, destPath);
    }

    Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
    LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
    fs.rename(candidateName, sourceMoveLocation);
  }
}
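Worth noting in this splitter: nextRawKey()/nextRawValue() plus Writer.appendRaw() copy records between SequenceFiles without ever deserializing the Text keys or CrawlURL values, and the sourceKeyData.reset() at the bottom of the loop is essential, since nextRawKey() appends into the DataOutputBuffer rather than overwriting it.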
From source file:org.commoncrawl.util.JoinByTextSortByTagMapper.java
License:Open Source License
public static void main(String[] args) throws IOException {
  DataOutputBuffer outputBuffer = new DataOutputBuffer();

  TextBytes textKey = new TextBytes("test");
  TextBytes tagValue = new TextBytes("tag");
  TextBytes textOut = new TextBytes();

  makeCompositeKey(outputBuffer, textKey, tagValue, textOut);
  System.out.println("CompositeKey:" + textOut.toString());

  TextBytes textKeyOut = new TextBytes();
  TextBytes tagValueOut = new TextBytes();

  getKeyFromCompositeKey(textOut, textKeyOut);
  getTagFromCompositeKey(textOut, tagValueOut);

  Assert.assertTrue(textKey.compareTo(textKeyOut) == 0);
  Assert.assertTrue(tagValue.compareTo(tagValueOut) == 0);
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
static void scanToItemThenDisplayNext(FileSystem fs, Path path, Configuration conf, URLFPV2 targetItem)
    throws IOException {
  DataOutputBuffer rawKey = new DataOutputBuffer();
  DataInputBuffer keyDataStream = new DataInputBuffer();

  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  ValueBytes valueBytes = reader.createValueBytes();

  while (reader.nextRawKey(rawKey) != -1) {
    URLFPV2 keyObject = new URLFPV2();
    keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
    keyObject.readFields(keyDataStream);
    rawKey.reset();
    reader.nextRawValue(valueBytes);

    if (keyObject.compareTo(targetItem) == 0) {
      reader.nextRawKey(rawKey);
      URLFPV2 nextKeyObject = new URLFPV2();
      keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
      nextKeyObject.readFields(keyDataStream);
      LOG.info("Target Domain:" + targetItem.getDomainHash() + " FP:" + targetItem.getUrlHash()
          + " NextDomain:" + nextKeyObject.getDomainHash() + " NextHash:" + nextKeyObject.getUrlHash());
      break;
    }
  }
  reader.close();
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
static void addFirstNFPItemsToSet(FileSystem fs, Path path, Configuration conf, Set<URLFPV2> outputSet,
    int nItems) throws IOException {
  DataOutputBuffer rawKey = new DataOutputBuffer();
  DataInputBuffer keyDataStream = new DataInputBuffer();

  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  ValueBytes valueBytes = reader.createValueBytes();

  int i = 0;
  while (reader.nextRawKey(rawKey) != -1) {
    URLFPV2 keyObject = new URLFPV2();
    keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
    keyObject.readFields(keyDataStream);
    outputSet.add(keyObject);
    rawKey.reset();
    reader.nextRawValue(valueBytes);

    if (++i == nItems) {
      break;
    }
  }
  reader.close();
}
From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java
License:Apache License
/**
 * test basic reader functionality by creating a mock ARCFile in memory and then
 * reading it back and validating the contents...
 */
@Test
public void testReader() {
  DataOutputBuffer os = new DataOutputBuffer();
  long timestamp = System.currentTimeMillis();
  try {
    // write the ARC File into memory
    writeFirstRecord(os, "test", timestamp);
    List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);

    long testAttemptTime = System.currentTimeMillis();

    for (TestRecord record : records) {
      NIOHttpHeaders headers = new NIOHttpHeaders();
      for (int i = 0; i < record.headers.size(); ++i) {
        headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
      }
      write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
          MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
    }
    os.flush();
    os.close();

    final AtomicBoolean streamClosed = new AtomicBoolean();

    // setup ArcFileReader to read the file
    InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

      public synchronized int read(byte b[], int off, int len) {
        len = 1;
        return super.read(b, off, len);
      }

      public void close() throws IOException {
        super.close();
        streamClosed.set(true);
      }
    };

    ARCFileReader reader = new ARCFileReader(in);

    int index = 0;
    Text key = new Text();
    BytesWritable value = new BytesWritable();

    // iterate and validate stuff ...
    while (reader.hasMoreItems()) {
      reader.nextKeyValue(key, value);
      TestRecord testRecord = records.get(index++);

      // get test key bytes as utf-8 bytes ...
      byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
      // compare against raw key bytes to validate the key is the same (Text's utf-8
      // mapping code replaces invalid characters with '?', which would break this
      // test case, since it deliberately uses invalid characters to form the key)
      Assert.assertTrue(
          compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);

      // returned bytes represent the header (encoded in utf-8), terminated by a
      // \r\n\r\n; the content follows this terminator. we search for this specific
      // byte pattern to locate the start of content, then compare it against source ...
      int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
          "\r\n\r\n".getBytes());
      if (indexofHeaderTerminator == -1) {
        throw new IOException("No Header Terminator found in Value!");
      }
      indexofHeaderTerminator += 4;

      // read headers ...
      String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator, Charset.forName("UTF-8"));
      NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
      for (int i = 0; i < testRecord.headers.size(); ++i) {
        Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
        Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
        Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
      }

      Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
          indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
    Assert.assertTrue(streamClosed.get());
  } catch (IOException e) {
    e.printStackTrace();
    throw new RuntimeException(e);
  }
}
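Two deliberate quirks in this test are easy to miss: the overridden read() caps every read at a single byte, forcing ARCFileReader to cope with maximally fragmented input, and the close() override paired with the streamClosed flag asserts that the reader closes its underlying stream when it finishes.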