List of usage examples for org.apache.hadoop.io.WritableUtils.readVInt
public static int readVInt(DataInput stream) throws IOException
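readVInt decodes an integer previously written with WritableUtils.writeVInt, which uses a variable-length encoding (small values occupy a single byte). A minimal, self-contained round-trip sketch using standard Hadoop buffer classes (the class name and values are illustrative, not taken from the sources below):

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;

public class ReadVIntRoundTrip {
  public static void main(String[] args) throws IOException {
    // write two ints with the variable-length encoding
    DataOutputBuffer out = new DataOutputBuffer();
    WritableUtils.writeVInt(out, 42);       // small value, one byte
    WritableUtils.writeVInt(out, 1000000);  // larger value, several bytes

    // read them back in the same order
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    System.out.println(WritableUtils.readVInt(in)); // 42
    System.out.println(WritableUtils.readVInt(in)); // 1000000
  }
}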
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
public static void dumpUnCrawledItems(File dataDir, long listId, File outputFilePath,
    boolean includeRobotsExcludedItems) throws IOException {

  File fixedDataFile = new File(dataDir, LIST_VALUE_MAP_PREFIX + Long.toString(listId));
  File variableDataFile = new File(dataDir, LIST_STRING_MAP_PREFIX + Long.toString(listId));

  LOG.info("FixedDataFile is:" + fixedDataFile);
  LOG.info("VariableDataFile is:" + variableDataFile);

  RandomAccessFile fixedDataReader = new RandomAccessFile(fixedDataFile, "r");
  RandomAccessFile stringDataReader = new RandomAccessFile(variableDataFile, "r");

  JsonWriter writer = new JsonWriter(
      new BufferedWriter(new FileWriter(outputFilePath), 1024 * 1024 * 10));
  writer.setIndent(" ");

  try {
    writer.beginObject();
    writer.name("urls");
    writer.beginArray();
    try {
      OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();
      URLFP fingerprint = new URLFP();

      while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {
        long position = fixedDataReader.getFilePointer();
        item.deserialize(fixedDataReader);

        // seek to string data
        stringDataReader.seek(item._stringsOffset);
        // and skip buffer length
        WritableUtils.readVInt(stringDataReader);
        // and read primary string
        String url = stringDataReader.readUTF();

        // setup fingerprint
        fingerprint.setDomainHash(item._domainHash);
        fingerprint.setUrlHash(item._urlFingerprint);

        // any item that has not been crawled needs to be queued
        boolean queueItem = !item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS);

        // if item is not queued, check to see if we need to retry the item
        if (!queueItem && item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
          if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
            queueItem = (item._redirectStatus != 0);
            if (!queueItem) {
              if (item._redirectHttpResult != 200 && item._redirectHttpResult != 404) {
                queueItem = true;
              }
            }
          } else {
            queueItem = (item._crawlStatus != 0);
            if (!queueItem) {
              if (item._httpResultCode != 200 && item._httpResultCode != 404) {
                queueItem = true;
              }
            }
          }
        }

        if (queueItem) {
          // ok if queue item is set ...
          writer.beginObject();
          writer.name("url");
          writer.value(url);
          writer.name("redirected");
          writer.value((boolean) item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS));
          writer.name("lastStatus");
          if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_REDIRECT_STATUS)) {
            if (item._redirectStatus == 0) {
              writer.value("HTTP-" + item._redirectHttpResult);
            } else {
              writer.value(CrawlURL.FailureReason.toString(item._redirectHttpResult));
            }
          } else {
            if (item.isFlagSet(OnDiskCrawlHistoryItem.FLAG_HAS_CRAWL_STATUS)) {
              if (item._crawlStatus == 0) {
                writer.value("HTTP-" + item._httpResultCode);
              } else {
                writer.value(CrawlURL.FailureReason.toString(item._crawlStatus));
              }
            } else {
              writer.value("UNCRAWLED");
            }
          }
          writer.name("updateTime");
          writer.value(item._updateTimestamp);
          writer.endObject();
        }
      }
    } catch (IOException e) {
      LOG.error("Encountered Exception Queueing Items for List:" + listId + " Exception:"
          + CCStringUtils.stringifyException(e));
    } finally {
      fixedDataReader.close();
      stringDataReader.close();
    }
    writer.endArray();
    writer.endObject();
  } catch (Exception e) {
    LOG.error(CCStringUtils.stringifyException(e));
    throw new IOException(e);
  } finally {
    writer.flush();
    writer.close();
  }
}
From source file:org.commoncrawl.service.pagerank.slave.PageRankUtils.java
License:Open Source License
public static final int readURLFPAndCountFromStream(DataInput input, URLFPV2 fpOut) throws IOException {
  fpOut.setDomainHash(input.readLong());
  fpOut.setRootDomainHash(input.readLong());
  fpOut.setUrlHash(input.readLong());
  return WritableUtils.readVInt(input);
}
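The matching write side is not part of this snippet. A plausible counterpart, assuming the record layout implied by the reads above (three fixed-width longs followed by a VInt count) and hypothetical URLFPV2 getters, would be:

// Hypothetical writer mirroring the read order above: three longs, then a VInt count.
public static void writeURLFPAndCountToStream(DataOutput output, URLFPV2 fp, int count) throws IOException {
  output.writeLong(fp.getDomainHash());     // assumed getter
  output.writeLong(fp.getRootDomainHash()); // getter also used in the InverseLinksByDomainQuery example below
  output.writeLong(fp.getUrlHash());        // assumed getter
  WritableUtils.writeVInt(output, count);   // variable-length count
}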
From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java
License:Open Source License
private static void spillLinkDataIntoTempFileIndex(FileSystem remoteFileSystem, FileSystem localFileSystem,
    Configuration conf, DatabaseIndexV2.MasterDatabaseIndex index, File tempFilePath, Path outputFilePath,
    FlexBuffer linkData) throws IOException {

  SequenceFileSpillWriter<TextBytes, TriTextBytesTuple> outputWriter = new SequenceFileSpillWriter<TextBytes, TriTextBytesTuple>(
      localFileSystem, conf, outputFilePath, TextBytes.class, TriTextBytesTuple.class,
      new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
          PositionBasedSequenceFileIndex.getIndexNameFromBaseName(outputFilePath)),
      true);

  try {
    // ok create merge sort spill writer ...
    MergeSortSpillWriter<TextBytes, TriTextBytesTuple> merger = new MergeSortSpillWriter<TextBytes, TriTextBytesTuple>(
        conf, outputWriter, localFileSystem, new Path(tempFilePath.getAbsolutePath()), null,
        new RawKeyValueComparator<TextBytes, TriTextBytesTuple>() {

          DataInputBuffer stream1 = new DataInputBuffer();
          DataInputBuffer stream2 = new DataInputBuffer();
          TriTextBytesTuple tuple1 = new TriTextBytesTuple();
          TriTextBytesTuple tuple2 = new TriTextBytesTuple();

          @Override
          public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
              int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
              byte[] value2Data, int value2Offset, int value2Length) throws IOException {

            stream1.reset(value1Data, value1Offset, value1Length);
            stream2.reset(value2Data, value2Offset, value2Length);

            // ok skip url
            int url1Length = WritableUtils.readVInt(stream1);
            stream1.skip(url1Length);
            int url2Length = WritableUtils.readVInt(stream2);
            stream2.skip(url2Length);

            // ok now read optimized page rank stuffed in second tuple
            WritableUtils.readVInt(stream1);
            WritableUtils.readVInt(stream2);

            // now read page rank
            float pageRank1 = stream1.readFloat();
            float pageRank2 = stream2.readFloat();

            return (pageRank1 == pageRank2) ? 0 : (pageRank1 < pageRank2) ? -1 : 1;
          }

          @Override
          public int compare(TextBytes key1, TriTextBytesTuple value1, TextBytes key2,
              TriTextBytesTuple value2) {
            stream1.reset(value1.getSecondValue().getBytes(), value1.getSecondValue().getLength());
            stream2.reset(value2.getSecondValue().getBytes(), value2.getSecondValue().getLength());
            try {
              float pr1 = stream1.readFloat();
              float pr2 = stream2.readFloat();
              return (pr1 == pr2) ? 0 : pr1 < pr2 ? -1 : 1;
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              throw new RuntimeException();
            }
          }
        }, TextBytes.class, TriTextBytesTuple.class, false, null);

    try {
      long timeStart = System.currentTimeMillis();
      System.out.println(".Running Merger against to resolve tuple set ");
      index.bulkQueryURLAndMetadataGivenInputStream(remoteFileSystem, conf, tempFilePath, linkData, merger);
      long timeEnd = System.currentTimeMillis();
      LOG.info(".Merged Successfully in:" + (timeEnd - timeStart));
    } finally {
      LOG.info("Closing Merger");
      merger.close();
    }
  } finally {
    LOG.info("Closing Writer");
    outputWriter.close();
  }
}
From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java
License:Open Source License
@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
    DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
    QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

  Path mergeResultsPath = new Path(
      getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

  LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

  // get a local file system object
  FileSystem localFileSystem = FileSystem.getLocal(conf);

  //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
  // if source merged results path does not exist ...
  if (!localFileSystem.exists(mergeResultsPath)) {

    LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
        + " Not Found. Checking for parts files");

    // collect parts ...
    Vector<Path> parts = new Vector<Path>();

    FileStatus fileStatusArray[] = remoteFileSystem
        .globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

    if (fileStatusArray.length == 0) {
      LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
      throw new IOException("Remote Component Part Files Not Found");
    }

    for (FileStatus part : fileStatusArray) {
      //LOG.info("Found Part:"+ part);
      parts.add(part.getPath());
    }

    LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");

    SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
        false);

    try {
      SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
          remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,

          new RawKeyValueComparator<Text, SubDomainMetadata>() {

            DataInputBuffer key1Stream = new DataInputBuffer();
            DataInputBuffer key2Stream = new DataInputBuffer();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {

              key1Stream.reset(key1Data, key1Offset, key1Length);
              key2Stream.reset(key2Data, key2Offset, key2Length);

              WritableUtils.readVInt(key1Stream);
              WritableUtils.readVInt(key2Stream);

              return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                  key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                  key2Length - key2Stream.getPosition());
            }

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return key1.compareTo(key2);
            }
          });

      try {
        LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
        merger.mergeAndSpill(null);
        LOG.info("Execute Local for Query:" + getQueryId() + " Merge Successfull.. Deleting Merge Inputs");
        for (Path inputPath : parts) {
          remoteFileSystem.delete(inputPath, false);
        }
      } catch (IOException e) {
        LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
            + CCStringUtils.stringifyException(e));
        throw e;
      } finally {
        LOG.info("** CLOSING MERGER");
        merger.close();
      }
    } finally {
      LOG.info("** FLUSHING SPILLWRITER");
      mergedFileSpillWriter.close();
    }
  }

  // now check for query specific merge file ...
  Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
      + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

  LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

  if (!localFileSystem.exists(queryResultsPath)) {

    LOG.info("Exectue Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
        + " does not exist. Running sort and merge process");

    LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
        + queryResultsPath);

    // allocate a spill writer ...
    SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
        false);

    try {
      LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");

      // and connect it to the merge spill writer ...
      MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
          conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
          /*
          new RawKeyValueComparator<Text,SubDomainMetadata>() {

            SubDomainMetadata value1 = new SubDomainMetadata();
            SubDomainMetadata value2 = new SubDomainMetadata();

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return value1.getUrlCount() - value2.getUrlCount();
            }

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {
              value1.clear();
              value2.clear();
              value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data, value1Offset, value1Length)));
              value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data, value2Offset, value2Length)));
              return compare(null, value1, null, value2);
            }
          },
          */
          new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

            @Override
            public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                throws IOException {
              optimizedKeyOut.setLongKeyValue(value.getUrlCount());
            }

            @Override
            public int getGeneratedKeyType() {
              return OptimizedKey.KEY_TYPE_LONG;
            }
          }, Text.class, SubDomainMetadata.class, false, null);

      try {
        // create a vector representing the single input segment
        Vector<Path> singleInputSegment = new Vector<Path>();

        LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:" + mergeResultsPath
            + " as input for Merger");
        singleInputSegment.add(mergeResultsPath);

        // create a SequenceFileReader
        SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
            localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class,
            SubDomainMetadata.class);

        try {
          LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
          mergeSegmentReader.readAndSpill();
          LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
        } finally {
          if (mergeSegmentReader != null) {
            mergeSegmentReader.close();
          }
        }
      } finally {
        if (mergeSortSpillWriter != null) {
          mergeSortSpillWriter.close();
        }
      }
    } finally {
      if (sortedResultsFileSpillWriter != null) {
        sortedResultsFileSpillWriter.close();
      }
    }
  }

  //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
  PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
      localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
  //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

  return indexFile.getRecordCount();
}
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
public static void main(String[] args) {
  // initialize ...
  Configuration conf = new Configuration();

  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");

  LOG.info("URL:" + args[0] + " ShardId:" + args[1]);

  try {
    File tempFile = File.createTempFile("inverseLinksReportTest", "seq");
    try {
      FileSystem fs = FileSystem.get(conf);
      FileSystem localFileSystem = FileSystem.getLocal(conf);

      URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]);
      if (fp != null) {
        collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(),
            localFileSystem, new Path(tempFile.getAbsolutePath()));

        SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,
            new Path(tempFile.getAbsolutePath()), conf);
        try {
          FlexBuffer key = new FlexBuffer();
          URLFPV2 src = new URLFPV2();
          TextBytes url = new TextBytes();

          DataInputBuffer inputBuffer = new DataInputBuffer();

          while (reader.next(key, src)) {
            inputBuffer.reset(key.get(), key.getOffset(), key.getCount());
            long targetFP = inputBuffer.readLong();
            float pageRank = inputBuffer.readFloat();
            // ok initialize text bytes ...
            int textLen = WritableUtils.readVInt(inputBuffer);
            url.set(key.get(), inputBuffer.getPosition(), textLen);
            LOG.info("PR:" + pageRank + " URL:" + url.toString());
          }
        } finally {
          reader.close();
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      // tempFile.delete();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
From source file:org.commoncrawl.util.FlexBuffer.java
License:Open Source License
@Override
public void readFields(DataInput in) throws IOException {
  int byteCount = WritableUtils.readVInt(in);
  setCount(byteCount);
  if (byteCount != 0) {
    // allocate new backing store if shared
    copyOnWrite();
    // read into the array
    in.readFully(get(), getOffset(), byteCount);
  }
}
From source file:org.commoncrawl.util.shared.FlexBuffer.java
License:Apache License
@Override
public void readFields(DataInput in) throws IOException {
  int byteCount = WritableUtils.readVInt(in);
  // first zero count ...
  setCount(0);
  // then set count
  setCount(byteCount);
  if (byteCount != 0) {
    // allocate new backing store if shared
    copyOnWrite();
    // read into the array
    in.readFully(get(), getOffset(), byteCount);
  }
}
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
/**
 * deserialize
 */
public void readFields(DataInput in) throws IOException {
  int newLength = WritableUtils.readVInt(in);
  // ensure capacity
  setCapacity(newLength, false);
  // in case we need to, ensure we have a private copy of the underlying array
  bytes.copyOnWrite();
  // read into the array
  in.readFully(bytes.get(), bytes.getOffset(), newLength);
  // reset count variable
  bytes.setCount(newLength);
  // clear cached String pointer
  cachedUTF8 = null;
}
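The FlexBuffer and TextBytes readFields implementations above all follow the same convention: a VInt length prefix followed by that many raw bytes. A minimal standalone Writable using that pattern (a sketch for illustration, not the actual CommonCrawl write() code):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;

// Illustrative Writable that length-prefixes its payload with a VInt,
// mirroring the readVInt-then-readFully pattern in the examples above.
public class VIntPrefixedBytes implements Writable {
  private byte[] data = new byte[0];

  public void set(byte[] bytes) {
    data = bytes;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, data.length); // length prefix
    out.write(data);                           // raw payload
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);   // read the prefix
    data = new byte[length];
    in.readFully(data, 0, length);             // then the payload
  }
}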
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
public static void main(String[] args) {
  // run some tests on the new code
  String aTestString = new String("A Test Strnig");
  // convert it to bytes
  byte bytes[] = aTestString.getBytes();
  // over allocate an array
  byte overAllocated[] = new byte[bytes.length * 2];
  // copy source
  System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
  // now allocate a TextBytes
  TextBytes textBytes = new TextBytes();
  // set the overallocated buffer as the backing store
  textBytes.set(overAllocated, bytes.length, bytes.length);
  // convert it to string first
  String toString = textBytes.toString();
  // validate equal to original
  Assert.assertTrue(aTestString.equals(toString));
  // ok now write it to output buffer
  DataOutputBuffer outputBuffer = new DataOutputBuffer();
  // write string
  try {
    textBytes.write(outputBuffer);
    // read length
    DataInputBuffer inputBuffer = new DataInputBuffer();
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    int encodedLength = WritableUtils.readVInt(inputBuffer);
    // validate arrays match ...
    Assert.assertTrue(encodedLength == bytes.length);
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(),
        inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0);
    // ok reset input buffer again ...
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    // read in fields
    textBytes.readFields(inputBuffer);
    // ok see if we are not using the original backing store ...
    Assert.assertTrue(textBytes.getBytes() != overAllocated);
    // validate buffers match to original
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(),
        textBytes.getOffset(), textBytes.getLength()), 0);
  } catch (IOException e) {
    e.printStackTrace();
  }
}
From source file:org.commoncrawl.util.WikipediaPage.java
License:Apache License
/**
 * Deserializes this object.
 */
public void readFields(DataInput in) throws IOException {
  int length = WritableUtils.readVInt(in);

  byte[] bytes = new byte[length];
  in.readFully(bytes, 0, length);
  WikipediaPage.readPage(this, new String(bytes));

  language = in.readUTF();
}