Usage examples for org.apache.hadoop.io.DataInputBuffer.reset
public void reset(byte[] input, int start, int length)
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private OnDiskCrawlHistoryItem loadOnDiskItemForURLFP(URLFP fingerprint) throws IOException { // see if state is cached in memory ... boolean loadedFromMemory = false; synchronized (this) { if (_tempFixedDataBuffer != null) { loadedFromMemory = true;/*from w w w.ja va 2 s . c o m*/ int low = 0; int high = (int) (_tempFixedDataBufferSize / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1; OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem(); DataInputBuffer inputBuffer = new DataInputBuffer(); int iterationNumber = 0; while (low <= high) { ++iterationNumber; int mid = low + ((high - low) / 2); inputBuffer.reset(_tempFixedDataBuffer, 0, _tempFixedDataBufferSize); inputBuffer.skip(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE); // deserialize itemOut.deserialize(inputBuffer); // now compare it against desired hash value ... int comparisonResult = itemOut.compareFingerprints(fingerprint); if (comparisonResult > 0) high = mid - 1; else if (comparisonResult < 0) low = mid + 1; else { // cache offset itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE; // LOG.info("Found Match. Took:"+ iterationNumber + " iterations"); // and return item return itemOut; } } //LOG.error("Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations"); } } if (!loadedFromMemory) { //load from disk //LOG.info("Opening Data File for OnDiskItem load for Fingerprint:" + fingerprint.getUrlHash()); RandomAccessFile file = new RandomAccessFile(_fixedDataFile, "rw"); // allocate buffer upfront byte[] onDiskItemBuffer = new byte[OnDiskCrawlHistoryItem.ON_DISK_SIZE]; DataInputBuffer inputStream = new DataInputBuffer(); //LOG.info("Opened Data File. 
Searching for match"); try { int low = 0; int high = (int) (file.length() / OnDiskCrawlHistoryItem.ON_DISK_SIZE) - 1; OnDiskCrawlHistoryItem itemOut = new OnDiskCrawlHistoryItem(); int iterationNumber = 0; while (low <= high) { ++iterationNumber; int mid = low + ((high - low) / 2); // seek to proper location file.seek(mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE); // read the data structure file.readFully(onDiskItemBuffer, 0, onDiskItemBuffer.length); // map location in file //MappedByteBuffer memoryBuffer = file.getChannel().map(MapMode.READ_ONLY,mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE,OnDiskCrawlHistoryItem.ON_DISK_SIZE); //DataInputStream inputStream = new DataInputStream(new ByteBufferInputStream(memoryBuffer)); inputStream.reset(onDiskItemBuffer, 0, OnDiskCrawlHistoryItem.ON_DISK_SIZE); // deserialize itemOut.deserialize(inputStream); // memoryBuffer = null; //inputStream = null; // now compare it against desired hash value ... int comparisonResult = itemOut.compareFingerprints(fingerprint); if (comparisonResult > 0) high = mid - 1; else if (comparisonResult < 0) low = mid + 1; else { // cache offset itemOut._fileOffset = mid * OnDiskCrawlHistoryItem.ON_DISK_SIZE; // LOG.info("Found Match. Took:"+ iterationNumber + " iterations"); // and return item return itemOut; } } //LOG.error("******Did Not Find Match For Domain:" + fingerprint.getDomainHash() + " URLFP:" + fingerprint.getUrlHash() + " Took:" + iterationNumber + " iterations"); //DEBUG ONLY ! // dumpFixedDataFile(); } finally { file.close(); } } return null; }
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId, long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException { File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis()); tempFile.mkdir();/*from w w w. j ava 2 s . co m*/ try { // create the final output spill writer ... SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>( outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class, new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem, PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)), true); try { MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>( conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null, new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null); try { for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) { // 0. shard domain id to find index file location ... int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS); // build path to index file Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId)); LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:" + indexFilePath); // 1. scan domainFP to index file first // 2. given index, scan index->pos file to find scan start position // 3. given scan start position, scan forward until fp match is found. // 4. collect all matching entries and output to a file ? 
FSDataInputStream indexDataInputStream = fs.open(indexFilePath); try { TFile.Reader reader = new TFile.Reader(indexDataInputStream, fs.getLength(indexFilePath), conf); try { TFile.Reader.Scanner scanner = reader.createScanner(); try { // generate key ... DataOutputBuffer keyBuffer = new DataOutputBuffer(); keyBuffer.writeLong(targetRootDomainFP); if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) { // setup for value scan DataInputStream valueStream = scanner.entry().getValueStream(); int dataOffsetOut = -1; while (valueStream.available() > 0) { // read entries looking for our specific entry int shardIdx = valueStream.readInt(); int dataOffset = valueStream.readInt(); if (shardIdx == targetShardId) { dataOffsetOut = dataOffset; break; } } LOG.info("Index Search Yielded:" + dataOffsetOut); if (dataOffsetOut != -1) { // ok create a data path Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId)); Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId) + ".index"); // check to see if index is already loaded ... 
PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null; synchronized (_shardToIndexMap) { index = _shardToIndexMap.get(targetShardId); } if (index == null) { LOG.info("Loading Index from Path:" + finalDataIndexPath); // load index index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>( fs, finalDataIndexPath, FlexBuffer.class, TextBytes.class); // put in cache synchronized (_shardToIndexMap) { _shardToIndexMap.put(targetShardId, index); } } LOG.info("Initializing Data Reader at Path:" + finalDataPath); // ok time to create a reader SequenceFile.Reader dataReader = new SequenceFile.Reader(fs, finalDataPath, conf); try { LOG.info("Seeking Reader to Index Position:" + dataOffsetOut); index.seekReaderToItemAtIndex(dataReader, dataOffsetOut); FlexBuffer keyBytes = new FlexBuffer(); URLFPV2 sourceFP = new URLFPV2(); DataInputBuffer keyReader = new DataInputBuffer(); TextBytes urlTxt = new TextBytes(); // ok read to go ... while (dataReader.next(keyBytes, sourceFP)) { // initialize reader keyReader.reset(keyBytes.get(), keyBytes.getOffset(), keyBytes.getCount()); long targetFP = keyReader.readLong(); if (targetRootDomainFP == targetFP) { finalMerger.spillRecord(keyBytes, sourceFP); } else { LOG.info("FP:" + targetFP + " > TargetFP:" + targetRootDomainFP + " Exiting Iteration Loop"); break; } } } finally { LOG.info("Closing Reader"); dataReader.close(); } } } } finally { LOG.info("Closing Scanner"); scanner.close(); } } finally { LOG.info("Closing TFile Reader"); reader.close(); } } finally { LOG.info("Closing InputStream"); indexDataInputStream.close(); } } } finally { finalMerger.close(); } } finally { spillwriter.close(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); FileUtils.recursivelyDeleteFile(tempFile); } }
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
public static void main(String[] args) { // initialize ... Configuration conf = new Configuration(); conf.addResource("nutch-default.xml"); conf.addResource("nutch-site.xml"); conf.addResource("core-site.xml"); conf.addResource("hdfs-site.xml"); conf.addResource("mapred-site.xml"); LOG.info("URL:" + args[0] + " ShardId:" + args[1]); try {// w w w .j av a2 s . c om File tempFile = File.createTempFile("inverseLinksReportTest", "seq"); try { FileSystem fs = FileSystem.get(conf); FileSystem localFileSystem = FileSystem.getLocal(conf); URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]); if (fp != null) { collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(), localFileSystem, new Path(tempFile.getAbsolutePath())); SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem, new Path(tempFile.getAbsolutePath()), conf); try { FlexBuffer key = new FlexBuffer(); URLFPV2 src = new URLFPV2(); TextBytes url = new TextBytes(); DataInputBuffer inputBuffer = new DataInputBuffer(); while (reader.next(key, src)) { inputBuffer.reset(key.get(), key.getOffset(), key.getCount()); long targetFP = inputBuffer.readLong(); float pageRank = inputBuffer.readFloat(); // ok initialize text bytes ... int textLen = WritableUtils.readVInt(inputBuffer); url.set(key.get(), inputBuffer.getPosition(), textLen); LOG.info("PR:" + pageRank + " URL:" + url.toString()); } } finally { reader.close(); } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); // tempFile.delete(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } }
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private static void readPaginatedInlinkingDomainInfo(final DatabaseIndexV2.MasterDatabaseIndex masterIndex, FileSystem indexFileSystem, Path indexPath, Path detailPath, int sortOrder, int pageNumber, int pageSize, QueryResult<Writable, Writable> resultOut) throws IOException { // if descending sort order ... // take pageNumber * pageSize as starting point long offset = 0; long startPos = 0; long endPos = 0; FSDataInputStream indexStream = indexFileSystem.open(indexPath); try {// w w w .j av a 2 s .com // read in the total record count ... int totalRecordCount = indexStream.readInt(); LOG.info("***RecordCount:" + totalRecordCount + " Allocating Buffer Of:" + (totalRecordCount * 4) + " bytes. FileLength:" + indexFileSystem.getFileStatus(indexPath).getLen()); // read in index header data upfront byte indexHeaderData[] = new byte[totalRecordCount * 4]; // read it indexStream.readFully(indexHeaderData); // mark string start pos long detailStartPos = indexStream.getPos(); // initialize index header reader stream DataInputBuffer indexHeaderStream = new DataInputBuffer(); indexHeaderStream.reset(indexHeaderData, 0, indexHeaderData.length); resultOut.getResults().clear(); resultOut.setPageNumber(pageNumber); resultOut.setTotalRecordCount(totalRecordCount); if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) { startPos = pageNumber * pageSize; endPos = Math.min(startPos + pageSize, totalRecordCount); offset = pageNumber * pageSize; } else { startPos = totalRecordCount - ((pageNumber + 1) * pageSize); endPos = startPos + pageSize; startPos = Math.max(0, startPos); offset = totalRecordCount - ((pageNumber + 1) * pageSize); } //LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber + " pageSize:" + pageSize + " offset is:" + offset); if (startPos < totalRecordCount) { //LOG.info("Seeking to Offset:" + startPos); indexHeaderStream.skip(startPos * 4); //LOG.info("Reading from:"+ startPos + " to:" + endPos + " (exclusive)"); 
for (long i = startPos; i < endPos; ++i) { // read data offset ... int domainDataPos = indexHeaderStream.readInt(); // seek to it indexStream.seek(detailStartPos + domainDataPos); // read the detail data InlinkingDomainInfo domainInfo = new InlinkingDomainInfo(); domainInfo.readFields(indexStream); // ok extract name String domainName = domainInfo.getDomainName(); if (domainName.length() == 0) { //TODO: NEED TO TRACK THIS DOWN domainName = "<<OOPS-NULL>>"; } Text key = new Text(domainName); domainInfo.setFieldClean(InlinkingDomainInfo.Field_DOMAINNAME); if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) { resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, domainInfo)); } else { resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, domainInfo)); } } } } finally { indexStream.close(); } }
From source file:org.commoncrawl.util.CharsetUtils.java
License:Open Source License
/** last resort - detect encoding using charset detector **/ public static String detectCharacterEncoding(byte[] contentBytes, int offset, int length, EncodingDetector detectorType) { if (contentBytes != null && length != 0) { if (detectorType == EncodingDetector.MOZILLA) { DetectorState state = new DetectorState(); nsDetector detector = new nsDetector(nsPSMDetector.ALL); if (offset != 0) { int tempBufferLen = Math.min(length, MAX_CHARS_TO_DETECT); byte[] tempBuffer = new byte[tempBufferLen]; System.arraycopy(contentBytes, offset, tempBuffer, 0, tempBufferLen); contentBytes = tempBuffer; offset = 0;//w w w. j a v a 2 s . c om length = tempBufferLen; } detector.Init(state); boolean isAscii = detector.isAscii(contentBytes, length); if (!isAscii) { isAscii = detector.DoIt(contentBytes, Math.min(length, MAX_CHARS_TO_DETECT), false); } detector.DataEnd(); if (isAscii) { return null; } else if (state._detectedCharset != null) { return state._detectedCharset; } else { String prob[] = detector.getProbableCharsets(); if (prob != null && prob.length != 0) { return prob[0]; } } } else { // instantiate icu charset detector ... CharsetDetector detector = new CharsetDetector(); DataInputBuffer buffer = new DataInputBuffer(); buffer.reset(contentBytes, offset, length); try { detector.setText(buffer); CharsetMatch matches[] = detector.detectAll(); if (matches != null && matches.length != 0) { int kThresold = 10; CharsetMatch bestMatch = null; for (int i = 0; i < matches.length; ++i) { if (bestMatch == null || matches[i].getConfidence() > bestMatch.getConfidence()) { bestMatch = matches[i]; } } if (bestMatch != null) { return bestMatch.getName(); } else { return matches[0].getName(); } } } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { } } } return null; }
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
static void scanToItemThenDisplayNext(FileSystem fs, Path path, Configuration conf, URLFPV2 targetItem) throws IOException { DataOutputBuffer rawKey = new DataOutputBuffer(); DataInputBuffer keyDataStream = new DataInputBuffer(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); ValueBytes valueBytes = reader.createValueBytes(); int i = 0;//ww w . ja va 2s. c om while (reader.nextRawKey(rawKey) != -1) { URLFPV2 keyObject = new URLFPV2(); keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength()); keyObject.readFields(keyDataStream); rawKey.reset(); reader.nextRawValue(valueBytes); if (keyObject.compareTo(targetItem) == 0) { reader.nextRawKey(rawKey); URLFPV2 nextKeyObject = new URLFPV2(); keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength()); nextKeyObject.readFields(keyDataStream); LOG.info("Target Domain:" + targetItem.getDomainHash() + " FP:" + targetItem.getUrlHash() + " NextDomain:" + nextKeyObject.getDomainHash() + " NextHash:" + nextKeyObject.getUrlHash()); break; } } reader.close(); }
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
/**
 * Adds the first {@code nItems} {@code URLFPV2} keys of a sequence file to {@code outputSet}.
 *
 * <p>Reading stops once {@code nItems} keys have been added or the file ends, whichever comes
 * first. NOTE(review): because the stop test is an equality check after adding, a value of
 * {@code nItems <= 0} never triggers it and the whole file is read -- confirm whether callers
 * rely on that.
 *
 * @param fs file system holding {@code path}
 * @param path sequence file to read
 * @param conf Hadoop configuration for the reader
 * @param outputSet set that receives the decoded keys
 * @param nItems number of keys to read
 * @throws IOException if the file cannot be opened or read
 */
static void addFirstNFPItemsToSet(FileSystem fs, Path path, Configuration conf, Set<URLFPV2> outputSet,
    int nItems) throws IOException {
  DataOutputBuffer rawKey = new DataOutputBuffer();
  DataInputBuffer keyDataStream = new DataInputBuffer();

  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  // close the reader even when a read or decode fails
  try {
    ValueBytes valueBytes = reader.createValueBytes();

    int i = 0;
    while (reader.nextRawKey(rawKey) != -1) {
      URLFPV2 keyObject = new URLFPV2();
      keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
      keyObject.readFields(keyDataStream);
      outputSet.add(keyObject);
      // clear the key buffer before the next nextRawKey call fills it again
      rawKey.reset();
      // the value must be consumed to advance to the next record
      reader.nextRawValue(valueBytes);

      if (++i == nItems) {
        break;
      }
    }
  } finally {
    reader.close();
  }
}
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
public static void main(String[] args) { // run some tests on the new code String aTestString = new String("A Test Strnig"); // convert it to bytes byte bytes[] = aTestString.getBytes(); // over allocate an array byte overAllocated[] = new byte[bytes.length * 2]; // copy source System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length); // now allocate a TextBytes TextBytes textBytes = new TextBytes(); // set the overallocated buffer as the backing store textBytes.set(overAllocated, bytes.length, bytes.length); // convert it to string first String toString = textBytes.toString(); // validate equal to original Assert.assertTrue(aTestString.equals(toString)); // ok now write it to output buffer DataOutputBuffer outputBuffer = new DataOutputBuffer(); // write string try {//from w ww .j a va 2 s . c om textBytes.write(outputBuffer); // read length DataInputBuffer inputBuffer = new DataInputBuffer(); inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size()); int encodedLength = WritableUtils.readVInt(inputBuffer); // validate arrays match ... Assert.assertTrue(encodedLength == bytes.length); Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(), inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0); // ok reset input buffer again ... inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size()); // read in fields textBytes.readFields(inputBuffer); // ok see if we are not using the original backing store ... Assert.assertTrue(textBytes.getBytes() != overAllocated); // validate buffers match to original Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(), textBytes.getOffset(), textBytes.getLength()), 0); } catch (IOException e) { e.printStackTrace(); } }
From source file:org.commoncrawl.util.Tuples.java
License:Open Source License
static void validateTextTuple() { // validate tuple code IntAndTwoTextByteTuples tuple1 = new IntAndTwoTextByteTuples(); IntAndTwoTextByteTuples tuple2 = new IntAndTwoTextByteTuples(); tuple1.setIntValue(1);/*from w w w .ja v a2 s.co m*/ tuple2.setIntValue(1); tuple1.setTextValueBytes(new TextBytes("AAAAA")); tuple2.setTextValueBytes(new TextBytes("AAAAA")); tuple1.setSecondTextValueBytes(new TextBytes("AAAAA")); tuple2.setSecondTextValueBytes(new TextBytes("AAAAB")); // compare the two Assert.assertTrue(tuple1.compareTo(tuple2) == -1); tuple1.setTextValueBytes(new TextBytes("BAAAA")); Assert.assertTrue(tuple1.compareTo(tuple2) == 1); tuple2.setIntValue(2); Assert.assertTrue(tuple1.compareTo(tuple2) == -1); // ok restore ... tuple1.setTextValueBytes(new TextBytes("AAAAA")); tuple2.setTextValueBytes(new TextBytes("AAAAA")); tuple1.setSecondTextValueBytes(new TextBytes("AAAAA")); tuple2.setSecondTextValueBytes(new TextBytes("AAAAB")); DataOutputBuffer outputBuffer = new DataOutputBuffer(); try { tuple1.write(outputBuffer); tuple2.write(outputBuffer); IntAndTwoTextByteTuples tuple3 = new IntAndTwoTextByteTuples(); IntAndTwoTextByteTuples tuple4 = new IntAndTwoTextByteTuples(); DataInputBuffer inputBuffer = new DataInputBuffer(); inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.getLength()); tuple3.readFields(inputBuffer); tuple4.readFields(inputBuffer); Assert.assertTrue(tuple3.compareTo(tuple1) == 0); Assert.assertTrue(tuple4.compareTo(tuple2) == 0); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:org.commoncrawl.util.Tuples.java
License:Open Source License
static void validateBufferTuple() { // run some tests on the new code String aTestString = new String("A Test Strnig"); // convert it to bytes byte bytes[] = aTestString.getBytes(); // over allocate an array byte overAllocated[] = new byte[bytes.length * 2]; // copy source System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length); IntBufferTuple tuple1 = new IntBufferTuple(); IntBufferTuple tuple2 = new IntBufferTuple(); tuple1.setIntValue(1);/* www . j av a2s .co m*/ tuple2.setIntValue(1); tuple1.getBuffer().set(overAllocated, bytes.length, bytes.length); tuple2.getBuffer().set(overAllocated, bytes.length, bytes.length); Assert.assertTrue(tuple1.compareTo(tuple2) == 0); DataOutputBuffer outputBuffer = new DataOutputBuffer(); try { tuple1.write(outputBuffer); tuple2.write(outputBuffer); DataInputBuffer inputBuffer = new DataInputBuffer(); inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.getLength()); tuple1.readFields(inputBuffer); tuple2.readFields(inputBuffer); Assert.assertTrue(tuple1.compareTo(tuple2) == 0); DataOutputBuffer outputBuffer2 = new DataOutputBuffer(); tuple1.write(outputBuffer2); tuple2.write(outputBuffer2); Assert.assertTrue(WritableComparator.compareBytes(outputBuffer.getData(), 0, outputBuffer.getLength(), outputBuffer2.getData(), 0, outputBuffer2.getLength()) == 0); } catch (IOException e) { e.printStackTrace(); } }