List of usage examples for the org.apache.hadoop.io.DataInputBuffer constructor
public DataInputBuffer()
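Before the full examples, here is a minimal sketch (not taken from any of the source files below) of the basic pattern they all share: serialize with a DataOutputBuffer, then point a DataInputBuffer at the same byte[] with reset() and read the fields back without copying.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class DataInputBufferBasics {
  public static void main(String[] args) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    out.writeLong(42L);
    new Text("hello").write(out);

    DataInputBuffer in = new DataInputBuffer();
    // reset() points the buffer at an existing byte[]; no data is copied
    in.reset(out.getData(), 0, out.getLength());

    long id = in.readLong();
    Text word = new Text();
    word.readFields(in);
    System.out.println(id + " " + word + " pos=" + in.getPosition());
  }
}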
From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java
License:Open Source License
public static void main(String[] args) {
  if (args.length != 3) {
    LOG.error("args: [candidate Timestamp] [drive count] [query string]");
  }
  // initialize ...
  final Configuration conf = new Configuration();
  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");
  BasicConfigurator.configure();
  CrawlEnvironment.setHadoopConfig(conf);
  long candidateTS = Long.parseLong(args[0]);
  int driveCount = Integer.parseInt(args[1]);
  String queryString = args[2];
  try {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    MasterDatabaseIndex masterIndex = new MasterDatabaseIndex(conf, fs, driveCount, candidateTS, null);
    SlaveDatabaseIndex slaveIndex = new SlaveDatabaseIndex(conf, fs, candidateTS);
    // ok hit the domain against the master index first ...
    LOG.info("Querying master index for DomainId Given DomainName:" + queryString);
    long domainId = masterIndex.queryDomainIdGivenDomain(queryString);
    LOG.info("Querying master index for DomainMetadata Given DomainId:" + domainId);
    SubDomainMetadata subDomainMeta = masterIndex.queryDomainMetadataGivenDomainId(domainId);
    if (subDomainMeta != null) {
      LOG.info("Metadata is present. Deserializing");
      // dump some fields ...
      LOG.info("Domain:" + subDomainMeta.getDomainText() + " URLCount:" + subDomainMeta.getUrlCount()
          + " FetchedCount:" + subDomainMeta.getFetchedCount() + " PageRankCount:"
          + subDomainMeta.getHasPageRankCount());
      // ok time to dive into a url list ...
      // query for a list of urls sorted by name
      LOG.info("Querying for URLList for Domain BY PR");
      FlexBuffer urlListBufferByPR = slaveIndex.queryURLListSortedByPR(domainId);
      if (urlListBufferByPR != null) {
        // read the list ...
        DataInputBuffer readerStream = new DataInputBuffer();
        readerStream.reset(urlListBufferByPR.get(), urlListBufferByPR.getCount());
        int totalItemCount = urlListBufferByPR.getCount() / 8;
        System.out.println("List BY PR totalCount:" + totalItemCount);
        // initialize a fingerprint object to use for queries ...
        URLFPV2 queryFP = new URLFPV2();
        queryFP.setDomainHash(domainId);
        DataInputBuffer metadataReaderStream = new DataInputBuffer();
        // iterate the first N items ranked by page rank
        for (int i = 0; i < Math.min(10, totalItemCount); ++i) {
          queryFP.setUrlHash(readerStream.readLong());
          // and for metadata
          MetadataOut urlMetadata = masterIndex.queryMetadataAndURLGivenFP(queryFP);
          if (urlMetadata != null) {
            // decode the url
            String url = urlMetadata.url.toString();
            System.out.println("URL for FP:" + queryFP.getUrlHash() + " is:" + url);
            if (urlMetadata.datumAndMetadataBytes.getLength() == 0) {
              System.out.println("URL for FP:" + queryFP.getUrlHash() + " had no METADATA!!");
            } else {
              // explode metadata
              CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
              metadataReaderStream.reset(urlMetadata.datumAndMetadataBytes.getBytes(),
                  urlMetadata.datumAndMetadataBytes.getOffset(),
                  urlMetadata.datumAndMetadataBytes.getLength());
              metadataObject.readFields(metadataReaderStream);
              // ok at this point spit out stuff for this url
              StringBuilder urlInfo = new StringBuilder();
              urlInfo.append(" FetchStatus:" + CrawlDatum.getStatusName(metadataObject.getStatus()) + "\n");
              urlInfo.append(" PageRank:" + metadataObject.getMetadata().getPageRank() + "\n");
              urlInfo.append(" ContentType:" + metadataObject.getMetadata().getContentType() + "\n");
              urlInfo.append(" ArcFileInfoCount:" + metadataObject.getMetadata().getArchiveInfo().size());
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) {
                urlInfo.append(" HasLinkDataInfo:" + metadataObject.getMetadata().getLinkDBFileNo() + ":"
                    + metadataObject.getMetadata().getLinkDBOffset());
              }
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                urlInfo.append(" HasINVLinkDataInfo:" + metadataObject.getMetadata().getInverseDBFileNo() + ":"
                    + metadataObject.getMetadata().getInverseDBOffset());
              }
              System.out.println(urlInfo.toString());
              // now if inverse link data is present ..
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                // get it ...
                System.out.println("Querying for Inlinks for FP:" + queryFP.getUrlHash());
                FlexBuffer inlinks = slaveIndex.queryInlinksByFP(queryFP,
                    metadataObject.getMetadata().getInverseDBFileNo(),
                    metadataObject.getMetadata().getInverseDBOffset());
                if (inlinks != null) {
                  System.out.println("Found Inlink Buffer of Size:" + inlinks.getCount());
                  FileSystem localFS = FileSystem.getLocal(conf);
                  File testDir = new File("/tmp/dbIndexTest");
                  File testFile = new File("/tmp/dbIndexTestFile");
                  localFS.delete(new Path(testDir.getAbsolutePath()), true);
                  localFS.delete(new Path(testFile.getAbsolutePath()), false);
                  localFS.mkdirs(new Path(testDir.getAbsolutePath()));
                  LOG.info("Creating Spill File of Inlinks");
                  spillLinkDataIntoTempFileIndex(fs, localFS, conf, masterIndex, testDir,
                      new Path(testFile.getAbsolutePath()), inlinks);
                  LOG.info("Created Spill File of Inlinks");
                  LOG.info("Reading Inlinks");
                  // ok now open it up and dump the first few inlinks from the spill file
                  SequenceFile.Reader reader = new SequenceFile.Reader(localFS,
                      new Path(testFile.getAbsolutePath()), conf);
                  TextBytes key = new TextBytes();
                  TriTextBytesTuple value = new TriTextBytesTuple();
                  CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
                  DataInputBuffer inputBuffer = new DataInputBuffer();
                  try {
                    int itemCount = 0;
                    while (reader.next(key, value)) {
                      if (value.getThirdValue().getLength() != 0) {
                        inputBuffer.reset(value.getThirdValue().getBytes(), 0, value.getThirdValue().getLength());
                        metadata.readFields(inputBuffer);
                        System.out.println("INLINK:" + key.toString() + " METADATA STATUS:"
                            + CrawlDatum.getStatusName(metadata.getStatus()));
                      } else {
                        System.out.println("INLINK:" + key.toString() + " NOMETADATA");
                      }
                      if (++itemCount == 500) {
                        break;
                      }
                    }
                  } finally {
                    reader.close();
                  }
                  LOG.info("Done Reading Inlinks");
                }
              }
            }
          } else {
            LOG.error("Query for FP:" + queryFP.getUrlHash() + " returned NULL URL");
          }
        }
      }
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java
License:Open Source License
@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
    DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
    QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {
  Path mergeResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());
  LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);
  // get a local file system object
  FileSystem localFileSystem = FileSystem.getLocal(conf);
  //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
  // if source merged results path does not exist ...
  if (!localFileSystem.exists(mergeResultsPath)) {
    LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
        + " Not Found. Checking for parts files");
    // collect parts ...
    Vector<Path> parts = new Vector<Path>();
    FileStatus fileStatusArray[] = remoteFileSystem.globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));
    if (fileStatusArray.length == 0) {
      LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
      throw new IOException("Remote Component Part Files Not Found");
    }
    for (FileStatus part : fileStatusArray) {
      //LOG.info("Found Part:"+ part);
      parts.add(part.getPath());
    }
    LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");
    SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
        false);
    try {
      SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
          remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,
          new RawKeyValueComparator<Text, SubDomainMetadata>() {

            DataInputBuffer key1Stream = new DataInputBuffer();
            DataInputBuffer key2Stream = new DataInputBuffer();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {
              key1Stream.reset(key1Data, key1Offset, key1Length);
              key2Stream.reset(key2Data, key2Offset, key2Length);
              WritableUtils.readVInt(key1Stream);
              WritableUtils.readVInt(key2Stream);
              return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                  key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                  key2Length - key2Stream.getPosition());
            }

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return key1.compareTo(key2);
            }
          });
      try {
        LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
        merger.mergeAndSpill(null);
        LOG.info("Execute Local for Query:" + getQueryId() + " Merge Successful. Deleting Merge Inputs");
        for (Path inputPath : parts) {
          remoteFileSystem.delete(inputPath, false);
        }
      } catch (IOException e) {
        LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
            + CCStringUtils.stringifyException(e));
        throw e;
      } finally {
        LOG.info("** CLOSING MERGER");
        merger.close();
      }
    } finally {
      LOG.info("** FLUSHING SPILLWRITER");
      mergedFileSpillWriter.close();
    }
  }
  // now check for query specific merge file ...
  Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
      + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));
  LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);
  if (!localFileSystem.exists(queryResultsPath)) {
    LOG.info("Execute Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
        + " does not exist. Running sort and merge process");
    LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
        + queryResultsPath);
    // allocate a spill writer ...
    SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
        false);
    try {
      LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");
      // and connect it to the merge spill writer ...
      MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
          conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
          /*
          new RawKeyValueComparator<Text, SubDomainMetadata>() {

            SubDomainMetadata value1 = new SubDomainMetadata();
            SubDomainMetadata value2 = new SubDomainMetadata();

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return value1.getUrlCount() - value2.getUrlCount();
            }

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {
              value1.clear();
              value2.clear();
              value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data, value1Offset, value1Length)));
              value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data, value2Offset, value2Length)));
              return compare(null, value1, null, value2);
            }
          },
          */
          new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

            @Override
            public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                throws IOException {
              optimizedKeyOut.setLongKeyValue(value.getUrlCount());
            }

            @Override
            public int getGeneratedKeyType() {
              return OptimizedKey.KEY_TYPE_LONG;
            }
          }, Text.class, SubDomainMetadata.class, false, null);
      try {
        // create a vector representing the single input segment
        Vector<Path> singleInputSegment = new Vector<Path>();
        LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:" + mergeResultsPath
            + " as input for Merger");
        singleInputSegment.add(mergeResultsPath);
        // create a SequenceFileReader
        SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
            localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class, SubDomainMetadata.class);
        try {
          LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
          mergeSegmentReader.readAndSpill();
          LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
        } finally {
          if (mergeSegmentReader != null) {
            mergeSegmentReader.close();
          }
        }
      } finally {
        if (mergeSortSpillWriter != null) {
          mergeSortSpillWriter.close();
        }
      }
    } finally {
      if (sortedResultsFileSpillWriter != null) {
        sortedResultsFileSpillWriter.close();
      }
    }
  }
  //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
  PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
      localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
  //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());
  return indexFile.getRecordCount();
}
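The DataInputBuffer-specific piece of the example above is the raw comparator: two reusable DataInputBuffers are reset over the serialized Text keys, WritableUtils.readVInt consumes the length prefix, and the remaining bytes are compared in place. A simplified, standalone sketch of that trick (illustrative only; the merge classes and SubDomainMetadata are CommonCrawl-specific, and offsets are assumed to be 0 here):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

public class RawTextKeyCompare {
  private static final DataInputBuffer key1Stream = new DataInputBuffer();
  private static final DataInputBuffer key2Stream = new DataInputBuffer();

  static int compareRawTextKeys(byte[] k1, int len1, byte[] k2, int len2) throws IOException {
    key1Stream.reset(k1, 0, len1);
    key2Stream.reset(k2, 0, len2);
    WritableUtils.readVInt(key1Stream); // skip the length prefix written by Text.write()
    WritableUtils.readVInt(key2Stream);
    return WritableComparator.compareBytes(k1, key1Stream.getPosition(), len1 - key1Stream.getPosition(),
        k2, key2Stream.getPosition(), len2 - key2Stream.getPosition());
  }

  public static void main(String[] args) throws IOException {
    DataOutputBuffer a = new DataOutputBuffer();
    DataOutputBuffer b = new DataOutputBuffer();
    new Text("apple.com").write(a);
    new Text("banana.com").write(b);
    System.out.println(compareRawTextKeys(a.getData(), a.getLength(), b.getData(), b.getLength()));
  }
}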
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
    long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {
  File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
  tempFile.mkdir();
  try {
    // create the final output spill writer ...
    SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
        outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
        true);
    try {
      MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
          conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
          new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);
      try {
        for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
          // 0. shard domain id to find index file location ...
          int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS);
          // build path to index file
          Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase3Data/part-"
              + NUMBER_FORMAT.format(indexShardId));
          LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
              + indexFilePath);
          // 1. scan domainFP to index file first
          // 2. given index, scan index->pos file to find scan start position
          // 3. given scan start position, scan forward until fp match is found.
          // 4. collect all matching entries and output to a file ?
          FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
          try {
            TFile.Reader reader = new TFile.Reader(indexDataInputStream, fs.getLength(indexFilePath), conf);
            try {
              TFile.Reader.Scanner scanner = reader.createScanner();
              try {
                // generate key ...
                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                keyBuffer.writeLong(targetRootDomainFP);
                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                  // setup for value scan
                  DataInputStream valueStream = scanner.entry().getValueStream();
                  int dataOffsetOut = -1;
                  while (valueStream.available() > 0) {
                    // read entries looking for our specific entry
                    int shardIdx = valueStream.readInt();
                    int dataOffset = valueStream.readInt();
                    if (shardIdx == targetShardId) {
                      dataOffsetOut = dataOffset;
                      break;
                    }
                  }
                  LOG.info("Index Search Yielded:" + dataOffsetOut);
                  if (dataOffsetOut != -1) {
                    // ok create a data path
                    Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId));
                    Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId) + ".index");
                    // check to see if index is already loaded ...
                    PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                    synchronized (_shardToIndexMap) {
                      index = _shardToIndexMap.get(targetShardId);
                    }
                    if (index == null) {
                      LOG.info("Loading Index from Path:" + finalDataIndexPath);
                      // load index
                      index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs, finalDataIndexPath,
                          FlexBuffer.class, TextBytes.class);
                      // put in cache
                      synchronized (_shardToIndexMap) {
                        _shardToIndexMap.put(targetShardId, index);
                      }
                    }
                    LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                    // ok time to create a reader
                    SequenceFile.Reader dataReader = new SequenceFile.Reader(fs, finalDataPath, conf);
                    try {
                      LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                      index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);
                      FlexBuffer keyBytes = new FlexBuffer();
                      URLFPV2 sourceFP = new URLFPV2();
                      DataInputBuffer keyReader = new DataInputBuffer();
                      TextBytes urlTxt = new TextBytes();
                      // ok ready to go ...
                      while (dataReader.next(keyBytes, sourceFP)) {
                        // initialize reader
                        keyReader.reset(keyBytes.get(), keyBytes.getOffset(), keyBytes.getCount());
                        long targetFP = keyReader.readLong();
                        if (targetRootDomainFP == targetFP) {
                          finalMerger.spillRecord(keyBytes, sourceFP);
                        } else {
                          LOG.info("FP:" + targetFP + " > TargetFP:" + targetRootDomainFP
                              + " Exiting Iteration Loop");
                          break;
                        }
                      }
                    } finally {
                      LOG.info("Closing Reader");
                      dataReader.close();
                    }
                  }
                }
              } finally {
                LOG.info("Closing Scanner");
                scanner.close();
              }
            } finally {
              LOG.info("Closing TFile Reader");
              reader.close();
            }
          } finally {
            LOG.info("Closing InputStream");
            indexDataInputStream.close();
          }
        }
      } finally {
        finalMerger.close();
      }
    } finally {
      spillwriter.close();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
    FileUtils.recursivelyDeleteFile(tempFile);
  }
}
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
public static void main(String[] args) {
  // initialize ...
  Configuration conf = new Configuration();
  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");
  LOG.info("URL:" + args[0] + " ShardId:" + args[1]);
  try {
    File tempFile = File.createTempFile("inverseLinksReportTest", "seq");
    try {
      FileSystem fs = FileSystem.get(conf);
      FileSystem localFileSystem = FileSystem.getLocal(conf);
      URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]);
      if (fp != null) {
        collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(),
            localFileSystem, new Path(tempFile.getAbsolutePath()));
        SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,
            new Path(tempFile.getAbsolutePath()), conf);
        try {
          FlexBuffer key = new FlexBuffer();
          URLFPV2 src = new URLFPV2();
          TextBytes url = new TextBytes();
          DataInputBuffer inputBuffer = new DataInputBuffer();
          while (reader.next(key, src)) {
            inputBuffer.reset(key.get(), key.getOffset(), key.getCount());
            long targetFP = inputBuffer.readLong();
            float pageRank = inputBuffer.readFloat();
            // ok initialize text bytes ...
            int textLen = WritableUtils.readVInt(inputBuffer);
            url.set(key.get(), inputBuffer.getPosition(), textLen);
            LOG.info("PR:" + pageRank + " URL:" + url.toString());
          }
        } finally {
          reader.close();
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      // tempFile.delete();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
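The loop above decodes a composite key of the form [long fingerprint][float pageRank][VInt length][UTF-8 bytes], using getPosition() to locate the string bytes after the fixed-width fields. Here is an illustrative, self-contained sketch of the same decoding idea (the key layout is hypothetical and modeled on the example; standard Text is used in place of CommonCrawl's TextBytes):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;

public class CompositeKeyDecode {
  public static void main(String[] args) throws IOException {
    // build a key: [long fingerprint][float pageRank][vint length][utf8 bytes]
    DataOutputBuffer keyOut = new DataOutputBuffer();
    keyOut.writeLong(0x1234L);
    keyOut.writeFloat(0.85f);
    byte[] urlBytes = "http://example.com/".getBytes("UTF-8");
    WritableUtils.writeVInt(keyOut, urlBytes.length);
    keyOut.write(urlBytes);

    // decode it with a DataInputBuffer over the same byte[]
    DataInputBuffer in = new DataInputBuffer();
    in.reset(keyOut.getData(), 0, keyOut.getLength());
    long fp = in.readLong();
    float pageRank = in.readFloat();
    int textLen = WritableUtils.readVInt(in);
    Text url = new Text();
    url.set(keyOut.getData(), in.getPosition(), textLen); // slice located via getPosition()
    System.out.println("FP:" + fp + " PR:" + pageRank + " URL:" + url);
  }
}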
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private static void readPaginatedInlinkingDomainInfo(final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
    FileSystem indexFileSystem, Path indexPath, Path detailPath, int sortOrder, int pageNumber, int pageSize,
    QueryResult<Writable, Writable> resultOut) throws IOException {
  // if descending sort order ...
  // take pageNumber * pageSize as starting point
  long offset = 0;
  long startPos = 0;
  long endPos = 0;
  FSDataInputStream indexStream = indexFileSystem.open(indexPath);
  try {
    // read in the total record count ...
    int totalRecordCount = indexStream.readInt();
    LOG.info("***RecordCount:" + totalRecordCount + " Allocating Buffer Of:" + (totalRecordCount * 4)
        + " bytes. FileLength:" + indexFileSystem.getFileStatus(indexPath).getLen());
    // read in index header data upfront
    byte indexHeaderData[] = new byte[totalRecordCount * 4];
    // read it
    indexStream.readFully(indexHeaderData);
    // mark string start pos
    long detailStartPos = indexStream.getPos();
    // initialize index header reader stream
    DataInputBuffer indexHeaderStream = new DataInputBuffer();
    indexHeaderStream.reset(indexHeaderData, 0, indexHeaderData.length);
    resultOut.getResults().clear();
    resultOut.setPageNumber(pageNumber);
    resultOut.setTotalRecordCount(totalRecordCount);
    if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
      startPos = pageNumber * pageSize;
      endPos = Math.min(startPos + pageSize, totalRecordCount);
      offset = pageNumber * pageSize;
    } else {
      startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
      endPos = startPos + pageSize;
      startPos = Math.max(0, startPos);
      offset = totalRecordCount - ((pageNumber + 1) * pageSize);
    }
    //LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber + " pageSize:" + pageSize + " offset is:" + offset);
    if (startPos < totalRecordCount) {
      //LOG.info("Seeking to Offset:" + startPos);
      indexHeaderStream.skip(startPos * 4);
      //LOG.info("Reading from:" + startPos + " to:" + endPos + " (exclusive)");
      for (long i = startPos; i < endPos; ++i) {
        // read data offset ...
        int domainDataPos = indexHeaderStream.readInt();
        // seek to it
        indexStream.seek(detailStartPos + domainDataPos);
        // read the detail data
        InlinkingDomainInfo domainInfo = new InlinkingDomainInfo();
        domainInfo.readFields(indexStream);
        // ok extract name
        String domainName = domainInfo.getDomainName();
        if (domainName.length() == 0) {
          //TODO: NEED TO TRACK THIS DOWN
          domainName = "<<OOPS-NULL>>";
        }
        Text key = new Text(domainName);
        domainInfo.setFieldClean(InlinkingDomainInfo.Field_DOMAINNAME);
        if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
          resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, domainInfo));
        } else {
          resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, domainInfo));
        }
      }
    }
  } finally {
    indexStream.close();
  }
}
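The pagination above hinges on the index header being a fixed-width table of 4-byte offsets held in memory: DataInputBuffer.skip() jumps to the first entry of the requested page and readInt() pulls one detail-file offset per record. A self-contained sketch of that offset-table paging (the file layout here is hypothetical, mirroring the example):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class OffsetTablePaging {
  public static void main(String[] args) throws IOException {
    // build a fake offset table with 10 int entries
    DataOutputBuffer table = new DataOutputBuffer();
    for (int i = 0; i < 10; ++i) {
      table.writeInt(i * 100); // offset of record i in some detail file
    }
    int pageNumber = 2;
    int pageSize = 3;
    DataInputBuffer header = new DataInputBuffer();
    header.reset(table.getData(), 0, table.getLength());
    long startPos = (long) pageNumber * pageSize;
    long endPos = Math.min(startPos + pageSize, 10);
    header.skip(startPos * 4); // 4 bytes per entry
    for (long i = startPos; i < endPos; ++i) {
      System.out.println("record " + i + " lives at detail offset " + header.readInt());
    }
  }
}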
From source file:org.commoncrawl.util.CharsetUtils.java
License:Open Source License
/** last resort - detect encoding using charset detector **/
public static String detectCharacterEncoding(byte[] contentBytes, int offset, int length,
    EncodingDetector detectorType) {
  if (contentBytes != null && length != 0) {
    if (detectorType == EncodingDetector.MOZILLA) {
      DetectorState state = new DetectorState();
      nsDetector detector = new nsDetector(nsPSMDetector.ALL);
      if (offset != 0) {
        int tempBufferLen = Math.min(length, MAX_CHARS_TO_DETECT);
        byte[] tempBuffer = new byte[tempBufferLen];
        System.arraycopy(contentBytes, offset, tempBuffer, 0, tempBufferLen);
        contentBytes = tempBuffer;
        offset = 0;
        length = tempBufferLen;
      }
      detector.Init(state);
      boolean isAscii = detector.isAscii(contentBytes, length);
      if (!isAscii) {
        isAscii = detector.DoIt(contentBytes, Math.min(length, MAX_CHARS_TO_DETECT), false);
      }
      detector.DataEnd();
      if (isAscii) {
        return null;
      } else if (state._detectedCharset != null) {
        return state._detectedCharset;
      } else {
        String prob[] = detector.getProbableCharsets();
        if (prob != null && prob.length != 0) {
          return prob[0];
        }
      }
    } else {
      // instantiate icu charset detector ...
      CharsetDetector detector = new CharsetDetector();
      DataInputBuffer buffer = new DataInputBuffer();
      buffer.reset(contentBytes, offset, length);
      try {
        detector.setText(buffer);
        CharsetMatch matches[] = detector.detectAll();
        if (matches != null && matches.length != 0) {
          int kThresold = 10;
          CharsetMatch bestMatch = null;
          for (int i = 0; i < matches.length; ++i) {
            if (bestMatch == null || matches[i].getConfidence() > bestMatch.getConfidence()) {
              bestMatch = matches[i];
            }
          }
          if (bestMatch != null) {
            return bestMatch.getName();
          } else {
            return matches[0].getName();
          }
        }
      } catch (Exception e) {
        LOG.error(CCStringUtils.stringifyException(e));
      } finally {
      }
    }
  }
  return null;
}
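The ICU branch above works because DataInputBuffer is a regular DataInputStream/InputStream, so it can be handed to stream-based APIs without copying the byte[]. A short illustrative sketch of that idea with ICU4J's com.ibm.icu.text.CharsetDetector (the detect() call and sample text are assumptions; the original uses detectAll() plus its own best-match loop):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

public class DetectCharsetFromBuffer {
  public static void main(String[] args) throws IOException {
    byte[] content = "Ein einfacher Beispieltext fuer die Erkennung".getBytes("ISO-8859-1");
    DataInputBuffer buffer = new DataInputBuffer();
    buffer.reset(content, 0, content.length);

    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer); // DataInputBuffer is consumed as an InputStream
    CharsetMatch match = detector.detect();
    if (match != null) {
      System.out.println("Detected:" + match.getName() + " confidence:" + match.getConfidence());
    }
  }
}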
From source file:org.commoncrawl.util.CompressedURLFPListV2.java
License:Open Source License
public static void main(String[] args) {
  // initialize ...
  final Configuration conf = new Configuration();
  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");
  BasicConfigurator.configure();
  CrawlEnvironment.setHadoopConfig(conf);
  try {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    Path testFile = new Path("crawl/linkdb/merged1282844121161/linkData/part-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, testFile, conf);
    URLFPV2 fp = new URLFPV2();
    BytesWritable bytes = new BytesWritable();
    while (reader.next(fp, bytes)) {
      if (bytes.getLength() != 0) {
        DataInputBuffer inputStream = new DataInputBuffer();
        inputStream.reset(bytes.get(), bytes.getLength());
        CompressedURLFPListV2.Reader listReader = new CompressedURLFPListV2.Reader(inputStream);
        while (listReader.hasNext()) {
          URLFPV2 nextFP = listReader.next();
          LOG.info("DH:" + nextFP.getDomainHash() + " UH:" + nextFP.getUrlHash());
        }
      } else {
        LOG.error("ZERO BYTE LIST!");
      }
    }
    reader.close();
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
  if (1 == 1)
    return;
  validateDuplicateChecking();
  // validateReallyBigList();
  validateURLFPSerializationRootDomain();
  validateURLFPSerializationSingleSubDomain();
  validateURLFPSerializationMultiDomain();
  validateURLFPFlagSerializationRootDomain();
  validateURLFPFlagSerializationMultipleSubDomains();
  validateURLFPFlagSerializationOneSubDomain();
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
static void scanToItemThenDisplayNext(FileSystem fs, Path path, Configuration conf, URLFPV2 targetItem)
    throws IOException {
  DataOutputBuffer rawKey = new DataOutputBuffer();
  DataInputBuffer keyDataStream = new DataInputBuffer();
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  ValueBytes valueBytes = reader.createValueBytes();
  int i = 0;
  while (reader.nextRawKey(rawKey) != -1) {
    URLFPV2 keyObject = new URLFPV2();
    keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
    keyObject.readFields(keyDataStream);
    rawKey.reset();
    reader.nextRawValue(valueBytes);
    if (keyObject.compareTo(targetItem) == 0) {
      reader.nextRawKey(rawKey);
      URLFPV2 nextKeyObject = new URLFPV2();
      keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
      nextKeyObject.readFields(keyDataStream);
      LOG.info("Target Domain:" + targetItem.getDomainHash() + " FP:" + targetItem.getUrlHash() + " NextDomain:"
          + nextKeyObject.getDomainHash() + " NextHash:" + nextKeyObject.getUrlHash());
      break;
    }
  }
  reader.close();
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
static void addFirstNFPItemsToSet(FileSystem fs, Path path, Configuration conf, Set<URLFPV2> outputSet, int nItems)
    throws IOException {
  DataOutputBuffer rawKey = new DataOutputBuffer();
  DataInputBuffer keyDataStream = new DataInputBuffer();
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  ValueBytes valueBytes = reader.createValueBytes();
  int i = 0;
  while (reader.nextRawKey(rawKey) != -1) {
    URLFPV2 keyObject = new URLFPV2();
    keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
    keyObject.readFields(keyDataStream);
    outputSet.add(keyObject);
    rawKey.reset();
    reader.nextRawValue(valueBytes);
    if (++i == nItems) {
      break;
    }
  }
  reader.close();
}
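Both MultiFileMergeUtils helpers use the same raw-key idiom: SequenceFile.Reader.nextRawKey() fills a reusable DataOutputBuffer with the serialized key, and a reusable DataInputBuffer is reset over it to rehydrate the Writable, while nextRawValue() skips the value without decoding it. A generic sketch of that pattern (the local demo path is hypothetical, and standard LongWritable/Text types stand in for the CommonCrawl URLFPV2):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class RawKeyScan {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path path = new Path("/tmp/rawKeyScanDemo.seq"); // hypothetical demo file

    // write a small file so the scan below has something to read
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, LongWritable.class, Text.class);
    for (long k = 0; k < 5; ++k) {
      writer.append(new LongWritable(k), new Text("value-" + k));
    }
    writer.close();

    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    SequenceFile.ValueBytes valueBytes = reader.createValueBytes();
    try {
      while (reader.nextRawKey(rawKey) != -1) {
        LongWritable key = new LongWritable();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        key.readFields(keyDataStream);
        rawKey.reset(); // reuse the output buffer for the next raw key
        reader.nextRawValue(valueBytes); // advance past the value without decoding it
        System.out.println("raw key decoded as:" + key.get());
      }
    } finally {
      reader.close();
    }
  }
}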
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
public static void main(String[] args) {
  // run some tests on the new code
  String aTestString = new String("A Test String");
  // convert it to bytes
  byte bytes[] = aTestString.getBytes();
  // over allocate an array
  byte overAllocated[] = new byte[bytes.length * 2];
  // copy source
  System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
  // now allocate a TextBytes
  TextBytes textBytes = new TextBytes();
  // set the overallocated buffer as the backing store
  textBytes.set(overAllocated, bytes.length, bytes.length);
  // convert it to string first
  String toString = textBytes.toString();
  // validate equal to original
  Assert.assertTrue(aTestString.equals(toString));
  // ok now write it to output buffer
  DataOutputBuffer outputBuffer = new DataOutputBuffer();
  // write string
  try {
    textBytes.write(outputBuffer);
    // read length
    DataInputBuffer inputBuffer = new DataInputBuffer();
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    int encodedLength = WritableUtils.readVInt(inputBuffer);
    // validate arrays match ...
    Assert.assertTrue(encodedLength == bytes.length);
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(),
        inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0);
    // ok reset input buffer again ...
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    // read in fields
    textBytes.readFields(inputBuffer);
    // ok see if we are not using the original backing store ...
    Assert.assertTrue(textBytes.getBytes() != overAllocated);
    // validate buffers match to original
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(),
        textBytes.getOffset(), textBytes.getLength()), 0);
  } catch (IOException e) {
    e.printStackTrace();
  }
}
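A compact sketch of the same round-trip idea using the standard org.apache.hadoop.io.Text class (TextBytes is CommonCrawl-specific): serialize a Writable into a DataOutputBuffer, wrap the result in a DataInputBuffer, skip the VInt length prefix, and compare the raw payload bytes before doing a full readFields round trip.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

public class WritableRoundTrip {
  public static void main(String[] args) throws IOException {
    byte[] raw = "A Test String".getBytes("UTF-8");
    Text original = new Text(raw);

    // serialize
    DataOutputBuffer out = new DataOutputBuffer();
    original.write(out);

    // read the length prefix and compare the payload bytes in place
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());
    int encodedLength = WritableUtils.readVInt(in);
    System.out.println("prefix says " + encodedLength + " bytes, expected " + raw.length);
    int cmp = WritableComparator.compareBytes(raw, 0, raw.length, out.getData(), in.getPosition(),
        out.getLength() - in.getPosition());
    System.out.println("payload bytes match: " + (cmp == 0));

    // full readFields round trip
    Text copy = new Text();
    in.reset(out.getData(), 0, out.getLength());
    copy.readFields(in);
    System.out.println("round-tripped value: " + copy);
  }
}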