Usage examples for org.apache.hadoop.io.DataInputBuffer.reset

public void reset(byte[] input, int length)

reset(byte[], int) re-points the DataInputBuffer at the first length bytes of input without copying, so subsequent DataInput reads (readInt, readLong, Writable.readFields, and so on) deserialize directly from that array.
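Before the project-specific examples, here is a minimal, self-contained sketch of the round-trip pattern most of them rely on: serialize Writables into a DataOutputBuffer, then reset a DataInputBuffer over the backing array to read them back. The class name and sample values are illustrative, not taken from any of the projects listed below.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class DataInputBufferResetExample {
  public static void main(String[] args) throws IOException {
    // Serialize a couple of Writables into an in-memory buffer.
    DataOutputBuffer out = new DataOutputBuffer();
    new Text("hello").write(out);
    new IntWritable(42).write(out);

    // Point a DataInputBuffer at the bytes just written. getData() returns
    // the backing array, which may be larger than the valid region, so the
    // length argument matters.
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());

    // Read the values back in the order they were written.
    Text text = new Text();
    text.readFields(in);
    IntWritable number = new IntWritable();
    number.readFields(in);

    System.out.println(text + " " + number.get()); // prints: hello 42
  }
}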
From source file:org.apache.tez.service.impl.ContainerRunnerImpl.java
License:Apache License
/**
 * Submit an entire work unit - containerId + TaskSpec.
 * This is intended for a task push from the AM
 *
 * @param request
 * @throws org.apache.tez.dag.api.TezException
 */
@Override
public void submitWork(SubmitWorkRequestProto request) throws TezException {
  LOG.info("Queuing work for execution: " + request);
  checkAndThrowExceptionForTests(request);

  Map<String, String> env = new HashMap<String, String>();
  env.putAll(localEnv);
  env.put(ApplicationConstants.Environment.USER.name(), request.getUser());

  String[] localDirs = new String[localDirsBase.length];
  // Setup up local dirs to be application specific, and create them.
  for (int i = 0; i < localDirsBase.length; i++) {
    localDirs[i] = createAppSpecificLocalDir(localDirsBase[i], request.getApplicationIdString(),
        request.getUser());
    try {
      localFs.mkdirs(new Path(localDirs[i]));
    } catch (IOException e) {
      throw new TezException(e);
    }
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Dirs are: " + Arrays.toString(localDirs));
  }

  // Setup workingDir. This is otherwise setup as Environment.PWD
  // Used for re-localization, to add the user specified configuration (conf_pb_binary_stream)
  String workingDir = localDirs[0];

  Credentials credentials = new Credentials();
  DataInputBuffer dib = new DataInputBuffer();
  byte[] tokenBytes = request.getCredentialsBinary().toByteArray();
  dib.reset(tokenBytes, tokenBytes.length);
  try {
    credentials.readTokenStorageStream(dib);
  } catch (IOException e) {
    throw new TezException(e);
  }

  Token<JobTokenIdentifier> jobToken = TokenCache.getSessionToken(credentials);

  // TODO Unregistering does not happen at the moment, since there's no signals on when an app completes.
  LOG.info("Registering request with the ShuffleHandler for containerId {}", request.getContainerIdString());
  ShuffleHandler.get().registerApplication(request.getApplicationIdString(), jobToken, request.getUser());

  TaskRunnerCallable callable = new TaskRunnerCallable(request, new Configuration(getConfig()),
      new ExecutionContextImpl(localAddress.get().getHostName()), env, localDirs, workingDir, credentials,
      memoryPerExecutor);
  ListenableFuture<ContainerExecutionResult> future = executorService.submit(callable);
  Futures.addCallback(future, new TaskRunnerCallback(request, callable));
}
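The part of this method that exercises reset() is the credentials decode: the serialized token blob arrives as a protobuf ByteString, and a DataInputBuffer is reset over its bytes so Credentials can consume it as a DataInputStream. A stripped-down sketch of just that pattern follows; the decode() wrapper and the tokenBytes parameter are illustrative names, not part of the Tez code above.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.security.Credentials;

public class CredentialsDecodeSketch {
  // Decode a Credentials object that was previously serialized with
  // Credentials.writeTokenStorageToStream(...) into a byte array.
  static Credentials decode(byte[] tokenBytes) throws IOException {
    Credentials credentials = new Credentials();
    DataInputBuffer dib = new DataInputBuffer();
    // reset() wraps tokenBytes directly; no copy is made.
    dib.reset(tokenBytes, tokenBytes.length);
    // DataInputBuffer extends DataInputStream, so it can be handed
    // straight to readTokenStorageStream.
    credentials.readTokenStorageStream(dib);
    return credentials;
  }
}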
From source file:org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.java
License:Open Source License
private static void compareKeys(RawComparator<TextBytes> comparator, TextBytes key1, TextBytes key2,
    int expectedResult) {
  long nanoStart = System.nanoTime();
  Assert.assertEquals(comparator.compare(key1, key2), expectedResult);
  long nanoEnd = System.nanoTime();
  System.out.println("Object Comparison Took:" + (nanoEnd - nanoStart));

  DataOutputBuffer outputBuffer1 = new DataOutputBuffer();
  DataOutputBuffer outputBuffer2 = new DataOutputBuffer();
  try {
    key1.write(outputBuffer1);
    key2.write(outputBuffer2);

    nanoStart = System.nanoTime();
    Assert.assertEquals(comparator.compare(outputBuffer1.getData(), 0, outputBuffer1.getLength(),
        outputBuffer2.getData(), 0, outputBuffer2.getLength()), expectedResult);
    nanoEnd = System.nanoTime();
    System.out.println("Raw Comparison Took:" + (nanoEnd - nanoStart));

    int offset1 = outputBuffer1.getLength();
    int offset2 = outputBuffer2.getLength();
    key1.write(outputBuffer1);
    key2.write(outputBuffer2);
    Assert.assertEquals(
        comparator.compare(outputBuffer1.getData(), offset1, outputBuffer1.getLength() - offset1,
            outputBuffer2.getData(), offset2, outputBuffer2.getLength() - offset2),
        expectedResult);

    if (comparator instanceof LinkKeyComparator) {
      DataInputBuffer inputStream1 = new DataInputBuffer();
      DataInputBuffer inputStream2 = new DataInputBuffer();

      inputStream1.reset(outputBuffer1.getData(), outputBuffer1.getLength());
      inputStream2.reset(outputBuffer2.getData(), outputBuffer2.getLength());

      CrawlDBKey cdbkey1 = new CrawlDBKey();
      CrawlDBKey cdbkey2 = new CrawlDBKey();

      cdbkey1.readFields(inputStream1);
      cdbkey2.readFields(inputStream2);

      CrawlDBKeyComparator altComparator = new CrawlDBKeyComparator();

      System.out.println("*Comparing Using CrawlDBKey Comparator");
      nanoStart = System.nanoTime();
      Assert.assertEquals(altComparator.compare(cdbkey1, cdbkey2), expectedResult);
      nanoEnd = System.nanoTime();
      System.out.println("Typed Comparison Took:" + (nanoEnd - nanoStart));
    }
  } catch (IOException e) {
    e.printStackTrace();
    throw new RuntimeException(e);
  }
}
From source file:org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java
License:Open Source License
private static void rawValueToTextBytes(DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer,
    TextBytes textOut) throws IOException {
  inputBuffer.reset(dataBuffer.getData(), dataBuffer.getLength());
  int newLength = WritableUtils.readVInt(inputBuffer);
  textOut.set(inputBuffer.getData(), inputBuffer.getPosition(), newLength);
}
From source file:org.commoncrawl.mapred.pipelineV3.crawllistgen.GenBundlesStep.java
License:Open Source License
private static void rawValueToWritable(RawRecordValue rawValue, DataInputBuffer inputBuffer, Writable typeOut)
    throws IOException {
  inputBuffer.reset(rawValue.data.getData(), rawValue.data.getLength());
  typeOut.readFields(inputBuffer);
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
License:Open Source License
private void iterateHDFSCrawlHistoryLog(long listId, long timestamp, TreeSet<URLFP> criteria,
    ItemUpdater targetList) throws IOException {

  // ok copy stuff locally if possible ...
  File localIndexPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".index");
  File localDataPath = new File(getLocalDataDir(), CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".data");
  File localBloomFilterPath = new File(getLocalDataDir(),
      CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp + ".bloom");

  SequenceFile.Reader reader = null;

  Path mapFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_LOGFILE_PREFIX + timestamp);
  Path indexFilePath = new Path(mapFilePath, "index");
  Path dataFilePath = new Path(mapFilePath, "data");
  Path bloomFilePath = new Path(_remoteDataDirectory, CRAWL_HISTORY_HDFS_BLOOMFILTER_PREFIX + timestamp);

  // ok copy local first
  if (!localIndexPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Index File:" + indexFilePath + " to Local:"
        + localIndexPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(indexFilePath, new Path(localIndexPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localIndexPath.delete();
      throw e;
    }
  }
  if (!localDataPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Data File:" + dataFilePath + " to Local:"
        + localDataPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(dataFilePath, new Path(localDataPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localDataPath.delete();
      throw e;
    }
  }
  if (!localBloomFilterPath.exists()) {
    LOG.info("LIST:" + listId + " Copying Bloom File:" + bloomFilePath + " to Local:"
        + localBloomFilterPath.getAbsolutePath());
    try {
      _remoteFileSystem.copyToLocalFile(bloomFilePath, new Path(localBloomFilterPath.getAbsolutePath()));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      localBloomFilterPath.delete();
      throw e;
    }
  }

  // ok open local
  FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

  SequenceFile.Reader indexReader = new SequenceFile.Reader(localFileSystem,
      new Path(localIndexPath.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());

  try {
    URLFP firstIndexKey = null;
    URLFP lastIndexKey = new URLFP();
    LongWritable position = new LongWritable();
    while (indexReader.next(lastIndexKey, position)) {
      if (firstIndexKey == null) {
        try {
          firstIndexKey = (URLFP) lastIndexKey.clone();
        } catch (CloneNotSupportedException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }

    LOG.info("LIST:" + listId + " ### Index First Domain:" + firstIndexKey.getDomainHash() + " URLHash:"
        + firstIndexKey.getUrlHash() + " Last Domain:" + lastIndexKey.getDomainHash() + " URLHash:"
        + lastIndexKey.getUrlHash());

    URLFP criteriaFirstKey = criteria.first();
    URLFP criteriaLastKey = criteria.last();

    if (firstIndexKey.compareTo(criteriaLastKey) > 0 || lastIndexKey.compareTo(criteriaFirstKey) < 0) {
      LOG.info("LIST:" + listId + " Entire Index is Out of Range. Skipping!");
      LOG.info("LIST:" + listId + " ### Criteria First Domain:" + criteriaFirstKey.getDomainHash()
          + " URLHash:" + criteriaFirstKey.getUrlHash() + " Last Domain:" + criteriaLastKey.getDomainHash()
          + " URLHash:" + criteriaLastKey.getUrlHash());
      return;
    }
  } finally {
    indexReader.close();
  }

  LOG.info("LIST:" + listId + " ### Index:" + timestamp + " Passed Test. Doing Full Scan");

  // load bloom filter
  FSDataInputStream bloomFilterStream = localFileSystem.open(new Path(localBloomFilterPath.getAbsolutePath()));

  int hitCount = 0;

  try {
    URLFPBloomFilter filter = URLFPBloomFilter.load(bloomFilterStream);

    URLFP fpOut = new URLFP();
    ProxyCrawlHistoryItem itemOut = new ProxyCrawlHistoryItem();
    DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
    ValueBytes valueBytes = null;
    DataInputBuffer valueReader = new DataInputBuffer();
    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();

    URLFP lastFP = null;

    outerLoop:
    // now iterate each item in the criteria
    for (URLFP targetFP : criteria) {
      // if fingerprint is present in filter ...
      if (filter.isPresent(targetFP)) {
        // check to see if reader is initialized ...
        if (reader == null) {
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initializing Reader for file at:"
              + localDataPath.getAbsolutePath());
          reader = new SequenceFile.Reader(localFileSystem, new Path(localDataPath.getAbsolutePath()),
              CrawlEnvironment.getHadoopConfig());
          LOG.info("LIST:" + listId + " BloomFilter First Hit. Initialized Reader for file at:"
              + localDataPath.getAbsolutePath());
          valueBytes = reader.createValueBytes();
        }

        // if last read fingerprint was not null ...
        if (lastFP != null) {
          // does it match the current item
          if (lastFP.compareTo(targetFP) == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + +lastFP.getUrlHash() + " File:"
                + dataFilePath);
            // if so, null out last fp
            lastFP = null;
            // and update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;

            continue;
          }
        }

        // ok at this point .. read the next item in the list ...
        lastFP = null;

        while (reader.nextRaw(keyBytes, valueBytes) != -1) {

          // init reader ...
          keyReader.reset(keyBytes.getData(), keyBytes.getLength());
          // read key
          fpOut.readFields(keyReader);
          // reset output buffer
          keyBytes.reset();

          // LOG.info("LIST:" + listId + " nextRaw Returned DH:" +
          // fpOut.getDomainHash() + " UH:" + fpOut.getUrlHash() + " TDH:" +
          // targetFP.getDomainHash() + " TUH:" + targetFP.getUrlHash());

          // compare it to target ...
          int result = fpOut.compareTo(targetFP);
          // ok does it match .. ?
          if (result == 0) {
            // decompress value bytes ...
            valueBytesUncompressed.reset();
            valueBytes.writeUncompressedBytes(valueBytesUncompressed);
            // init valueReader
            valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
            itemOut.readFields(valueReader);

            LOG.info("LIST:" + listId + " GOT HISTORY ITEM HIT. URL:" + fpOut.getUrlHash() + " File:"
                + dataFilePath);
            // update item state ...
            targetList.updateItemState(targetFP, itemOut);

            hitCount++;
            // and break to outer loop
            continue outerLoop;
          } else if (result == 1) {
            // LOG.info("LIST:" + listId +
            // " FP Comparison Returned 1. Going to OuterLoop");
            // update last FP
            lastFP = fpOut;
            // continue outer loop
            continue outerLoop;
          } else {
            // otherwise skip
          }
        }
        // ok if we got here .. we are done reading the sequence file and did
        // not find a trailing match
        LOG.warn("LIST:" + listId
            + " ### Reached End Of File Searching for item in MapFile while BloomFilter returned positive result (DomainHash:"
            + targetFP.getDomainHash() + "FP:" + targetFP.getUrlHash() + ")");
        // break out of outer loop
        break;
      }
    }
  } finally {
    bloomFilterStream.close();

    if (reader != null) {
      reader.close();
    }

    LOG.info("LIST:" + listId + " File:" + dataFilePath + " DONE. HitCount:" + hitCount);
  }
}
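Two reset() targets are combined in the method above: raw key bytes pulled out of a SequenceFile with nextRaw(), and value bytes decompressed into a scratch DataOutputBuffer. The reduced sketch below shows just that raw-read loop in isolation; the scan() wrapper and the Text key/value types are illustrative stand-ins for the example's URLFP and ProxyCrawlHistoryItem classes.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class RawSequenceFileScan {
  // Scan a sequence file of Text keys/values using the raw record API,
  // decoding records through DataInputBuffer.reset rather than reader.next().
  static void scan(Configuration conf, Path path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);

    DataOutputBuffer keyBytes = new DataOutputBuffer();
    DataOutputBuffer valueBytesUncompressed = new DataOutputBuffer();
    DataInputBuffer keyReader = new DataInputBuffer();
    DataInputBuffer valueReader = new DataInputBuffer();
    SequenceFile.ValueBytes valueBytes = reader.createValueBytes();

    Text key = new Text();
    Text value = new Text();
    try {
      while (reader.nextRaw(keyBytes, valueBytes) != -1) {
        // Decode the key from the raw bytes appended by nextRaw().
        keyReader.reset(keyBytes.getData(), keyBytes.getLength());
        key.readFields(keyReader);
        keyBytes.reset(); // empty the scratch buffer for the next record

        // Materialize the value bytes, then decode them the same way.
        valueBytesUncompressed.reset();
        valueBytes.writeUncompressedBytes(valueBytesUncompressed);
        valueReader.reset(valueBytesUncompressed.getData(), valueBytesUncompressed.getLength());
        value.readFields(valueReader);

        System.out.println(key + "\t" + value);
      }
    } finally {
      reader.close();
    }
  }
}

The reason to prefer nextRaw() over next() in the original code is that it lets the caller skip deserialization entirely for records it does not care about, only paying the readFields cost on bloom-filter hits.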
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private final int getOffsetForSubDomainData(long domainHash) throws IOException {
  DataInputBuffer inputBuffer = new DataInputBuffer();

  int low = 0;
  int high = (int) (_offsetLookupTable.getLength() / OFFSET_TABLE_ENTRY_SIZE) - 1;

  while (low <= high) {
    int mid = low + ((high - low) / 2);

    inputBuffer.reset(_offsetLookupTable.getData(), _offsetLookupTable.getLength());
    inputBuffer.skip(mid * OFFSET_TABLE_ENTRY_SIZE);

    // deserialize
    long hash = inputBuffer.readLong();

    // now compare it against desired hash value ...
    int comparisonResult = ((Long) hash).compareTo(domainHash);

    if (comparisonResult > 0)
      high = mid - 1;
    else if (comparisonResult < 0)
      low = mid + 1;
    else {
      return inputBuffer.readInt();
    }
  }
  throw new IOException("NOT-FOUND!");
}
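This method binary-searches a DataOutputBuffer holding fixed-width (long hash, int offset) entries, re-resetting the DataInputBuffer and skipping to the middle entry on every probe. Below is a self-contained sketch of that packed-table lookup with made-up table contents; ENTRY_SIZE mirrors the example's OFFSET_TABLE_ENTRY_SIZE, assumed here to be 12 bytes (an 8-byte long plus a 4-byte int, matching the writeLong/writeInt pairs written elsewhere in CrawlList).

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class PackedOffsetTableLookup {
  static final int ENTRY_SIZE = 12; // one long (8 bytes) + one int (4 bytes) per entry

  // Binary search a buffer of (hash, offset) entries sorted by hash.
  static int findOffset(DataOutputBuffer table, long wantedHash) throws IOException {
    DataInputBuffer in = new DataInputBuffer();
    int low = 0;
    int high = table.getLength() / ENTRY_SIZE - 1;
    while (low <= high) {
      int mid = low + (high - low) / 2;
      // Re-point the reader at the table and skip to the mid entry.
      in.reset(table.getData(), table.getLength());
      in.skip((long) mid * ENTRY_SIZE);
      long hash = in.readLong();
      if (hash > wantedHash) {
        high = mid - 1;
      } else if (hash < wantedHash) {
        low = mid + 1;
      } else {
        return in.readInt(); // the offset stored right after the matching hash
      }
    }
    throw new IOException("NOT-FOUND!");
  }

  public static void main(String[] args) throws IOException {
    DataOutputBuffer table = new DataOutputBuffer();
    // Three entries, sorted by hash.
    long[] hashes = { 10L, 20L, 30L };
    int[] offsets = { 100, 200, 300 };
    for (int i = 0; i < hashes.length; i++) {
      table.writeLong(hashes[i]);
      table.writeInt(offsets[i]);
    }
    System.out.println(findOffset(table, 20L)); // prints 200
  }
}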
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void resetSubDomainCounts() throws IOException {
  LOG.info("*** LIST:" + getListId() + " Reset SubDomain Queued Counts.");

  if (_subDomainMetadataFile.exists()) {
    LOG.info("*** LIST:" + getListId() + " FILE EXISTS .");

    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    DataInputBuffer inputBuffer = new DataInputBuffer();
    DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);

    try {
      // skip version
      file.read();
      // read item count
      int itemCount = file.readInt();

      LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

      CrawlListMetadata newMetadata = new CrawlListMetadata();

      for (int i = 0; i < itemCount; ++i) {

        long orignalPos = file.getFilePointer();
        file.readFully(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
        inputBuffer.reset(outputBuffer.getData(), CrawlListMetadata.Constants.FixedDataSize);
        try {
          newMetadata.deserialize(inputBuffer, new BinaryProtocol());
        } catch (Exception e) {
          LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
              + CCStringUtils.stringifyException(e));
        }

        // ok reset everything except hashes and first/last url pointers
        int urlCount = newMetadata.getUrlCount();
        long firstRecordOffset = newMetadata.getFirstRecordOffset();
        long lastRecordOffset = newMetadata.getLastRecordOffset();
        String domainName = newMetadata.getDomainName();
        long domainHash = newMetadata.getDomainHash();

        // reset
        newMetadata.clear();
        // restore
        newMetadata.setUrlCount(urlCount);
        newMetadata.setFirstRecordOffset(firstRecordOffset);
        newMetadata.setLastRecordOffset(lastRecordOffset);
        newMetadata.setDomainName(domainName);
        newMetadata.setDomainHash(domainHash);

        // serialize it ...
        outputBuffer.reset();
        newMetadata.serialize(outputBuffer, new BinaryProtocol());

        // write it back to disk
        file.seek(orignalPos);
        // and rewrite it ...
        file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
      }
    } finally {
      file.close();
    }
    LOG.info("*** LIST:" + getListId() + " DONE RESETTING SUBDOMAIN METADATA QUEUE COUNTS");
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void loadSubDomainMetadataFromDisk() throws IOException {
  LOG.info("*** LIST:" + getListId() + " LOAD SUBDOMAIN METADATA FROM DISK ... ");

  if (_subDomainMetadataFile.exists()) {

    LOG.info("*** LIST:" + getListId() + " FILE EXISTS LOADING SUBDOMAIN DATA FROM DISK.");

    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    DataInputBuffer inputBuffer = new DataInputBuffer();
    byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

    try {
      // skip version
      file.read();
      // read item count
      int itemCount = file.readInt();

      LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

      CrawlListMetadata newMetadata = new CrawlListMetadata();

      TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
      for (int i = 0; i < itemCount; ++i) {

        long orignalPos = file.getFilePointer();
        file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
        inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
        try {
          newMetadata.deserialize(inputBuffer, new BinaryProtocol());
        } catch (Exception e) {
          LOG.error("-----Failed to Deserialize Metadata at Index:" + i + " Exception:"
              + CCStringUtils.stringifyException(e));
        }
        idToOffsetMap.put(newMetadata.getDomainHash(), (int) orignalPos);
      }

      // write lookup table
      _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);

      for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
        _offsetLookupTable.writeLong(entry.getKey());
        _offsetLookupTable.writeInt(entry.getValue());
      }
    } finally {
      file.close();
    }
    LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");
  } else {

    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA DOES NOT EXIST! LOADING FROM SCRATCH");

    RandomAccessFile fixedDataReader = new RandomAccessFile(_fixedDataFile, "rw");
    RandomAccessFile stringDataReader = new RandomAccessFile(_variableDataFile, "rw");

    try {
      // ok rebuild top level metadata as well
      _metadata.clear();

      OnDiskCrawlHistoryItem item = new OnDiskCrawlHistoryItem();

      int processedCount = 0;
      while (fixedDataReader.getFilePointer() != fixedDataReader.length()) {

        long position = fixedDataReader.getFilePointer();

        // store offset in item
        item._fileOffset = position;
        // load from disk
        item.deserialize(fixedDataReader);
        try {
          // seek to string data
          stringDataReader.seek(item._stringsOffset);
          // and skip buffer length
          WritableUtils.readVInt(stringDataReader);
          // and read primary string
          String url = stringDataReader.readUTF();

          // get metadata object for subdomain
          CrawlListMetadata subDomainMetadata = getTransientSubDomainMetadata(url);

          // increment url count
          subDomainMetadata.setUrlCount(subDomainMetadata.getUrlCount() + 1);
          // increment top level metadata count
          _metadata.setUrlCount(_metadata.getUrlCount() + 1);

          // update top level metadata ..
          updateMetadata(item, _metadata, 0);

          // update sub-domain metadata object from item data
          updateMetadata(item, subDomainMetadata, 0);

          ++processedCount;
        } catch (IOException e) {
          LOG.error("Exception Reading String Data For Item:" + (processedCount + 1));
          LOG.error("Exception:" + CCStringUtils.stringifyException(e));
          LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
              + stringDataReader.getFilePointer());
        }

        if (processedCount % 10000 == 0) {
          LOG.info("*** LIST:" + getListId() + " Processed:" + processedCount + " Items");
        }
      }

      // ok commit top level metadata to disk as well
      writeMetadataToDisk();

    } catch (IOException e) {
      LOG.error("Encountered Exception Queueing Items for List:" + _listId + " Exception:"
          + CCStringUtils.stringifyException(e));
      LOG.error("File Position:" + fixedDataReader.getFilePointer() + " StringsPointer:"
          + stringDataReader.getFilePointer());
      _queueState = QueueState.QUEUED;
    } finally {
      fixedDataReader.close();
      stringDataReader.close();
    }

    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITING TO DISK");
    // write metadata to disk
    writeInitialSubDomainMetadataToDisk();
    LOG.info("*** LIST:" + getListId() + " SUBDOMAIN METADATA REBUILT FROM LIST DATA . WRITE COMPLETE");
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
public ArrayList<CrawlListDomainItem> getSubDomainList(int offset, int count) {
  synchronized (_metadata) {

    ArrayList<CrawlListDomainItem> itemsOut = new ArrayList<CrawlListDomainItem>();

    try {
      synchronized (_subDomainMetadataFile) {
        RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
        DataInputBuffer inputBuffer = new DataInputBuffer();
        byte fixedDataBlock[] = new byte[CrawlListMetadata.Constants.FixedDataSize];

        try {
          // skip version
          file.read();
          // read item count
          int itemCount = file.readInt();

          int i = offset;
          int end = Math.min(i + count, itemCount);

          LOG.info("*** LIST:" + getListId() + " SUBDOMAIN ITEM COUNT:" + itemCount);

          if (i < itemCount) {

            file.seek(5 + (CrawlListMetadata.Constants.FixedDataSize * offset));

            CrawlListMetadata newMetadata = new CrawlListMetadata();

            for (; i < end; ++i) {

              long orignalPos = file.getFilePointer();
              file.readFully(fixedDataBlock, 0, fixedDataBlock.length);
              inputBuffer.reset(fixedDataBlock, fixedDataBlock.length);
              newMetadata.deserialize(inputBuffer, new BinaryProtocol());
              itemsOut.add(buildSubDomainSummary(newMetadata.getDomainName(), newMetadata));
            }
          }
        } finally {
          file.close();
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
    LOG.info("*** LIST:" + getListId() + " DONE LOADING SUBDOMAIN DATA FROM DISK");

    return itemsOut;
  }
}
From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java
License:Open Source License
public static void main(String[] args) {

  if (args.length != 3) {
    LOG.error("args: [candidate Timestamp] [drive count] [query string]");
  }

  // initialize ...
  final Configuration conf = new Configuration();

  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");

  BasicConfigurator.configure();

  CrawlEnvironment.setHadoopConfig(conf);

  long candidateTS = Long.parseLong(args[0]);
  int driveCount = Integer.parseInt(args[1]);
  String queryString = args[2];

  try {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

    MasterDatabaseIndex masterIndex = new MasterDatabaseIndex(conf, fs, driveCount, candidateTS, null);
    SlaveDatabaseIndex slaveIndex = new SlaveDatabaseIndex(conf, fs, candidateTS);

    // ok hit the domain against the master index first ...
    LOG.info("Querying master index for DomainId Given DomainName:" + queryString);
    long domainId = masterIndex.queryDomainIdGivenDomain(queryString);

    LOG.info("Querying master index for DomainMetadata Given DomainId:" + domainId);
    SubDomainMetadata subDomainMeta = masterIndex.queryDomainMetadataGivenDomainId(domainId);

    if (subDomainMeta != null) {
      LOG.info("Metadata is present. Deserializing");
      // dump some fields ...
      LOG.info("Domain:" + subDomainMeta.getDomainText() + " URLCount:" + subDomainMeta.getUrlCount()
          + " FetchedCount:" + subDomainMeta.getFetchedCount() + " PageRankCount:"
          + subDomainMeta.getHasPageRankCount());

      // ok time to dive into a url list ...
      // query for a list of urls sorted by name
      LOG.info("Querying for URLList for Domain BY PR");
      FlexBuffer urlListBufferByPR = slaveIndex.queryURLListSortedByPR(domainId);

      if (urlListBufferByPR != null) {
        // read the list ...
        DataInputBuffer readerStream = new DataInputBuffer();
        readerStream.reset(urlListBufferByPR.get(), urlListBufferByPR.getCount());

        int totalItemCount = urlListBufferByPR.getCount() / 8;

        System.out.println("List BY PR totalCount:" + totalItemCount);

        // initialize a fingerprint object to use for queries ...
        URLFPV2 queryFP = new URLFPV2();
        queryFP.setDomainHash(domainId);

        DataInputBuffer metadataReaderStream = new DataInputBuffer();

        // iterate the first N items ranked by page rank
        for (int i = 0; i < Math.min(10, totalItemCount); ++i) {

          queryFP.setUrlHash(readerStream.readLong());

          // and for metadata
          MetadataOut urlMetadata = masterIndex.queryMetadataAndURLGivenFP(queryFP);

          if (urlMetadata != null) {
            // decode the url
            String url = urlMetadata.url.toString();

            System.out.println("URL for FP:" + queryFP.getUrlHash() + " is:" + url);

            if (urlMetadata.datumAndMetadataBytes.getLength() == 0) {
              System.out.println("URL for FP:" + queryFP.getUrlHash() + " had no METADATA!!");
            } else {
              // explode metadata
              CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
              metadataReaderStream.reset(urlMetadata.datumAndMetadataBytes.getBytes(),
                  urlMetadata.datumAndMetadataBytes.getOffset(),
                  urlMetadata.datumAndMetadataBytes.getLength());

              metadataObject.readFields(metadataReaderStream);

              // ok at this point spit out stuff for this url
              StringBuilder urlInfo = new StringBuilder();

              urlInfo.append(" FetchStatus:" + CrawlDatum.getStatusName(metadataObject.getStatus()) + "\n");
              urlInfo.append(" PageRank:" + metadataObject.getMetadata().getPageRank() + "\n");
              urlInfo.append(" ContentType:" + metadataObject.getMetadata().getContentType() + "\n");
              urlInfo.append(" ArcFileInfoCount:" + metadataObject.getMetadata().getArchiveInfo().size());
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) {
                urlInfo.append(" HasLinkDataInfo:" + metadataObject.getMetadata().getLinkDBFileNo() + ":"
                    + metadataObject.getMetadata().getLinkDBOffset());
              }
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                urlInfo.append(" HasINVLinkDataInfo:" + metadataObject.getMetadata().getInverseDBFileNo() + ":"
                    + metadataObject.getMetadata().getInverseDBOffset());
              }

              System.out.println(urlInfo.toString());

              // now if inverse link data is present ..
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                // get it ...
                System.out.println("Querying for Inlinks for FP:" + queryFP.getUrlHash());
                FlexBuffer inlinks = slaveIndex.queryInlinksByFP(queryFP,
                    metadataObject.getMetadata().getInverseDBFileNo(),
                    metadataObject.getMetadata().getInverseDBOffset());

                if (inlinks != null) {
                  System.out.println("Found Inlink Buffer of Size:" + inlinks.getCount());

                  FileSystem localFS = FileSystem.getLocal(conf);

                  File testDir = new File("/tmp/dbIndexTest");
                  File testFile = new File("/tmp/dbIndexTestFile");
                  localFS.delete(new Path(testDir.getAbsolutePath()), true);
                  localFS.delete(new Path(testFile.getAbsolutePath()), false);
                  localFS.mkdirs(new Path(testDir.getAbsolutePath()));

                  LOG.info("Creating Spill File of Inlinks");
                  spillLinkDataIntoTempFileIndex(fs, localFS, conf, masterIndex, testDir,
                      new Path(testFile.getAbsolutePath()), inlinks);
                  LOG.info("Created Spill File of Inlinks");

                  LOG.info("Reading Inlinks");
                  // ok now open it up and dump the first few inlinks from the
                  // spill file
                  SequenceFile.Reader reader = new SequenceFile.Reader(localFS,
                      new Path(testFile.getAbsolutePath()), conf);

                  TextBytes key = new TextBytes();
                  TriTextBytesTuple value = new TriTextBytesTuple();
                  CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
                  DataInputBuffer inputBuffer = new DataInputBuffer();

                  try {
                    int itemCount = 0;
                    while (reader.next(key, value)) {
                      if (value.getThirdValue().getLength() != 0) {
                        inputBuffer.reset(value.getThirdValue().getBytes(), 0,
                            value.getThirdValue().getLength());
                        metadata.readFields(inputBuffer);
                        System.out.println("INLINK:" + key.toString() + " METADATA STATUS:"
                            + CrawlDatum.getStatusName(metadata.getStatus()));
                      } else {
                        System.out.println("INLINK:" + key.toString() + " NOMETADATA");
                      }
                      if (++itemCount == 500) {
                        break;
                      }
                    }
                  } finally {
                    reader.close();
                  }
                  LOG.info("Done Reading Inlinks");
                }
              }
            }
          } else {
            LOG.error("Query for FP:" + queryFP.getUrlHash() + " returned NULL URL");
          }
        }
      }
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}