List of usage examples for org.apache.hadoop.io.DataOutputBuffer.getLength()
public int getLength()
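DataOutputBuffer is Hadoop's reusable, in-memory DataOutputStream: getData() exposes the backing byte array and getLength() reports how many of those bytes are valid, which is why nearly every example below passes the pair getData()/getLength() to a write, checksum, or comparison call. A minimal sketch of that pattern, assuming only org.apache.hadoop.io.DataOutputBuffer on the classpath (the class name and values are illustrative, not from the examples below):

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.DataOutputBuffer;

public class GetLengthExample {
  public static void main(String[] args) throws IOException {
    DataOutputBuffer buffer = new DataOutputBuffer();
    buffer.writeLong(42L);      // inherited DataOutputStream write methods
    buffer.writeUTF("hello");
    // getLength() counts only the valid bytes; getData() may return a larger backing array
    System.out.println("valid bytes: " + buffer.getLength());
    byte[] valid = Arrays.copyOf(buffer.getData(), buffer.getLength());
    System.out.println("copied " + valid.length + " bytes");
    buffer.reset();             // reuse the buffer; getLength() drops back to 0
  }
}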
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/**
 * Initialize a new CrawlList object from a given input stream of urls
 *
 * @param manager - reference to the crawl history log manager
 * @param urlInputStream - the input stream containing the list of urls that we should add to this list ...
 * @throws IOException
 */
public CrawlList(CrawlHistoryStorage manager, long listId, File sourceURLFile, int refreshInterval)
    throws IOException {
  _manager = manager;
  _listState = LoadState.REALLY_LOADING;
  // initialize a new list id
  _listId = listId;
  LOG.info("*** LIST:" + getListId() + " LOADING FROM SOURCE FILE:" + sourceURLFile.getAbsolutePath());
  // establish file names
  initializeListFileNames();
  sourceURLFile.renameTo(_listURLDataFile);
  FileInputStream urlInputStream = new FileInputStream(_listURLDataFile);
  try {
    // set we will use to hold all fingerprints generated
    TreeSet<URLFP> urlSet = new TreeSet<URLFP>();
    // create temp files ...
    File spillOutputFile = File.createTempFile("spillOut", Long.toString(_listId));
    // create mergesortspillwriter
    SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem> spillwriter = new SequenceFileSpillWriter<URLFP, ProxyCrawlHistoryItem>(
        FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()), CrawlEnvironment.getHadoopConfig(),
        new Path(spillOutputFile.getAbsolutePath()), URLFP.class, ProxyCrawlHistoryItem.class, null, false);
    try {
      MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem> merger = new MergeSortSpillWriter<URLFP, ProxyCrawlHistoryItem>(
          CrawlEnvironment.getHadoopConfig(), spillwriter,
          FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
          new Path(manager.getLocalDataDir().getAbsolutePath()), null,
          new RawKeyValueComparator<URLFP, ProxyCrawlHistoryItem>() {

            DataInputBuffer _key1Buffer = new DataInputBuffer();
            DataInputBuffer _key2Buffer = new DataInputBuffer();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {
              _key1Buffer.reset(key1Data, key1Offset, key1Length);
              _key2Buffer.reset(key2Data, key2Offset, key2Length);
              _key1Buffer.skip(2); // skip version, and 1 byte id
              _key2Buffer.skip(2); // skip version, and 1 byte id
              int domainHash1 = WritableUtils.readVInt(_key1Buffer);
              int domainHash2 = WritableUtils.readVInt(_key2Buffer);
              _key1Buffer.skip(1); // skip 1 byte id
              _key2Buffer.skip(1); // skip 1 byte id
              long fingerprint1 = WritableUtils.readVLong(_key1Buffer);
              long fingerprint2 = WritableUtils.readVLong(_key2Buffer);
              int result = ((Integer) domainHash1).compareTo(domainHash2);
              if (result == 0) {
                result = ((Long) fingerprint1).compareTo(fingerprint2);
              }
              return result;
            }

            @Override
            public int compare(URLFP key1, ProxyCrawlHistoryItem value1, URLFP key2,
                ProxyCrawlHistoryItem value2) {
              return key1.compareTo(key2);
            }
          }, URLFP.class, ProxyCrawlHistoryItem.class, false, null);

      try {
        LOG.info("*** LIST:" + getListId() + " Starting Scan of URLS In List");
        BufferedReader reader = new BufferedReader(
            new InputStreamReader(urlInputStream, Charset.forName("UTF-8")));
        String line = null;
        int lineNumber = 0;
        ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
        while ((line = reader.readLine()) != null) {
          ++lineNumber;
          if (line.length() != 0 && !line.startsWith("#")) {
            URLFP fingerprint = URLUtils.getURLFPFromURL(line, true);
            if (fingerprint != null) {
              if (!urlSet.contains(fingerprint)) {
                // and add fingerprint to set
                urlSet.add(fingerprint);
                // initialize item
                item.clear();
                item.setOriginalURL(line);
                // and spill to merger / sorter ..
                merger.spillRecord(fingerprint, item);
              }
            } else {
              LOG.error("*** LIST:" + getListId() + " Invalid URL Encounered at Line:" + lineNumber + " URL"
                  + line);
            }
          }
        }
        LOG.info("*** LIST:" + getListId() + " Completed Scan of:" + urlSet.size() + " URLS");
      } finally {
        merger.close();
      }
    } finally {
      if (spillwriter != null)
        spillwriter.close();
    }

    LOG.info("*** LIST:" + getListId() + " Generating BloomFilter for:" + urlSet.size() + " keys");
    // generate bloom filter ...
    _bloomFilter = new URLFPBloomFilter(urlSet.size(), 7, 10);
    for (URLFP fingerprint : urlSet) {
      _bloomFilter.add(fingerprint);
    }
    LOG.info("*** LIST:" + getListId() + " Serializing BloomFilter");
    // serialize it
    FileOutputStream bloomFilterStream = new FileOutputStream(_bloomFilterData);
    try {
      _bloomFilter.serialize(bloomFilterStream);
    } finally {
      bloomFilterStream.flush();
      bloomFilterStream.close();
    }
    LOG.info("*** LIST:" + getListId() + " Starting Read of Merged Sequence File:" + spillOutputFile);
    // now initialize value map and string maps based on output sequence file ...
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(CrawlEnvironment.getHadoopConfig()),
        new Path(spillOutputFile.getAbsolutePath()), CrawlEnvironment.getHadoopConfig());
    LOG.info("*** LIST:" + getListId() + " PRE-ALLOCATING FIXED DATA BUFFER OF SIZE:"
        + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    // OK, Allocate room for fixed data file upfront
    DataOutputBuffer valueStream = new DataOutputBuffer(urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE);
    LOG.info("*** LIST:" + getListId() + " ALLOCATION SUCCEEDED");
    try {
      //DataOutputStream valueStream = new DataOutputStream(new FileOutputStream(_fixedDataFile));
      RandomAccessFile stringsStream = new RandomAccessFile(_variableDataFile, "rw");
      try {
        URLFP urlFP = new URLFP();
        ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
        // read fingerprints ...
        while (reader.next(urlFP, item)) {
          // write out fixed data structure and strings
          writeInitialOnDiskItem(urlFP, item, valueStream, stringsStream);
        }
      } finally {
        //valueStream.flush();
        //valueStream.close();
        stringsStream.close();
      }
    } finally {
      reader.close();
    }
    LOG.info("*** LIST:" + getListId() + " Finished Writing Initial Values to Disk");
    LOG.info("*** LIST:" + getListId() + " FIXED DATA BUFFER OF SIZE:" + valueStream.getLength()
        + " EXCEPECTED SIZE:" + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    if (valueStream.getLength() != (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE)) {
      throw new IOException("Final FixedItemData Buffer Size:" + valueStream.getLength() + " != URLSetSize:"
          + (urlSet.size() * OnDiskCrawlHistoryItem.ON_DISK_SIZE));
    }
    // initialize temp data buffer variables
    _tempFixedDataBuffer = valueStream.getData();
    _tempFixedDataBufferSize = valueStream.getLength();
    // update metadata
    _metadata.setRefreshInterval(refreshInterval);
    _metadata.setUrlCount(urlSet.size());
    // setup version
    _metadata.setVersion(1);
    // and write to disk
    writeMetadataToDisk();
    // mark state as loaded ...
    _listState = LoadState.LOADED;
    LOG.info("*** LIST:" + getListId() + " SYNCING");
    // reconcile with history log
    _manager.syncList(this.getListId(), urlSet, this);
    LOG.info("*** LIST:" + getListId() + " SYNC COMPLETE");
    // write metadata to disk again
    writeMetadataToDisk();
    LOG.info("*** LIST:" + getListId() + " FLUSHING FIXED DATA");
    // and finally flush fixed data to disk
    FileOutputStream finalDataStream = new FileOutputStream(_fixedDataFile);
    try {
      synchronized (this) {
        int blockSize = 1 << 20;
        long bytesCopied = 0;
        for (int offset = 0; offset < _tempFixedDataBufferSize; offset += blockSize) {
          int bytesToCopy = Math.min(blockSize, _tempFixedDataBufferSize - offset);
          finalDataStream.write(_tempFixedDataBuffer, offset, bytesToCopy);
          bytesCopied += bytesToCopy;
        }
        // validate bytes copied
        if (bytesCopied != _tempFixedDataBufferSize) {
          throw new IOException(
              "Buffer Size:" + _tempFixedDataBufferSize + " Does not Match BytesCopied:" + bytesCopied);
        }
        // ok release the buffer
        _tempFixedDataBuffer = null;
        _tempFixedDataBufferSize = 0;
        LOG.info("*** LIST:" + getListId() + " FIXED DATA FLUSH COMPLETE");
      }
    } finally {
      finalDataStream.flush();
      finalDataStream.close();
    }
    // load sub domain metadata from disk ...
    loadSubDomainMetadataFromDisk();
  } catch (IOException e) {
    LOG.error("*** LIST:" + getListId() + " Crawl List Initialization Failed With Exception:"
        + CCStringUtils.stringifyException(e));
    _fixedDataFile.delete();
    _variableDataFile.delete();
    _bloomFilterData.delete();
    _listState = LoadState.ERROR;
    throw e;
  } finally {
    urlInputStream.close();
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
private int calculateStringCRC(ProxyCrawlHistoryItem item, DataOutputBuffer stringBuffer) throws IOException {
  stringBuffer.reset();
  stringBuffer.writeUTF(item.getOriginalURL());
  if (item.isFieldDirty(ProxyCrawlHistoryItem.Field_REDIRECTURL)) {
    stringBuffer.writeUTF(item.getRedirectURL());
  }
  _stringCRC.reset();
  _stringCRC.update(stringBuffer.getData(), 0, stringBuffer.getLength());
  return (int) _stringCRC.getValue();
}
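The pattern above - serialize into a reusable DataOutputBuffer, then checksum only the first getLength() bytes of getData() - can be shown in isolation. A minimal sketch, assuming only org.apache.hadoop.io.DataOutputBuffer and java.util.zip.CRC32 (the URL string is illustrative; CRC32 stands in for the _stringCRC field above):

DataOutputBuffer stringBuffer = new DataOutputBuffer();
CRC32 crc = new CRC32();
stringBuffer.reset();                                    // reuse the buffer between records
stringBuffer.writeUTF("http://example.com/index.html");  // hypothetical URL
crc.reset();
// getData() returns the whole backing array; getLength() bounds it to the valid bytes
crc.update(stringBuffer.getData(), 0, stringBuffer.getLength());
int checksum = (int) crc.getValue();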
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
/**
 * serialize metadata to disk
 * @throws IOException
 */
void writeSubDomainMetadataToDisk(CrawlListMetadata subDomainData) throws IOException {
  DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
  subDomainData.serialize(outputBuffer, new BinaryProtocol());
  if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
    LOG.error("ListMetadata Serialize for List:" + subDomainData.getDomainName() + " > FixedDataSize!!!");
    outputBuffer.reset();
    subDomainData.setDomainName("<<CORRUPT>>");
    subDomainData.serialize(outputBuffer, new BinaryProtocol());
  }
  synchronized (_subDomainMetadataFile) {
    RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
    try {
      if (subDomainData.getSubDomainDataOffset() == 0) {
        throw new IOException("Data Offset Zero during write!");
      }
      file.seek(subDomainData.getSubDomainDataOffset());
      file.write(outputBuffer.getData(), 0, outputBuffer.getLength());
    } finally {
      file.close();
    }
  }
}
From source file:org.commoncrawl.service.listcrawler.CrawlList.java
License:Open Source License
void writeInitialSubDomainMetadataToDisk() throws IOException {
  RandomAccessFile file = new RandomAccessFile(_subDomainMetadataFile, "rw");
  try {
    file.writeByte(0); // version
    file.writeInt(_transientSubDomainStats.size());
    ArrayList<CrawlListMetadata> sortedMetadata = new ArrayList<CrawlListMetadata>();
    sortedMetadata.addAll(_transientSubDomainStats.values());
    _transientSubDomainStats = null;
    CrawlListMetadata metadataArray[] = sortedMetadata.toArray(new CrawlListMetadata[0]);
    Arrays.sort(metadataArray, new Comparator<CrawlListMetadata>() {

      @Override
      public int compare(CrawlListMetadata o1, CrawlListMetadata o2) {
        int result = ((Integer) o2.getUrlCount()).compareTo(o1.getUrlCount());
        if (result == 0) {
          result = o1.getDomainName().compareTo(o2.getDomainName());
        }
        return result;
      }
    });

    DataOutputBuffer outputBuffer = new DataOutputBuffer(CrawlListMetadata.Constants.FixedDataSize);
    TreeMap<Long, Integer> idToOffsetMap = new TreeMap<Long, Integer>();
    for (CrawlListMetadata entry : metadataArray) {
      // reset output buffer
      outputBuffer.reset();
      // write item to disk
      entry.serialize(outputBuffer, new BinaryProtocol());
      if (outputBuffer.getLength() > CrawlListMetadata.Constants.FixedDataSize) {
        LOG.fatal("Metadata Serialization for List:" + getListId() + " SubDomain:" + entry.getDomainName());
        System.out.println(
            "Metadata Serialization for List:" + getListId() + " SubDomain:" + entry.getDomainName());
      }
      // save offset
      idToOffsetMap.put(entry.getDomainHash(), (int) file.getFilePointer());
      // write out fixed data size
      file.write(outputBuffer.getData(), 0, CrawlListMetadata.Constants.FixedDataSize);
    }

    // write lookup table
    _offsetLookupTable = new DataOutputBuffer(idToOffsetMap.size() * OFFSET_TABLE_ENTRY_SIZE);
    for (Map.Entry<Long, Integer> entry : idToOffsetMap.entrySet()) {
      _offsetLookupTable.writeLong(entry.getKey());
      _offsetLookupTable.writeInt(entry.getValue());
    }
  } finally {
    file.close();
  }
  _transientSubDomainStats = null;
}
From source file:org.commoncrawl.service.parser.client.Dispatcher.java
License:Open Source License
public static void main(String[] args) throws IOException {
  Configuration conf = new Configuration();
  CrawlEnvironment.setHadoopConfig(conf);

  String baseURL = "http://unknown.com/";
  if (args.length != 0) {
    baseURL = args[0];
  }
  URL baseURLObj;
  try {
    baseURLObj = new URL(baseURL);
  } catch (MalformedURLException e2) {
    throw new IOException("Invalid Base Link");
  }
  final URL finalBaseURL = (baseURLObj != null) ? baseURLObj : null;
  final DataOutputBuffer headerBuffer = new DataOutputBuffer();
  final DataOutputBuffer contentBuffer = new DataOutputBuffer();

  try {
    ByteStreams.readBytes(new InputSupplier<InputStream>() {

      @Override
      public InputStream getInput() throws IOException {
        return System.in;
      }
    }, new ByteProcessor<Long>() {

      @Override
      public Long getResult() {
        return 0L;
      }

      int currLineCharCount = 0;
      boolean processingHeaders = true;

      @Override
      public boolean processBytes(byte[] buf, int start, int length) throws IOException {
        if (processingHeaders) {
          int current = start;
          int end = current + length;
          while (processingHeaders && current != end) {
            if (buf[current] != '\r' && buf[current] != '\n') {
              currLineCharCount++;
            } else if (buf[current] == '\n') {
              if (currLineCharCount == 0) {
                headerBuffer.write(buf, start, current - start + 1);
                processingHeaders = false;
              }
              currLineCharCount = 0;
            }
            current++;
          }
          if (processingHeaders) {
            headerBuffer.write(buf, start, length);
          } else {
            length -= current - start;
            start = current;
          }
        }
        if (!processingHeaders) {
          contentBuffer.write(buf, start, length);
        }
        return true;
      }
    });

    LOG.info("HEADER LEN:" + headerBuffer.getLength());
    // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
    LOG.info("CONTENT LEN:" + contentBuffer.getLength());
    //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
    // decode header bytes ...
    String header = "";
    if (headerBuffer.getLength() != 0) {
      try {
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("UTF-8"));
      } catch (Exception e) {
        LOG.warn(CCStringUtils.stringifyException(e));
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("ASCII"));
      }
    }
    final String headersFinal = (header != null) ? header : "";

    LOG.info("Starting Event Loop");
    final EventLoop eventLoop = new EventLoop();
    eventLoop.start();

    try {
      // create fake hosts file ...
      //String hosts = "10.0.20.101:8072";

      // reader
      //Reader reader = new StringReader(hosts);

      // dispatcher init
      LOG.info("initializing Dispatcher");
      final Dispatcher dispatcher = new Dispatcher(eventLoop, "parserNodes");

      LOG.info("Waiting for a few seconds");
      Thread.sleep(5000);

      Thread threads[] = new Thread[TEST_THREAD_COUNT];
      final Semaphore threadWaitSem = new Semaphore(-TEST_THREAD_COUNT - 1);
      // start 100 threads
      for (int threadIdx = 0; threadIdx < TEST_THREAD_COUNT; ++threadIdx) {
        threads[threadIdx] = new Thread(new Runnable() {

          @Override
          public void run() {
            for (int i = 0; i < ITERATIONS_PER_THREAD; ++i) {
              // build parse request
              ParseRequest request = new ParseRequest();
              request.setDocId(1);
              request.setDomainId(1);
              request.setDocURL(finalBaseURL.toString());
              request.setDocHeaders(headersFinal);
              request.setDocContent(new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));

              //LOG.info("Dispatching parse request");
              ParseResult result = dispatcher.dispatchRequest(request);

              LOG.info("TID[" + Thread.currentThread().getId() + "]ReqID[" + i + "]" + " Success:"
                  + ((result != null) ? result.getParseSuccessful() : false) + " LinkCount:"
                  + ((result != null) ? result.getExtractedLinks().size() : 0));
            }
            LOG.info("Thread:" + Thread.currentThread().getId() + " Exiting");
            threadWaitSem.release();
          }
        });
        threads[threadIdx].start();
      }

      LOG.info("Waiting for threads to die");
      threadWaitSem.acquireUninterruptibly();
      LOG.info("All Threads dead.");

    } finally {
      eventLoop.stop();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  } catch (InterruptedException e) {
  }
}
From source file:org.commoncrawl.service.parser.server.ParseWorker.java
License:Open Source License
public static void main(String[] args) throws IOException {
  String baseURL = "http://unknown.com/";
  if (args.length != 0) {
    baseURL = args[0];
  }
  URL baseURLObj;
  try {
    baseURLObj = new URL(baseURL);
  } catch (MalformedURLException e2) {
    throw new IOException("Invalid Base Link");
  }
  final DataOutputBuffer headerBuffer = new DataOutputBuffer();
  final DataOutputBuffer contentBuffer = new DataOutputBuffer();

  try {
    ByteStreams.readBytes(new InputSupplier<InputStream>() {

      @Override
      public InputStream getInput() throws IOException {
        return System.in;
      }
    }, new ByteProcessor<Long>() {

      @Override
      public Long getResult() {
        return 0L;
      }

      int currLineCharCount = 0;
      boolean processingHeaders = true;

      @Override
      public boolean processBytes(byte[] buf, int start, int length) throws IOException {
        if (processingHeaders) {
          int current = start;
          int end = current + length;
          while (processingHeaders && current != end) {
            if (buf[current] != '\r' && buf[current] != '\n') {
              currLineCharCount++;
            } else if (buf[current] == '\n') {
              if (currLineCharCount == 0) {
                headerBuffer.write(buf, start, current - start + 1);
                processingHeaders = false;
              }
              currLineCharCount = 0;
            }
            current++;
          }
          if (processingHeaders) {
            headerBuffer.write(buf, start, length);
          } else {
            length -= current - start;
            start = current;
          }
        }
        if (!processingHeaders) {
          contentBuffer.write(buf, start, length);
        }
        return true;
      }
    });

    //LOG.info("HEADER LEN:" + headerBuffer.getLength());
    // System.out.println(new String(headerBuffer.getData(),0,headerBuffer.getLength(),Charset.forName("UTF-8")));
    //LOG.info("CONTENT LEN:" + contentBuffer.getLength());
    //System.out.println(new String(contentBuffer.getData(),0,contentBuffer.getLength(),Charset.forName("UTF-8")));
    // decode header bytes ...
    String header = "";
    if (headerBuffer.getLength() != 0) {
      try {
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("UTF-8"));
      } catch (Exception e) {
        LOG.warn(CCStringUtils.stringifyException(e));
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("ASCII"));
      }
    }
    //LOG.info("Parsing Document");
    ParseWorker worker = new ParseWorker();
    ParseResult result = new ParseResult();
    worker.parseDocument(result, 0L, 0L, baseURLObj, header,
        new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
    //LOG.info("Parse Result:" + result.getParseSuccessful());
    //LOG.info("Parse Data:" + result.toString());
  } catch (IOException e1) {
    // TODO Auto-generated catch block
    e1.printStackTrace();
  }

  /*
  List<String> lines;
  try {
    lines = IOUtils.readLines(System.in, "UTF-8");
    for (String line : lines) {
      System.out.println(line);
    }
  } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
  }
  */
}
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
    long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

  File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
  tempFile.mkdir();

  try {
    // create the final output spill writer ...
    SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
        outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
        true);

    try {
      MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
          conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
          new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);

      try {
        for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
          // 0. shard domain id to find index file location ...
          int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS);
          // build path to index file
          Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase3Data/part-"
              + NUMBER_FORMAT.format(indexShardId));
          LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
              + indexFilePath);
          // 1. scan domainFP to index file first
          // 2. given index, scan index->pos file to find scan start position
          // 3. given scan start position, scan forward until fp match is found.
          // 4. collect all matching entries and output to a file ?

          FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
          try {
            TFile.Reader reader = new TFile.Reader(indexDataInputStream, fs.getLength(indexFilePath), conf);
            try {
              TFile.Reader.Scanner scanner = reader.createScanner();
              try {
                // generate key ...
                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                keyBuffer.writeLong(targetRootDomainFP);
                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                  // setup for value scan
                  DataInputStream valueStream = scanner.entry().getValueStream();
                  int dataOffsetOut = -1;
                  while (valueStream.available() > 0) {
                    // read entries looking for our specific entry
                    int shardIdx = valueStream.readInt();
                    int dataOffset = valueStream.readInt();
                    if (shardIdx == targetShardId) {
                      dataOffsetOut = dataOffset;
                      break;
                    }
                  }
                  LOG.info("Index Search Yielded:" + dataOffsetOut);
                  if (dataOffsetOut != -1) {
                    // ok create a data path
                    Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId));
                    Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId) + ".index");
                    // check to see if index is already loaded ...
                    PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                    synchronized (_shardToIndexMap) {
                      index = _shardToIndexMap.get(targetShardId);
                    }
                    if (index == null) {
                      LOG.info("Loading Index from Path:" + finalDataIndexPath);
                      // load index
                      index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs, finalDataIndexPath,
                          FlexBuffer.class, TextBytes.class);
                      // put in cache
                      synchronized (_shardToIndexMap) {
                        _shardToIndexMap.put(targetShardId, index);
                      }
                    }

                    LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                    // ok time to create a reader
                    SequenceFile.Reader dataReader = new SequenceFile.Reader(fs, finalDataPath, conf);
                    try {
                      LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                      index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);

                      FlexBuffer keyBytes = new FlexBuffer();
                      URLFPV2 sourceFP = new URLFPV2();
                      DataInputBuffer keyReader = new DataInputBuffer();
                      TextBytes urlTxt = new TextBytes();

                      // ok ready to go ...
                      while (dataReader.next(keyBytes, sourceFP)) {
                        // initialize reader
                        keyReader.reset(keyBytes.get(), keyBytes.getOffset(), keyBytes.getCount());
                        long targetFP = keyReader.readLong();

                        if (targetRootDomainFP == targetFP) {
                          finalMerger.spillRecord(keyBytes, sourceFP);
                        } else {
                          LOG.info("FP:" + targetFP + " > TargetFP:" + targetRootDomainFP
                              + " Exiting Iteration Loop");
                          break;
                        }
                      }
                    } finally {
                      LOG.info("Closing Reader");
                      dataReader.close();
                    }
                  }
                }
              } finally {
                LOG.info("Closing Scanner");
                scanner.close();
              }
            } finally {
              LOG.info("Closing TFile Reader");
              reader.close();
            }
          } finally {
            LOG.info("Closing InputStream");
            indexDataInputStream.close();
          }
        }
      } finally {
        finalMerger.close();
      }
    } finally {
      spillwriter.close();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
    FileUtils.recursivelyDeleteFile(tempFile);
  }
}
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private long runInlinksLocalQuery(DatabaseIndexV2.MasterDatabaseIndex index, FileSystem inputFileSystem,
    Path inlinksInputPath, FileSystem outputFileSystem, Path inlinksDomainIndexPath,
    Path inlinksDetailOutputPath) throws IOException {

  long recordCount = 0L;

  outputFileSystem.delete(inlinksDomainIndexPath);
  outputFileSystem.delete(inlinksDetailOutputPath);

  FSDataInputStream remoteInputStream = inputFileSystem.open(inlinksInputPath);

  try {
    FSDataOutputStream indexOutputStream = outputFileSystem.create(inlinksDomainIndexPath);
    FSDataOutputStream detailOutputStream = outputFileSystem.create(inlinksDetailOutputPath);

    ArrayList<InlinkingDomainInfo> domainList = new ArrayList<InlinkingDomainInfo>();

    try {
      LOG.info("Writing Detail Stream to:" + inlinksDetailOutputPath);
      CompressedURLFPListV2.Reader reader = new CompressedURLFPListV2.Reader(remoteInputStream);

      InlinkingDomainInfo lastDomain = null;

      while (reader.hasNext()) {
        // read the next fingerprint
        URLFPV2 fingerprint = reader.next();
        // and first see if we have a domain transition
        if (lastDomain == null || lastDomain.getDomainId() != fingerprint.getDomainHash()) {
          // remember the domain
          lastDomain = new InlinkingDomainInfo();
          lastDomain.setDomainId(fingerprint.getDomainHash());
          // add it to the list
          domainList.add(lastDomain);
          // update date position
          lastDomain.setUrlDataPos(detailOutputStream.getPos());
        }
        // increment url count for the domain
        lastDomain.setUrlCount(lastDomain.getUrlCount() + 1);

        detailOutputStream.writeLong(fingerprint.getDomainHash());
        detailOutputStream.writeLong(fingerprint.getUrlHash());

        recordCount++;
      }

      LOG.info("Retrieving Domain Metadata for :" + domainList.size() + " Domain Records");
      // ok, now resolve domain names
      for (InlinkingDomainInfo domain : domainList) {
        SubDomainMetadata metadata = index.queryDomainMetadataGivenDomainId(domain.getDomainId());
        if (metadata == null) {
          LOG.error("*** Failed to Resolve DomainId:" + domain.getDomainId());
        } else {
          if (metadata.getDomainText().length() == 0) {
            LOG.error("*** Metadata for Domain Id:" + domain.getDomainId() + " contained NULL Name Value.");
            domain.setDomainName("_ERROR:BAD RECORD");
          } else {
            domain.setDomainName(metadata.getDomainText());
          }
          //LOG.info("***Found Domain:" + domain.getDomainName() + " urlCount:" + domain.getUrlCount());
        }
      }

      LOG.info("Sorting Domain List of Size:" + domainList.size());
      // ok sort by domain name
      Collections.sort(domainList);

      LOG.info("Building In Memory Index");
      // ok write out domain info
      DataOutputBuffer indexHeaderBuffer = new DataOutputBuffer();
      DataOutputBuffer indexDataBuffer = new DataOutputBuffer();

      LOG.info("***Writing Domain List Size:" + domainList.size());
      indexHeaderBuffer.writeInt(domainList.size());
      // ok iterate and write to both buffers
      for (InlinkingDomainInfo domain : domainList) {
        indexHeaderBuffer.writeInt(indexDataBuffer.getLength());
        domain.write(indexDataBuffer);
      }
      LOG.info("Writing Index to:" + inlinksDomainIndexPath + " IndexHeaderLength:"
          + indexHeaderBuffer.getLength() + " IndexDataLength:" + indexDataBuffer.getLength());
      // ok now flush both buffers to disk
      indexOutputStream.write(indexHeaderBuffer.getData(), 0, indexHeaderBuffer.getLength());
      indexOutputStream.write(indexDataBuffer.getData(), 0, indexDataBuffer.getLength());
    } finally {
      indexOutputStream.flush();
      indexOutputStream.close();
      detailOutputStream.flush();
      detailOutputStream.close();
    }
  } finally {
    remoteInputStream.close();
  }
  return recordCount;
}
From source file:org.commoncrawl.util.CrawlLogSplitter.java
License:Open Source License
public static void main(String[] args) throws IOException {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);

  FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
  for (FileStatus candidate : arcFiles) {
    if (candidate.getLen() > SPLIT_SIZE) {
      candidateList.add(candidate.getPath());
    }
  }

  LOG.info("Found:" + candidateList.size() + " oversized candidates");

  Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

  while (candidateList.size() != 0) {
    Path candidateName = candidateList.first();
    candidateList.remove(candidateName);

    LOG.info("Processing Candidate:" + candidateName);
    long fileSize = fs.getFileStatus(candidateName).getLen();
    //get crawl log filename components

    ArrayList<Path> splitItems = new ArrayList<Path>();

    int index = 0;

    Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

    LOG.info("Initial Output Path is:" + outputPart);

    fs.delete(outputPart, false);

    // create reader
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
    ValueBytes sourceVB = reader.createValueBytes();
    DataOutputBuffer sourceKeyData = new DataOutputBuffer();

    try {
      // ok create temp file
      SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
          CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

      // add to split items array
      splitItems.add(outputPart);

      try {
        long recordsWritten = 0;
        while (reader.nextRawKey(sourceKeyData) != -1) {
          reader.nextRawValue(sourceVB);
          long lengthPreWrite = activeWriter.getLength();
          activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
          if (++recordsWritten % 10000 == 0) {
            LOG.info("Write 10000 records");
          }
          long lengthPostWrite = activeWriter.getLength();
          if (lengthPostWrite != lengthPreWrite) {
            if (lengthPostWrite >= IDEAL_SIZE) {
              LOG.info("Hit Split Point. Flushing File:" + outputPart);
              activeWriter.close();
              outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(),
                  ++index);
              LOG.info("Creating New File:" + outputPart);
              activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class, CrawlURL.class,
                  CompressionType.BLOCK, new SnappyCodec());
              splitItems.add(outputPart);
            }
          }
          sourceKeyData.reset();
        }
      } finally {
        activeWriter.close();
      }
    } finally {
      reader.close();
    }

    LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");

    for (Path splitItem : splitItems) {
      Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
      LOG.info("Moving:" + splitItem + " to:" + destPath);
      fs.rename(splitItem, destPath);
    }

    Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
    LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
    fs.rename(candidateName, sourceMoveLocation);
  }
}
From source file:org.commoncrawl.util.GZIPUtils.java
License:Apache License
/**
 * Returns a gunzipped copy of the input array, truncated to
 * <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been
 * truncated or corrupted, a best-effort attempt is made to unzip as much as
 * possible. If no data can be extracted <code>null</code> is returned.
 */
public static final UnzipResult unzipBestEffort(byte[] in, int offset, int sizeIn, int sizeLimit) {
  try {
    // decompress using GZIPInputStream
    DataOutputBuffer outStream = new DataOutputBuffer(EXPECTED_COMPRESSION_RATIO * in.length);

    boolean truncated = false;

    GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(in, offset, sizeIn));

    byte[] buf = new byte[BUF_SIZE];
    int written = 0;
    while (true) {
      try {
        int size = inStream.read(buf);
        if (size <= 0)
          break;
        if ((written + size) > sizeLimit) {
          outStream.write(buf, 0, sizeLimit - written);
          truncated = true;
          break;
        }
        outStream.write(buf, 0, size);
        written += size;
      } catch (Exception e) {
        break;
      }
    }
    try {
      outStream.close();
    } catch (IOException e) {
    }

    return new UnzipResult(outStream.getData(), 0, outStream.getLength(), truncated);
  } catch (IOException e) {
    return null;
  } catch (OutOfMemoryError e) {
    LOG.fatal(CCStringUtils.stringifyException(e));
    return null;
  }
}
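A hedged usage sketch of unzipBestEffort as declared above (the input file name and the 1 MB size limit are illustrative; UnzipResult's accessors are not shown in this excerpt, so only the null check on the return value is demonstrated):

byte[] gzipped = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get("page.html.gz")); // hypothetical input
// best-effort decompress, capping the output at 1 MB
UnzipResult result = GZIPUtils.unzipBestEffort(gzipped, 0, gzipped.length, 1 << 20);
if (result == null) {
  System.err.println("no data could be extracted");
}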