List of usage examples for org.apache.hadoop.io WritableName setName
public static synchronized void setName(Class<?> writableClass, String name)
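setName registers an alternate (typically legacy) name for a Writable class, in both directions: the name resolves back to the class when reading, and the class serializes under that name when writing. Because a SequenceFile records its key and value classes by name in the file header, this is what keeps old files readable after a class has been renamed or repackaged, which is exactly how the examples below use it. Here is a minimal sketch of the pattern; MyRecord, com.example.legacy.MyRecord, and data/records.seq are hypothetical placeholders, not taken from the examples on this page.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableName;

public class WritableNameExample {

    // Hypothetical record type; the real examples below use CrawlSegmentHost / CrawlURL.
    public static class MyRecord implements Writable {
        private long value;

        public void write(DataOutput out) throws IOException {
            out.writeLong(value);
        }

        public void readFields(DataInput in) throws IOException {
            value = in.readLong();
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Register the legacy name before opening the reader: the SequenceFile
        // header stores the value class by name, and without this mapping the
        // old name would fail to resolve to the renamed class.
        WritableName.setName(MyRecord.class, "com.example.legacy.MyRecord");

        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("data/records.seq"), conf);
        try {
            LongWritable key = new LongWritable();
            MyRecord value = new MyRecord();
            while (reader.next(key, value)) {
                // process each record ...
            }
        } finally {
            reader.close();
        }
    }
}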
From source file:org.commoncrawl.service.crawler.SegmentLoader.java
License:Open Source License
@SuppressWarnings("unchecked") public static CrawlSegmentFPMap loadCrawlSegmentFPInfo(int listId, int segmentId, String crawlerName, CancelOperationCallback cancelCallback) throws IOException { CrawlSegmentFPMap fpMap = new CrawlSegmentFPMap(); WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost"); // construct hdfs path to segment ... Path hdfsPath;/* w w w .j a va2 s . c om*/ if (segmentId != -1) hdfsPath = new Path( CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/"); else hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"); Path workUnitDetailPath = new Path(hdfsPath, crawlerName); SequenceFile.Reader reader = null; try { FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem(); reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig()); LongWritable hostFP = new LongWritable(); CrawlSegmentHost segmentHost = new CrawlSegmentHost(); DataOutputBuffer outputBuffer = new DataOutputBuffer(); int segmentUrlCount = 0; while (reader.next(hostFP, segmentHost) && cancelCallback.cancelOperation() == false) { // and update url count ... segmentUrlCount += segmentHost.getUrlTargets().size(); // set the url vector to the appropriate size ... for (CrawlSegmentURL url : segmentHost.getUrlTargets()) { WritableUtils.writeVLong(outputBuffer, segmentHost.getHostFP()); WritableUtils.writeVLong(outputBuffer, url.getUrlFP()); } } outputBuffer.flush(); // ok set the urlfp stream fpMap.setURLFPBuffer(segmentUrlCount, outputBuffer.getData(), outputBuffer.getLength()); // now initialize the if (cancelCallback.cancelOperation()) { return null; } else { return fpMap; } } finally { if (reader != null) reader.close(); } }
From source file:org.commoncrawl.service.crawler.SegmentLoader.java
License:Open Source License
@SuppressWarnings("unchecked") public static CrawlSegmentDetail loadCrawlSegment(int listId, int segmentId, String crawlerName, CrawlSegmentFPMap loadHint, DNSCache cache, LoadProgressCallback callback, CancelOperationCallback incomingCancelCallback) throws IOException { final CancelOperationCallback cancelCallback = (incomingCancelCallback != null) ? incomingCancelCallback : new CancelOperationCallback() { @Override/*from ww w. j a v a2 s. c o m*/ public boolean cancelOperation() { return false; } }; WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost"); // construct hdfs path to segment ... Path hdfsPath; if (segmentId != -1) hdfsPath = new Path( CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/"); else hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"); Path workUnitDetailPath = new Path(hdfsPath, crawlerName); SequenceFile.Reader reader = null; try { CrawlSegmentDetail segmentOut = new CrawlSegmentDetail(); // initialize work unit detail ... segmentOut.setSegmentId(segmentId); FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem(); reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig()); LongWritable hostFP = new LongWritable(); CrawlSegmentHost segmentHost = new CrawlSegmentHost(); while (reader.next(hostFP, segmentHost) && !cancelCallback.cancelOperation()) { if (segmentHost.getHostFP() == 0) { LOG.error("Host FP is Zero during reader.next"); } //setup the segment id associated with this host (so that the host contains self sufficient context information). segmentHost.setSegmentId(segmentId); segmentHost.setListId(listId); // capture original item count int originalURLCount = segmentHost.getUrlTargets().size(); int completedURLCount = 0; // and update url count ... segmentOut.setUrlCount(segmentOut.getUrlCount() + segmentHost.getUrlTargets().size()); if (loadHint != null) { // now walk remaining items (in hint) for (int i = 0; i < segmentHost.getUrlTargets().size(); ++i) { CrawlSegmentURL segmentURL = segmentHost.getUrlTargets().get(i); URLFPV2 urlfp = new URLFPV2(); urlfp.setDomainHash(segmentHost.getHostFP()); urlfp.setUrlHash(segmentURL.getUrlFP()); if (loadHint.wasCrawled(urlfp)) { completedURLCount++; segmentHost.getUrlTargets().remove(i); --i; segmentOut.setUrlsComplete(segmentOut.getUrlsComplete() + 1); } } } // now ... if there are no more entries in the host ... if (segmentHost.getUrlTargets().size() != 0) { if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_IPADDRESS)) { if (cache != null) { // try to resolve the address up front DNSResult dnsCacheResult = cache.resolveName(segmentHost); if (dnsCacheResult != null) { segmentHost.setIpAddress(dnsCacheResult.ipAddress); segmentHost.setTtl(dnsCacheResult.ttl); if (dnsCacheResult.cname != null && dnsCacheResult.cname.length() != 0) { segmentHost.setCname(dnsCacheResult.cname); } } } } else { if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_TTL)) { segmentHost.setTtl(0); } } } // if a progress callback was specified, then call it with the load progress of this host ... if (callback != null) { // and initiate completion callaback boolean continueLoading = callback.hostAvailable(segmentHost, originalURLCount, completedURLCount); if (!continueLoading) { LOG.info("HostAvailable Callback returned false. Aborting Load"); return null; } } // otherwise ... add the host to the segment detail ... 
else { segmentOut.getHosts().add(segmentHost); } // and allocate a new segment host for next read segmentHost = new CrawlSegmentHost(); } if (!cancelCallback.cancelOperation()) { return segmentOut; } else { return null; } } finally { if (reader != null) reader.close(); } }
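A note on the loadHint filter in the example above: it removes already-crawled targets by index, decrementing i after each removal. An equivalent sketch with an explicit java.util.Iterator avoids the index bookkeeping, assuming getUrlTargets() returns a modifiable List (or Vector) of CrawlSegmentURL:

// Equivalent filtering via Iterator.remove(), assuming a modifiable list.
// Requires: import java.util.Iterator;
Iterator<CrawlSegmentURL> it = segmentHost.getUrlTargets().iterator();
while (it.hasNext()) {
    CrawlSegmentURL segmentURL = it.next();

    URLFPV2 urlfp = new URLFPV2();
    urlfp.setDomainHash(segmentHost.getHostFP());
    urlfp.setUrlHash(segmentURL.getUrlFP());

    if (loadHint.wasCrawled(urlfp)) {
        it.remove(); // safe removal while iterating
        completedURLCount++;
        segmentOut.setUrlsComplete(segmentOut.getUrlsComplete() + 1);
    }
}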
From source file:org.commoncrawl.util.ArcFileWriter.java
License:Open Source License
@Test
public void testArcFileWriter() throws Exception {
    Path crawlFilePath = new Path("crawl/checkpoint_data/CrawlLog_cc08_1210918849380");

    WritableName.setName(CrawlURL.class, "org.crawlcommons.protocol.CrawlURL");

    SequenceFile.Reader reader = new SequenceFile.Reader(_fileSystem, crawlFilePath,
            CrawlEnvironment.getHadoopConfig());

    Text url = new Text();
    CrawlURL urlData = new CrawlURL();

    while (reader.next(url, urlData)) {
        NIOHttpHeaders headers = CrawlURLHelper.getHeadersFromCrawlURL(urlData);
        write(url.toString(), 1, 1, urlData, headers, "text/html", "test");
    }
    reader.close();

    this.close(false);
}