List of usage examples for org.apache.hadoop.io WritableName setName
public static synchronized void setName(Class<?> writableClass, String name)
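setName registers an alternate (typically legacy) name for a Writable class, in both directions: the name resolves back to the class when reading, and the class serializes under that name when writing. Because a SequenceFile records its key and value classes by name in the file header, this is what keeps old files readable after a class has been renamed or repackaged, which is exactly how the examples below use it. Here is a minimal sketch of the pattern; MyRecord, com.example.legacy.MyRecord, and data/records.seq are hypothetical placeholders, not taken from the examples on this page.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableName;

public class WritableNameExample {

    // Hypothetical record type; the real examples below use CrawlSegmentHost / CrawlURL.
    public static class MyRecord implements Writable {
        private long value;

        public void write(DataOutput out) throws IOException {
            out.writeLong(value);
        }

        public void readFields(DataInput in) throws IOException {
            value = in.readLong();
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Register the legacy name before opening the reader: the SequenceFile
        // header stores the value class by name, and without this mapping the
        // old name would fail to resolve to the renamed class.
        WritableName.setName(MyRecord.class, "com.example.legacy.MyRecord");

        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("data/records.seq"), conf);
        try {
            LongWritable key = new LongWritable();
            MyRecord value = new MyRecord();
            while (reader.next(key, value)) {
                // process each record ...
            }
        } finally {
            reader.close();
        }
    }
}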
From source file:org.commoncrawl.service.crawler.SegmentLoader.java
License:Open Source License
@SuppressWarnings("unchecked") public static CrawlSegmentFPMap loadCrawlSegmentFPInfo(int listId, int segmentId, String crawlerName, CancelOperationCallback cancelCallback) throws IOException { CrawlSegmentFPMap fpMap = new CrawlSegmentFPMap(); WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost"); // construct hdfs path to segment ... Path hdfsPath;/* w w w .j a va2 s . c om*/ if (segmentId != -1) hdfsPath = new Path( CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/"); else hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"); Path workUnitDetailPath = new Path(hdfsPath, crawlerName); SequenceFile.Reader reader = null; try { FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem(); reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig()); LongWritable hostFP = new LongWritable(); CrawlSegmentHost segmentHost = new CrawlSegmentHost(); DataOutputBuffer outputBuffer = new DataOutputBuffer(); int segmentUrlCount = 0; while (reader.next(hostFP, segmentHost) && cancelCallback.cancelOperation() == false) { // and update url count ... segmentUrlCount += segmentHost.getUrlTargets().size(); // set the url vector to the appropriate size ... for (CrawlSegmentURL url : segmentHost.getUrlTargets()) { WritableUtils.writeVLong(outputBuffer, segmentHost.getHostFP()); WritableUtils.writeVLong(outputBuffer, url.getUrlFP()); } } outputBuffer.flush(); // ok set the urlfp stream fpMap.setURLFPBuffer(segmentUrlCount, outputBuffer.getData(), outputBuffer.getLength()); // now initialize the if (cancelCallback.cancelOperation()) { return null; } else { return fpMap; } } finally { if (reader != null) reader.close(); } }
From source file:org.commoncrawl.service.crawler.SegmentLoader.java
License:Open Source License
@SuppressWarnings("unchecked") public static CrawlSegmentDetail loadCrawlSegment(int listId, int segmentId, String crawlerName, CrawlSegmentFPMap loadHint, DNSCache cache, LoadProgressCallback callback, CancelOperationCallback incomingCancelCallback) throws IOException { final CancelOperationCallback cancelCallback = (incomingCancelCallback != null) ? incomingCancelCallback : new CancelOperationCallback() { @Override/*from ww w. j a v a2 s. c o m*/ public boolean cancelOperation() { return false; } }; WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost"); // construct hdfs path to segment ... Path hdfsPath; if (segmentId != -1) hdfsPath = new Path( CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/"); else hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/"); Path workUnitDetailPath = new Path(hdfsPath, crawlerName); SequenceFile.Reader reader = null; try { CrawlSegmentDetail segmentOut = new CrawlSegmentDetail(); // initialize work unit detail ... segmentOut.setSegmentId(segmentId); FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem(); reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig()); LongWritable hostFP = new LongWritable(); CrawlSegmentHost segmentHost = new CrawlSegmentHost(); while (reader.next(hostFP, segmentHost) && !cancelCallback.cancelOperation()) { if (segmentHost.getHostFP() == 0) { LOG.error("Host FP is Zero during reader.next"); } //setup the segment id associated with this host (so that the host contains self sufficient context information). segmentHost.setSegmentId(segmentId); segmentHost.setListId(listId); // capture original item count int originalURLCount = segmentHost.getUrlTargets().size(); int completedURLCount = 0; // and update url count ... segmentOut.setUrlCount(segmentOut.getUrlCount() + segmentHost.getUrlTargets().size()); if (loadHint != null) { // now walk remaining items (in hint) for (int i = 0; i < segmentHost.getUrlTargets().size(); ++i) { CrawlSegmentURL segmentURL = segmentHost.getUrlTargets().get(i); URLFPV2 urlfp = new URLFPV2(); urlfp.setDomainHash(segmentHost.getHostFP()); urlfp.setUrlHash(segmentURL.getUrlFP()); if (loadHint.wasCrawled(urlfp)) { completedURLCount++; segmentHost.getUrlTargets().remove(i); --i; segmentOut.setUrlsComplete(segmentOut.getUrlsComplete() + 1); } } } // now ... if there are no more entries in the host ... if (segmentHost.getUrlTargets().size() != 0) { if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_IPADDRESS)) { if (cache != null) { // try to resolve the address up front DNSResult dnsCacheResult = cache.resolveName(segmentHost); if (dnsCacheResult != null) { segmentHost.setIpAddress(dnsCacheResult.ipAddress); segmentHost.setTtl(dnsCacheResult.ttl); if (dnsCacheResult.cname != null && dnsCacheResult.cname.length() != 0) { segmentHost.setCname(dnsCacheResult.cname); } } } } else { if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_TTL)) { segmentHost.setTtl(0); } } } // if a progress callback was specified, then call it with the load progress of this host ... if (callback != null) { // and initiate completion callaback boolean continueLoading = callback.hostAvailable(segmentHost, originalURLCount, completedURLCount); if (!continueLoading) { LOG.info("HostAvailable Callback returned false. Aborting Load"); return null; } } // otherwise ... add the host to the segment detail ... 
else { segmentOut.getHosts().add(segmentHost); } // and allocate a new segment host for next read segmentHost = new CrawlSegmentHost(); } if (!cancelCallback.cancelOperation()) { return segmentOut; } else { return null; } } finally { if (reader != null) reader.close(); } }
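A note on the loadHint filter in the example above: it removes already-crawled targets by index, decrementing i after each removal. An equivalent sketch with an explicit java.util.Iterator avoids the index bookkeeping, assuming getUrlTargets() returns a modifiable List (or Vector) of CrawlSegmentURL:

// Equivalent filtering via Iterator.remove(), assuming a modifiable list.
// Requires: import java.util.Iterator;
Iterator<CrawlSegmentURL> it = segmentHost.getUrlTargets().iterator();
while (it.hasNext()) {
    CrawlSegmentURL segmentURL = it.next();

    URLFPV2 urlfp = new URLFPV2();
    urlfp.setDomainHash(segmentHost.getHostFP());
    urlfp.setUrlHash(segmentURL.getUrlFP());

    if (loadHint.wasCrawled(urlfp)) {
        it.remove(); // safe removal while iterating
        completedURLCount++;
        segmentOut.setUrlsComplete(segmentOut.getUrlsComplete() + 1);
    }
}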
From source file:org.commoncrawl.util.ArcFileWriter.java
License:Open Source License
@Test
public void testArcFileWriter() throws Exception {
    Path crawlFilePath = new Path("crawl/checkpoint_data/CrawlLog_cc08_1210918849380");

    WritableName.setName(CrawlURL.class, "org.crawlcommons.protocol.CrawlURL");

    SequenceFile.Reader reader = new SequenceFile.Reader(_fileSystem, crawlFilePath,
            CrawlEnvironment.getHadoopConfig());

    Text url = new Text();
    CrawlURL urlData = new CrawlURL();

    while (reader.next(url, urlData)) {
        NIOHttpHeaders headers = CrawlURLHelper.getHeadersFromCrawlURL(urlData);
        write(url.toString(), 1, 1, urlData, headers, "text/html", "test");
    }
    reader.close();

    this.close(false);
}