Example usage for org.apache.hadoop.io WritableName setName

List of usage examples for org.apache.hadoop.io WritableName setName

Introduction

On this page you can find example usage of org.apache.hadoop.io WritableName setName.

Prototype

public static synchronized void setName(Class<?> writableClass, String name) 

Document

Set the name that a class should be known as to something other than the class name.
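
Registering an alias is useful when a Writable class has been renamed or moved to a new package: data written under the old name stays readable, because readers such as SequenceFile.Reader resolve the stored class name through WritableName. Below is a minimal sketch of that pattern; MyRecord, the alias string, and the file path are hypothetical placeholders, while the WritableName and SequenceFile calls mirror the examples that follow.

// Assumed: MyRecord is a Writable that used to live at com.example.legacy.MyRecord.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);

// Map the class name recorded in existing files back to the current class,
// so records written under the legacy name can still be deserialized.
WritableName.setName(MyRecord.class, "com.example.legacy.MyRecord");

SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("/data/records.seq"), conf);
try {
    LongWritable key = new LongWritable();
    MyRecord value = new MyRecord();
    while (reader.next(key, value)) {
        // process one record ...
    }
} finally {
    reader.close();
}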

Usage

From source file: org.commoncrawl.service.crawler.SegmentLoader.java

License: Open Source License

@SuppressWarnings("unchecked")
public static CrawlSegmentFPMap loadCrawlSegmentFPInfo(int listId, int segmentId, String crawlerName,
        CancelOperationCallback cancelCallback) throws IOException {

    CrawlSegmentFPMap fpMap = new CrawlSegmentFPMap();

    // resolve the legacy class name recorded in the segment files to the current CrawlSegmentHost class
    WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost");

    // construct hdfs path to segment ... 
    Path hdfsPath;
    if (segmentId != -1)
        hdfsPath = new Path(
                CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/");
    else
        hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/");

    Path workUnitDetailPath = new Path(hdfsPath, crawlerName);

    SequenceFile.Reader reader = null;

    try {
        FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();
        reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig());

        LongWritable hostFP = new LongWritable();
        CrawlSegmentHost segmentHost = new CrawlSegmentHost();

        DataOutputBuffer outputBuffer = new DataOutputBuffer();

        int segmentUrlCount = 0;
        while (reader.next(hostFP, segmentHost) && !cancelCallback.cancelOperation()) {
            // and update url count ... 
            segmentUrlCount += segmentHost.getUrlTargets().size();

            // write a host/url fingerprint pair for each url target ... 
            for (CrawlSegmentURL url : segmentHost.getUrlTargets()) {

                WritableUtils.writeVLong(outputBuffer, segmentHost.getHostFP());
                WritableUtils.writeVLong(outputBuffer, url.getUrlFP());
            }
        }
        outputBuffer.flush();
        // set the urlfp stream on the map ... 
        fpMap.setURLFPBuffer(segmentUrlCount, outputBuffer.getData(), outputBuffer.getLength());

        if (cancelCallback.cancelOperation()) {
            return null;
        } else {
            return fpMap;
        }
    } finally {
        if (reader != null)
            reader.close();
    }
}

From source file: org.commoncrawl.service.crawler.SegmentLoader.java

License: Open Source License

@SuppressWarnings("unchecked")
public static CrawlSegmentDetail loadCrawlSegment(int listId, int segmentId, String crawlerName,
        CrawlSegmentFPMap loadHint, DNSCache cache, LoadProgressCallback callback,
        CancelOperationCallback incomingCancelCallback) throws IOException {

    final CancelOperationCallback cancelCallback = (incomingCancelCallback != null) ? incomingCancelCallback
            : new CancelOperationCallback() {

                @Override
                public boolean cancelOperation() {
                    return false;
                }
            };

    // resolve the legacy class name recorded in the segment files to the current CrawlSegmentHost class
    WritableName.setName(CrawlSegmentHost.class, "org.crawlcommons.protocol.CrawlSegmentHost");

    // construct hdfs path to segment ... 
    Path hdfsPath;
    if (segmentId != -1)
        hdfsPath = new Path(
                CrawlEnvironment.getCrawlSegmentDataDirectory() + "/" + listId + "/" + segmentId + "/");
    else
        hdfsPath = new Path(CrawlEnvironment.getCrawlSegmentDataDirectory() + "/");

    Path workUnitDetailPath = new Path(hdfsPath, crawlerName);
    SequenceFile.Reader reader = null;
    try {

        CrawlSegmentDetail segmentOut = new CrawlSegmentDetail();

        // initialize work unit detail ...
        segmentOut.setSegmentId(segmentId);

        FileSystem hdfs = CrawlEnvironment.getDefaultFileSystem();
        reader = new SequenceFile.Reader(hdfs, workUnitDetailPath, CrawlEnvironment.getHadoopConfig());

        LongWritable hostFP = new LongWritable();
        CrawlSegmentHost segmentHost = new CrawlSegmentHost();

        while (reader.next(hostFP, segmentHost) && !cancelCallback.cancelOperation()) {

            if (segmentHost.getHostFP() == 0) {
                LOG.error("Host FP is Zero during reader.next");
            }

            //setup the segment id associated with this host (so that the host contains self sufficient context information).
            segmentHost.setSegmentId(segmentId);
            segmentHost.setListId(listId);

            // capture original item count 
            int originalURLCount = segmentHost.getUrlTargets().size();
            int completedURLCount = 0;

            // and update url count ... 
            segmentOut.setUrlCount(segmentOut.getUrlCount() + segmentHost.getUrlTargets().size());

            if (loadHint != null) {
                // now walk remaining items (in hint) 
                for (int i = 0; i < segmentHost.getUrlTargets().size(); ++i) {

                    CrawlSegmentURL segmentURL = segmentHost.getUrlTargets().get(i);

                    URLFPV2 urlfp = new URLFPV2();

                    urlfp.setDomainHash(segmentHost.getHostFP());
                    urlfp.setUrlHash(segmentURL.getUrlFP());

                    if (loadHint.wasCrawled(urlfp)) {
                        completedURLCount++;
                        segmentHost.getUrlTargets().remove(i);
                        --i;
                        segmentOut.setUrlsComplete(segmentOut.getUrlsComplete() + 1);
                    }
                }
            }
            // if the host still has url targets remaining ...  
            if (segmentHost.getUrlTargets().size() != 0) {

                if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_IPADDRESS)) {
                    if (cache != null) {
                        // try to resolve the address up front 
                        DNSResult dnsCacheResult = cache.resolveName(segmentHost);

                        if (dnsCacheResult != null) {
                            segmentHost.setIpAddress(dnsCacheResult.ipAddress);
                            segmentHost.setTtl(dnsCacheResult.ttl);
                            if (dnsCacheResult.cname != null && dnsCacheResult.cname.length() != 0) {
                                segmentHost.setCname(dnsCacheResult.cname);
                            }
                        }
                    }
                } else {
                    if (!segmentHost.isFieldDirty(CrawlSegmentHost.Field_TTL)) {
                        segmentHost.setTtl(0);
                    }
                }
            }

            // if a progress callback was specified, then call it with the load progress of this host ... 
            if (callback != null) {
                // and initiate completion callback
                boolean continueLoading = callback.hostAvailable(segmentHost, originalURLCount,
                        completedURLCount);

                if (!continueLoading) {
                    LOG.info("HostAvailable Callback returned false. Aborting Load");
                    return null;
                }

            }
            // otherwise ... add the host to the segment detail ... 
            else {
                segmentOut.getHosts().add(segmentHost);
            }

            // and allocate a new segment host for next read 
            segmentHost = new CrawlSegmentHost();
        }

        if (!cancelCallback.cancelOperation()) {
            return segmentOut;
        } else {
            return null;
        }
    } finally {
        if (reader != null)
            reader.close();
    }
}

From source file: org.commoncrawl.util.ArcFileWriter.java

License: Open Source License

@Test
public void testArcFileWriter() throws Exception {

    Path crawlFilePath = new Path("crawl/checkpoint_data/CrawlLog_cc08_1210918849380");

    // resolve the legacy class name recorded in the crawl log to the current CrawlURL class
    WritableName.setName(CrawlURL.class, "org.crawlcommons.protocol.CrawlURL");

    SequenceFile.Reader reader = new SequenceFile.Reader(_fileSystem, crawlFilePath,
            CrawlEnvironment.getHadoopConfig());

    Text url = new Text();
    CrawlURL urlData = new CrawlURL();

    while (reader.next(url, urlData)) {

        NIOHttpHeaders headers = CrawlURLHelper.getHeadersFromCrawlURL(urlData);
        write(url.toString(), 1, 1, urlData, headers, "text/html", "test");
    }

    reader.close();
    this.close(false);
}