org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.NonSuperSubdomainCollectorStep.java Source code

Introduction

Here is the source code for org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.NonSuperSubdomainCollectorStep.java, a CrawlPipelineStep whose mapper turns crawled URLs into (root domain, subdomain host) pairs for root domains that are not on the super-domain list, and whose reducer collapses them into a JSON array of distinct subdomains per root domain.
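
The reducer output is keyed by root domain, with a JSON array of the subdomains observed for it as the value; an illustrative record (hypothetical values only) might look like:

    example.com    ["blog.example.com", "shop.example.com"]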

Source

/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.GenSuperDomainListStep;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;

import com.google.gson.JsonArray;
import com.google.gson.JsonPrimitive;

/**
 * 
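 * Pipeline step that collects, for each root domain that is not on the
 * super-domain list, the distinct subdomain hosts seen in the crawl. The
 * mapper skips "www"-style aliases of the root and deduplicates hosts with a
 * bloom filter; the reducer emits one record per root domain whose value is a
 * JSON array of up to MAX_SUBDOMAINS_ALLOWED subdomains.
 *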
 * @author rana
 *
 */
public class NonSuperSubdomainCollectorStep extends CrawlPipelineStep implements
        Mapper<TextBytes, TextBytes, TextBytes, TextBytes>, Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

    enum Counters {
        HIT_MAXSUBDOMAIN_LIMIT,
        SKIPPED_SUBDOMAIN_SAME_AS_ROOT_VIA_ID,
        SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PREFIX,
        SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PATTERN_MATCH
    }

    private static final Log LOG = LogFactory.getLog(NonSuperSubdomainCollectorStep.class);

    static final int NUM_HASH_FUNCTIONS = 10;

    static final int NUM_BITS = 11;
    static final int NUM_ELEMENTS = 1 << 29;
    static final int FLUSH_THRESHOLD = 1 << 23;
    public static final String SUPER_DOMAIN_FILE_PATH = "super-domain-list";
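
    // NUM_ELEMENTS, NUM_HASH_FUNCTIONS and NUM_BITS size the per-mapper bloom
    // filter declared below; FLUSH_THRESHOLD is declared but never read in this step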

    URLFPBloomFilter subDomainFilter;

    public static final String OUTPUT_DIR_NAME = "nonsuper-subdomains";

    URLFPV2 bloomKey = new URLFPV2();

    TextBytes emptyTextBytes = new TextBytes();

    Pattern wwwMatchPattern = Pattern.compile("www[\\-0-9]*\\.");

    Set<Long> superDomainIdSet;
    HashSet<String> domains = new HashSet<String>();

    static final int MAX_SUBDOMAINS_ALLOWED = 100;

    public NonSuperSubdomainCollectorStep() {
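        // no-arg constructor, used when Hadoop instantiates this class as a
        // Mapper/Reducer via reflection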
        super(null, null, null);
    }

    public NonSuperSubdomainCollectorStep(CrawlPipelineTask task) {
        super(task, "SubDomain Collector", OUTPUT_DIR_NAME);
    }

    @Override
    public void close() throws IOException {

    }

    @Override
    public void configure(JobConf job) {
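        // only map tasks need the super-domain id list and the bloom filter;
        // "mapred.task.is.map" distinguishes map tasks from reduce tasks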

        if (job.getBoolean("mapred.task.is.map", false)) {
            Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));

            try {
                superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
            } catch (IOException e) {
                LOG.error(StringUtils.stringifyException(e));
                throw new RuntimeException(e);
            }

            subDomainFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
        }
    }

    @Override
    public Log getLogger() {
        return LOG;
    }

    @Override
    public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
            throws IOException {
        String url = key.toString();
        GoogleURL urlObject = new GoogleURL(url);

        if (urlObject.isValid()) {

            String rootDomain = URLUtils.extractRootDomainName(urlObject.getHost());

            if (rootDomain != null) {
                long rootDomainId = SuperDomainList.domainFingerprintGivenName(rootDomain);

                if (!superDomainIdSet.contains(rootDomainId)) {
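                    // the root domain is not a super domain; decide whether this URL's host
                    // is a genuine subdomain worth collecting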

                    long subDomainId = SuperDomainList.domainFingerprintGivenName(urlObject.getHost());

                    if (subDomainId == rootDomainId) {
                        reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_VIA_ID, 1);
                        return;
                    }

                    // extract prefix ...
                    String prefix = urlObject.getHost().substring(0,
                            urlObject.getHost().length() - rootDomain.length());

                    // straight match ...
                    if (prefix.equals("www.")) {
                        reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PREFIX, 1);
                        return; // skip
                    } else if (prefix.startsWith("www") && wwwMatchPattern.matcher(prefix).matches()) {
                        reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PATTERN_MATCH, 1);
                        return;
                    }

                    bloomKey.setDomainHash(subDomainId);
                    bloomKey.setUrlHash(subDomainId);
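                    // both fields of the bloom key carry the subdomain fingerprint, so the
                    // filter answers "has this mapper already emitted this host?"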

                    if (subDomainFilter.isPresent(bloomKey)) {
                        // already recorded in the bloom filter (occasional false positives are tolerated); skip
                        return;
                    }

                    // remember this host so later URLs from the same subdomain are skipped
                    subDomainFilter.add(bloomKey);

                    // emit (root domain, subdomain host)
                    output.collect(new TextBytes(rootDomain), new TextBytes(urlObject.getHost()));
                }
            }
        }
    }

    @Override
    public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
            Reporter reporter) throws IOException {
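        // gather distinct subdomains for this root domain, stopping early once
        // MAX_SUBDOMAINS_ALLOWED is reached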
        while (values.hasNext()) {
            domains.add(values.next().toString());
            if (domains.size() >= MAX_SUBDOMAINS_ALLOWED) {
                reporter.incrCounter(Counters.HIT_MAXSUBDOMAIN_LIMIT, 1);
                break;
            }
        }

        if (domains.size() != 0 && domains.size() < MAX_SUBDOMAINS_ALLOWED) {
            JsonArray array = new JsonArray();
            for (String domain : domains) {
                array.add(new JsonPrimitive(domain));
            }
            output.collect(key, new TextBytes(array.toString()));
        }
        domains.clear();
    }

    @Override
    public void runStep(Path outputPathLocation) throws IOException {
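        // wire the job: restricted merge-db paths from the parent DomainMetadataTask as
        // input, this class as both mapper and reducer, and the super-domain list produced
        // by GenSuperDomainListStep handed to mappers via SUPER_DOMAIN_FILE_PATH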

        DomainMetadataTask rootTask = findTaskOfType(DomainMetadataTask.class);
        Path superDomainListPath = new Path(getOutputDirForStep(GenSuperDomainListStep.class), "part-00000");

        JobConf job = new JobBuilder(getDescription(), getConf()).inputs(rootTask.getRestrictedMergeDBDataPaths())
                .inputIsSeqFile().mapper(NonSuperSubdomainCollectorStep.class)
                .reducer(NonSuperSubdomainCollectorStep.class, false)
                .numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2).keyValue(TextBytes.class, TextBytes.class)
                .output(outputPathLocation).outputIsSeqFile()
                .set(SUPER_DOMAIN_FILE_PATH, superDomainListPath.toString()).reuseJVM(1000).build();

        JobClient.runJob(job);
    }

}
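
For reference, here is a small self-contained sketch of the "www alias" rule applied in map(): a host is treated as an alias of its root domain (and skipped) when the prefix before the root domain is exactly "www." or matches the www[-0-9]*. pattern. The class name and sample hosts below are illustrative, not part of the original source.

import java.util.regex.Pattern;

public class WwwAliasRuleDemo {

    // same pattern as NonSuperSubdomainCollectorStep.wwwMatchPattern
    static final Pattern WWW_PATTERN = Pattern.compile("www[\\-0-9]*\\.");

    static boolean isWwwAlias(String host, String rootDomain) {
        // prefix is everything before the root domain, e.g. "www2." in "www2.example.com"
        String prefix = host.substring(0, host.length() - rootDomain.length());
        return prefix.equals("www.") || (prefix.startsWith("www") && WWW_PATTERN.matcher(prefix).matches());
    }

    public static void main(String[] args) {
        System.out.println(isWwwAlias("www.example.com", "example.com"));  // true  (exact "www." prefix)
        System.out.println(isWwwAlias("www2.example.com", "example.com")); // true  (pattern match)
        System.out.println(isWwwAlias("blog.example.com", "example.com")); // false (genuine subdomain)
    }
}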