org.apache.nutch.scoring.webgraph.LinkRank.java Source code

Introduction

Here is the source code for org.apache.nutch.scoring.webgraph.LinkRank.java, Apache Nutch's LinkRank tool: an iterative, PageRank-style link analysis that runs over a WebGraph and writes the resulting scores back into the graph's NodeDb.
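
The class is a standard Hadoop Tool, so besides the command line it can be driven programmatically. The following is a minimal sketch that mirrors the main() method at the bottom of the listing; the WebGraph location crawl/webgraphdb is a hypothetical path and must already contain a graph built by the WebGraph tool.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.LinkRank;
import org.apache.nutch.util.NutchConfiguration;

public class LinkRankDriver {
    public static void main(String[] args) throws Exception {
        // Nutch-flavored Hadoop configuration (loads nutch-default.xml and nutch-site.xml)
        Configuration conf = NutchConfiguration.create();
        // run LinkRank exactly as its own main() does;
        // "crawl/webgraphdb" is a hypothetical path
        int res = ToolRunner.run(conf, new LinkRank(),
                new String[] { "-webgraphdb", "crawl/webgraphdb" });
        System.exit(res);
    }
}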

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.Loops.LoopSet;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;

public class LinkRank extends Configured implements Tool {

    public static final Logger LOG = LoggerFactory.getLogger(LinkRank.class);
    private static final String NUM_NODES = "_num_nodes_";

    /**
     * Runs the counter job. The counter job determines the number of nodes in
     * the webgraph. This count is used during analysis to compute the rank-one
     * score.
     * 
     * @param fs The job file system.
     * @param webGraphDb The web graph database to use.
     * 
     * @return The number of nodes in the web graph.
     * @throws IOException If an error occurs while running the counter job.
     */
    private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

        // configure the counter job
        Path numLinksPath = new Path(webGraphDb, NUM_NODES);
        Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
        JobConf counter = new NutchJob(getConf());
        counter.setJobName("LinkRank Counter");
        FileInputFormat.addInputPath(counter, nodeDb);
        FileOutputFormat.setOutputPath(counter, numLinksPath);
        counter.setInputFormat(SequenceFileInputFormat.class);
        counter.setMapperClass(Counter.class);
        counter.setCombinerClass(Counter.class);
        counter.setReducerClass(Counter.class);
        counter.setMapOutputKeyClass(Text.class);
        counter.setMapOutputValueClass(LongWritable.class);
        counter.setOutputKeyClass(Text.class);
        counter.setOutputValueClass(LongWritable.class);
        counter.setNumReduceTasks(1);
        counter.setOutputFormat(TextOutputFormat.class);
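        // don't write Hadoop's _SUCCESS marker into the output directory;
        // later steps read these directories directly and a stray marker
        // file would be picked up as data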
        counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        // run the counter job, outputs to a single reduce task and file
        LOG.info("Starting link counter job");
        try {
            JobClient.runJob(counter);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        LOG.info("Finished link counter job");

        // read the first (and only) line from the file, which should be the
        // number of nodes in the web graph
        LOG.info("Reading numlinks temp file");
        FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
        BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
        String numLinksLine = buffer.readLine();
        readLinks.close();

        // check that there are nodes to process; if none, the webgraph may be empty
        if (numLinksLine == null || numLinksLine.length() == 0) {
            fs.delete(numLinksPath, true);
            throw new IOException("No links to process, is the webgraph empty?");
        }

        // delete the temp file, then parse and return the node count as an int
        LOG.info("Deleting numlinks temp file");
        fs.delete(numLinksPath, true);
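        // the single output line is "<key><TAB><count>", e.g. "_num_nodes_\t1000",
        // so splitting on whitespace leaves the total in field 1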
        String numLinks = numLinksLine.split("\\s+")[1];
        return Integer.parseInt(numLinks);
    }

    /**
     * Runs the initializer job. The initializer job sets up the nodes with a
     * default starting score for link analysis.
     * 
     * @param nodeDb The node database to use.
     * @param output The job output directory.
     * 
     * @throws IOException If an error occurs while running the initializer job.
     */
    private void runInitializer(Path nodeDb, Path output) throws IOException {

        // configure the initializer
        JobConf initializer = new NutchJob(getConf());
        initializer.setJobName("LinkAnalysis Initializer");
        FileInputFormat.addInputPath(initializer, nodeDb);
        FileOutputFormat.setOutputPath(initializer, output);
        initializer.setInputFormat(SequenceFileInputFormat.class);
        initializer.setMapperClass(Initializer.class);
        initializer.setMapOutputKeyClass(Text.class);
        initializer.setMapOutputValueClass(Node.class);
        initializer.setOutputKeyClass(Text.class);
        initializer.setOutputValueClass(Node.class);
        initializer.setOutputFormat(MapFileOutputFormat.class);
        initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        // run the initializer
        LOG.info("Starting initialization job");
        try {
            JobClient.runJob(initializer);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        LOG.info("Finished initialization job.");
    }

    /**
     * Runs the inverter job. The inverter job flips outlinks to inlinks to be
     * passed into the analysis job.
     * 
     * The inverter job takes a link loops database if one exists. The loops
     * database is an optional component of link analysis, due to its extreme
     * computational and space requirements, but it can be very useful in weeding
     * out and eliminating link farms and other spam pages.
     * 
     * @param nodeDb The node database to use.
     * @param outlinkDb The outlink database to use.
     * @param loopDb The loop database to use if it exists.
     * @param output The output directory.
     * 
     * @throws IOException If an error occurs while running the inverter job.
     */
    private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {

        // configure the inverter
        JobConf inverter = new NutchJob(getConf());
        inverter.setJobName("LinkAnalysis Inverter");
        FileInputFormat.addInputPath(inverter, nodeDb);
        FileInputFormat.addInputPath(inverter, outlinkDb);

        // add the loop database as input if it exists (i.e. is not null)
        if (loopDb != null) {
            FileInputFormat.addInputPath(inverter, loopDb);
        }
        FileOutputFormat.setOutputPath(inverter, output);
        inverter.setInputFormat(SequenceFileInputFormat.class);
        inverter.setMapperClass(Inverter.class);
        inverter.setReducerClass(Inverter.class);
        inverter.setMapOutputKeyClass(Text.class);
        inverter.setMapOutputValueClass(ObjectWritable.class);
        inverter.setOutputKeyClass(Text.class);
        inverter.setOutputValueClass(LinkDatum.class);
        inverter.setOutputFormat(SequenceFileOutputFormat.class);
        inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        // run the inverter job
        LOG.info("Starting inverter job");
        try {
            JobClient.runJob(inverter);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        LOG.info("Finished inverter job.");
    }

    /**
     * Runs the link analysis job. The link analysis job applies the link rank
     * formula to create a score per url and stores that score in the NodeDb.
     * 
     * Typically the link analysis job is run a number of times to allow the link
     * rank scores to converge.
     * 
     * @param nodeDb The node database from which we are getting previous link
     * rank scores.
     * @param inverted The inverted inlinks.
     * @param output The link analysis output.
     * @param iteration The current iteration number.
     * @param numIterations The total number of link analysis iterations.
     * @param rankOne The rank-one score assigned to pages with zero inlinks.
     * 
     * @throws IOException If an error occurs during link analysis.
     */
    private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations,
            float rankOne) throws IOException {

        JobConf analyzer = new NutchJob(getConf());
        analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
        analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
        FileInputFormat.addInputPath(analyzer, nodeDb);
        FileInputFormat.addInputPath(analyzer, inverted);
        FileOutputFormat.setOutputPath(analyzer, output);
        analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
        analyzer.setMapOutputKeyClass(Text.class);
        analyzer.setMapOutputValueClass(ObjectWritable.class);
        analyzer.setInputFormat(SequenceFileInputFormat.class);
        analyzer.setMapperClass(Analyzer.class);
        analyzer.setReducerClass(Analyzer.class);
        analyzer.setOutputKeyClass(Text.class);
        analyzer.setOutputValueClass(Node.class);
        analyzer.setOutputFormat(MapFileOutputFormat.class);
        analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

        LOG.info("Starting analysis job");
        try {
            JobClient.runJob(analyzer);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        LOG.info("Finished analysis job.");
    }

    /**
     * The Counter job that determines the total number of nodes in the WebGraph.
     * This is used to determine a rank one score for pages with zero inlinks but
     * that contain outlinks.
     */
    private static class Counter
            implements Mapper<Text, Node, Text, LongWritable>, Reducer<Text, LongWritable, Text, LongWritable> {

        private JobConf conf;
        private static Text numNodes = new Text(NUM_NODES);
        private static LongWritable one = new LongWritable(1L);

        public void configure(JobConf conf) {
            this.conf = conf;
        }

        /**
         * Outputs one for every node.
         */
        public void map(Text key, Node value, OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            output.collect(numNodes, one);
        }

        /**
         * Sums the per-node counts and outputs a single total value.
         */
        public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output,
                Reporter reporter) throws IOException {

            long total = 0;
            while (values.hasNext()) {
                total += values.next().get();
            }
            output.collect(numNodes, new LongWritable(total));
        }

        public void close() {
        }
    }

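    /**
     * The Initializer job that assigns every node in the NodeDb the configured
     * starting score (link.analyze.initial.score, 1.0 by default) before the
     * first analysis iteration.
     */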
    private static class Initializer implements Mapper<Text, Node, Text, Node> {

        private JobConf conf;
        private float initialScore = 1.0f;

        public void configure(JobConf conf) {
            this.conf = conf;
            initialScore = conf.getFloat("link.analyze.initial.score", 1.0f);
        }

        public void map(Text key, Node node, OutputCollector<Text, Node> output, Reporter reporter)
                throws IOException {

            String url = key.toString();
            Node outNode = (Node) WritableUtils.clone(node, conf);
            outNode.setInlinkScore(initialScore);

            output.collect(new Text(url), outNode);
        }

        public void close() {
        }
    }

    /**
     * Inverts outlinks and attaches current score from the NodeDb of the
     * WebGraph. The link analysis process consists of inverting, analyzing and
     * scoring, in a loop for a given number of iterations.
     */
    private static class Inverter implements Mapper<Text, Writable, Text, ObjectWritable>,
            Reducer<Text, ObjectWritable, Text, LinkDatum> {

        private JobConf conf;

        public void configure(JobConf conf) {
            this.conf = conf;
        }

        /**
         * Convert values to ObjectWritable
         */
        public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter)
                throws IOException {

            ObjectWritable objWrite = new ObjectWritable();
            objWrite.set(value);
            output.collect(key, objWrite);
        }

        /**
         * Inverts outlinks to inlinks, attaches current score for the outlink from
         * the NodeDb of the WebGraph and removes any outlink that is contained
         * within the loopset.
         */
        public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, LinkDatum> output,
                Reporter reporter) throws IOException {

            String fromUrl = key.toString();
            List<LinkDatum> outlinks = new ArrayList<LinkDatum>();
            Node node = null;
            LoopSet loops = null;

            // aggregate outlinks, assign other values
            while (values.hasNext()) {
                ObjectWritable write = values.next();
                Object obj = write.get();
                if (obj instanceof Node) {
                    node = (Node) obj;
                } else if (obj instanceof LinkDatum) {
                    outlinks.add((LinkDatum) WritableUtils.clone((LinkDatum) obj, conf));
                } else if (obj instanceof LoopSet) {
                    loops = (LoopSet) obj;
                }
            }

            // Check for the possibility of a LoopSet object without Node and LinkDatum objects. This can happen
            // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL filters or normalizers) but
            // without an updated Loops database.
            // See: https://issues.apache.org/jira/browse/NUTCH-1299
            if (node == null && loops != null) {
                // Nothing to do
                LOG.warn("LoopSet without Node object received for " + key.toString()
                        + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph.");
                return;
            }

            // get the number of outlinks and the current inlink and outlink scores
            // from the node of the url
            int numOutlinks = node.getNumOutlinks();
            float inlinkScore = node.getInlinkScore();
            float outlinkScore = node.getOutlinkScore();
            LOG.debug(fromUrl + ": num outlinks " + numOutlinks);

            // can't invert if no outlinks
            if (numOutlinks > 0) {

                Set<String> loopSet = (loops != null) ? loops.getLoopSet() : null;
                for (int i = 0; i < outlinks.size(); i++) {
                    LinkDatum outlink = outlinks.get(i);
                    String toUrl = outlink.getUrl();

                    // remove any url that is contained in the loopset
                    if (loopSet != null && loopSet.contains(toUrl)) {
                        LOG.debug(fromUrl + ": Skipping inverting inlink from loop " + toUrl);
                        continue;
                    }
                    outlink.setUrl(fromUrl);
                    outlink.setScore(outlinkScore);

                    // collect the inverted outlink
                    output.collect(new Text(toUrl), outlink);
                    LOG.debug(toUrl + ": inverting inlink from " + fromUrl + " origscore: " + inlinkScore
                            + " numOutlinks: " + numOutlinks + " inlinkscore: " + outlinkScore);
                }
            }
        }

        public void close() {
        }
    }

    /**
     * Runs a single link analysis iteration.
     */
    private static class Analyzer
            implements Mapper<Text, Writable, Text, ObjectWritable>, Reducer<Text, ObjectWritable, Text, Node> {

        private JobConf conf;
        private float dampingFactor = 0.85f;
        private float rankOne = 0.0f;
        private int itNum = 0;
        private boolean limitPages = true;
        private boolean limitDomains = true;

        /**
         * Configures the job, sets the damping factor, rank one score, and other
         * needed values for analysis.
         */
        public void configure(JobConf conf) {

            try {
                this.conf = conf;
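                // 0.85 is the classic damping factor from the original PageRank paper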
                this.dampingFactor = conf.getFloat("link.analyze.damping.factor", 0.85f);
                this.rankOne = conf.getFloat("link.analyze.rank.one", 0.0f);
                this.itNum = conf.getInt("link.analyze.iteration", 0);
                limitPages = conf.getBoolean("link.ignore.limit.page", true);
                limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
            } catch (Exception e) {
                LOG.error(StringUtils.stringifyException(e));
                throw new IllegalArgumentException(e);
            }
        }

        /**
         * Convert values to ObjectWritable
         */
        public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter)
                throws IOException {

            ObjectWritable objWrite = new ObjectWritable();
            objWrite.set(WritableUtils.clone(value, conf));
            output.collect(key, objWrite);
        }

        /**
         * Performs a single iteration of link analysis. The resulting scores are
         * stored in a temporary NodeDb which replaces the NodeDb of the WebGraph.
         */
        public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, Node> output,
                Reporter reporter) throws IOException {

            String url = key.toString();
            Set<String> domains = new HashSet<String>();
            Set<String> pages = new HashSet<String>();
            Node node = null;

            // a page with zero inlinks has a score of rankOne
            int numInlinks = 0;
            float totalInlinkScore = rankOne;

            while (values.hasNext()) {

                ObjectWritable next = values.next();
                Object value = next.get();
                if (value instanceof Node) {
                    node = (Node) value;
                } else if (value instanceof LinkDatum) {

                    LinkDatum linkDatum = (LinkDatum) value;
                    float scoreFromInlink = linkDatum.getScore();
                    String inlinkUrl = linkDatum.getUrl();
                    String inLinkDomain = URLUtil.getDomainName(inlinkUrl);
                    String inLinkPage = URLUtil.getPage(inlinkUrl);

                    // limit counting duplicate inlinks by pages or domains
                    if ((limitPages && pages.contains(inLinkPage))
                            || (limitDomains && domains.contains(inLinkDomain))) {
                        LOG.debug(url + ": ignoring " + scoreFromInlink + " from " + inlinkUrl
                                + ", duplicate page or domain");
                        continue;
                    }

                    // aggregate total inlink score
                    numInlinks++;
                    totalInlinkScore += scoreFromInlink;
                    domains.add(inLinkDomain);
                    pages.add(inLinkPage);
                    LOG.debug(url + ": adding " + scoreFromInlink + " from " + inlinkUrl + ", total: "
                            + totalInlinkScore);
                }
            }

            // calculate linkRank score formula
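            // e.g. with the default damping factor 0.85 and totalInlinkScore 2.0:
            // (1 - 0.85) + (0.85 * 2.0) = 0.15 + 1.7 = 1.85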
            float linkRankScore = (1 - this.dampingFactor) + (this.dampingFactor * totalInlinkScore);

            LOG.debug(url + ": score: " + linkRankScore + " num inlinks: " + numInlinks + " iteration: " + itNum);

            // store the score in a temporary NodeDb
            Node outNode = (Node) WritableUtils.clone(node, conf);
            outNode.setInlinkScore(linkRankScore);
            output.collect(key, outNode);
        }

        public void close() throws IOException {
        }
    }

    /**
     * Default constructor.
     */
    public LinkRank() {
        super();
    }

    /**
     * Configurable constructor.
     */
    public LinkRank(Configuration conf) {
        super(conf);
    }

    public void close() {
    }

    /**
     * Runs the complete link analysis job. The complete job determines the rank
     * one score, then runs through a given number of invert and analyze
     * iterations (10 by default), and finally replaces the NodeDb in the
     * WebGraph with the link rank output.
     * 
     * @param webGraphDb The WebGraph to run link analysis on.
     * 
     * @throws IOException If an error occurs during link analysis.
     */
    public void analyze(Path webGraphDb) throws IOException {

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("Analysis: starting at " + sdf.format(start));

        // store the link rank output under the webgraphdb temporarily; the
        // final scores get updated into the nodedb
        Path linkRank = new Path(webGraphDb, "linkrank");
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);

        // create the linkrank directory if needed
        if (!fs.exists(linkRank)) {
            fs.mkdirs(linkRank);
        }

        // the webgraph outlink and node database paths
        Path wgOutlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
        Path wgNodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
        Path nodeDb = new Path(linkRank, WebGraph.NODE_DIR);
        Path loopDb = new Path(webGraphDb, Loops.LOOPS_DIR);
        if (!fs.exists(loopDb)) {
            loopDb = null;
        }

        // get the total number of nodes in the webgraph, used for rank one, then
        // initialize all urls with a default score
        int numLinks = runCounter(fs, webGraphDb);
        runInitializer(wgNodeDb, nodeDb);
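        // rank one is 1/numLinks, e.g. 0.001 for a graph of 1,000 nodes; it is
        // the baseline score a page with zero inlinks receives during analysis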
        float rankOneScore = (1f / (float) numLinks);

        if (LOG.isInfoEnabled()) {
            LOG.info("Analysis: Number of links: " + numLinks);
            LOG.info("Analysis: Rank One: " + rankOneScore);
        }

        // run invert and analysis for a given number of iterations to allow the
        // link rank scores to converge
        int numIterations = conf.getInt("link.analyze.num.iterations", 10);
        for (int i = 0; i < numIterations; i++) {

            // the input to inverting is always the previous output from analysis
            LOG.info("Analysis: Starting iteration " + (i + 1) + " of " + numIterations);
            Path tempRank = new Path(linkRank + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
            fs.mkdirs(tempRank);
            Path tempInverted = new Path(tempRank, "inverted");
            Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR);

            // run invert and analysis
            runInverter(nodeDb, wgOutlinkDb, loopDb, tempInverted);
            runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations, rankOneScore);

            // replace the temporary NodeDb with the output from analysis
            LOG.info("Analysis: Installing new link scores");
            FSUtils.replace(fs, linkRank, tempRank, true);
            LOG.info("Analysis: finished iteration " + (i + 1) + " of " + numIterations);
        }

        // replace the NodeDb in the WebGraph with the final output of analysis
        LOG.info("Analysis: Installing web graph nodes");
        FSUtils.replace(fs, wgNodeDb, nodeDb, true);

        // remove the temporary link rank folder
        fs.delete(linkRank, true);
        long end = System.currentTimeMillis();
        LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new LinkRank(), args);
        System.exit(res);
    }

    /**
     * Runs the LinkRank tool.
     */
    public int run(String[] args) throws Exception {

        Options options = new Options();
        Option helpOpts = OptionBuilder.withArgName("help").withDescription("show this help message")
                .create("help");
        Option webgraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()
                .withDescription("the web graph db to use").create("webgraphdb");
        options.addOption(helpOpts);
        options.addOption(webgraphOpts);

        CommandLineParser parser = new GnuParser();
        try {

            CommandLine line = parser.parse(options, args);
            if (line.hasOption("help") || !line.hasOption("webgraphdb")) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("LinkRank", options);
                return -1;
            }

            String webGraphDb = line.getOptionValue("webgraphdb");

            analyze(new Path(webGraphDb));
            return 0;
        } catch (Exception e) {
            LOG.error("LinkAnalysis: " + StringUtils.stringifyException(e));
            return -2;
        }
    }
}
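
Notes

Every tunable in the tool is a plain Hadoop configuration property read in the code above. The sketch below sets each property to its default value and then calls analyze() directly instead of going through ToolRunner; crawl/webgraphdb is again a hypothetical path.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.scoring.webgraph.LinkRank;
import org.apache.nutch.util.NutchConfiguration;

public class TunedLinkRank {
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        conf.setInt("link.analyze.num.iterations", 10);      // invert/analyze rounds
        conf.setFloat("link.analyze.initial.score", 1.0f);   // score before iteration 1
        conf.setFloat("link.analyze.damping.factor", 0.85f); // PageRank-style damping
        conf.setBoolean("link.ignore.limit.page", true);     // count one inlink per page
        conf.setBoolean("link.ignore.limit.domain", true);   // count one inlink per domain
        // "crawl/webgraphdb" is a hypothetical WebGraph location
        new LinkRank(conf).analyze(new Path("crawl/webgraphdb"));
    }
}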