org.apache.nutch.indexer.field.AnchorFields.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.indexer.field.AnchorFields.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer.field;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.scoring.webgraph.LinkDatum;
import org.apache.nutch.scoring.webgraph.Node;
import org.apache.nutch.scoring.webgraph.WebGraph;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

/**
 * Creates FieldWritable objects for inbound anchor text.   These FieldWritable
 * objects are then included in the input to the FieldIndexer to be converted
 * to Lucene Field objects and indexed.
 * 
 * Empty or null anchor text is ignored by default. Anchors are sorted in
 * descending order according to the score of their parent pages. There are
 * settings for the maximum number of anchors to index and for whether those
 * anchors should be stored and tokenized. Sorting by descending score and
 * capping the number of anchors ensures that only the best anchors are
 * indexed, assuming that a higher link analysis score means a better page and
 * better inbound text.
 */
public class AnchorFields extends Configured implements Tool {

    public static final Log LOG = LogFactory.getLog(AnchorFields.class);

    /**
     * Comparator to order the links in descending order by score.
     *
     * The ordering is a consistent total order: links whose score is NaN are
     * placed after every link with a real score, and two NaN scores compare as
     * equal. A plain ==/&gt; comparison would report NaN as "greater" from both
     * sides, which violates the Comparator contract and can make sorting fail
     * or produce an arbitrary order.
     */
    private static class DescendinLinkDatumScoreComparator implements Comparator<LinkDatum> {

        /**
         * Compares two links so that the higher score sorts first.
         *
         * @param one The first link.
         * @param two The second link.
         *
         * @return A negative value if one should come before two, a positive
         * value if it should come after, zero if they rank the same.
         */
        public int compare(LinkDatum one, LinkDatum two) {
            float scoreOne = one.getScore();
            float scoreTwo = two.getScore();

            boolean oneIsNaN = Float.isNaN(scoreOne);
            boolean twoIsNaN = Float.isNaN(scoreTwo);
            if (oneIsNaN || twoIsNaN) {
                // a NaN score carries no ranking information, push it to the end
                if (oneIsNaN && twoIsNaN) {
                    return 0;
                }
                return oneIsNaN ? 1 : -1;
            }

            // operands are swapped to get descending order
            return Float.compare(scoreTwo, scoreOne);
        }
    }

    /**
     * Configures and runs the extractor job. The job reads the outlink and
     * node directories of the WebGraphDb as sequence files, uses Extractor as
     * both mapper and reducer, and writes Text keys with LinkDatum values to
     * the output path as a sequence file.
     * 
     * @param webGraphDb The WebGraphDb to pull from.
     * @param output The extractor output.
     * 
     * @throws IOException If an error occurs while running the extractor. The
     * error is logged before it is rethrown.
     */
    private void runExtractor(Path webGraphDb, Path output) throws IOException {

        JobConf job = new NutchJob(getConf());
        job.setJobName("AnchorFields Extractor");

        // inputs: outlinks first, then nodes, both under the web graph
        Path outlinkDir = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
        Path nodeDir = new Path(webGraphDb, WebGraph.NODE_DIR);
        FileInputFormat.addInputPath(job, outlinkDir);
        FileInputFormat.addInputPath(job, nodeDir);
        job.setInputFormat(SequenceFileInputFormat.class);

        // the same class serves as mapper and reducer
        job.setMapperClass(Extractor.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObjectWritable.class);
        job.setReducerClass(Extractor.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LinkDatum.class);

        // output
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        LOG.info("Starting extractor job");
        try {
            JobClient.runJob(job);
        } catch (IOException failure) {
            LOG.error(StringUtils.stringifyException(failure));
            throw failure;
        }
        LOG.info("Finished extractor job.");
    }

    /**
     * Configures and runs the collector job. The job reads the extracted links
     * and the basic fields as sequence files, uses Collector as both mapper
     * and reducer, and writes Text keys with FieldWritable values to the
     * output path as a sequence file.
     * 
     * @param basicFields The BasicFields which must be present to collect anchors
     * to avoid orphan fields.
     * @param links The path holding the links produced by the extractor.
     * @param output The collector output.
     * 
     * @throws IOException If an error occurs while running the collector. The
     * error is logged before it is rethrown.
     */
    private void runCollector(Path basicFields, Path links, Path output) throws IOException {

        JobConf job = new NutchJob(getConf());
        job.setJobName("AnchorFields Collector");

        // inputs: extracted links first, then the basic fields
        FileInputFormat.addInputPath(job, links);
        FileInputFormat.addInputPath(job, basicFields);
        job.setInputFormat(SequenceFileInputFormat.class);

        // the same class serves as mapper and reducer
        job.setMapperClass(Collector.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObjectWritable.class);
        job.setReducerClass(Collector.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FieldWritable.class);

        // output
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        LOG.info("Starting collector job");
        try {
            JobClient.runJob(job);
        } catch (IOException failure) {
            LOG.error(StringUtils.stringifyException(failure));
            throw failure;
        }
        LOG.info("Finished collector job.");
    }

    /**
     * Mapper and reducer that turn the outlinks of a page into inlink records
     * keyed by the target url. Links with empty or null anchor text are
     * skipped unless "link.ignore.empty.anchors" is set to false.
     */
    public static class Extractor extends Configured implements Mapper<Text, Writable, Text, ObjectWritable>,
            Reducer<Text, ObjectWritable, Text, LinkDatum> {

        private boolean ignoreEmptyAnchors = true;
        private JobConf conf;

        /**
         * Default constructor.
         */
        public Extractor() {
        }

        /**
         * Configurable constructor.
         * 
         * @param conf The configuration to set on this object.
         */
        public Extractor(Configuration conf) {
            setConf(conf);
        }

        /**
         * Keeps the job configuration and reads the
         * "link.ignore.empty.anchors" flag, which defaults to true.
         */
        public void configure(JobConf conf) {
            this.conf = conf;
            ignoreEmptyAnchors = conf.getBoolean("link.ignore.empty.anchors", true);
        }

        /**
         * Passes each record through under its original key, wrapped in an
         * ObjectWritable so that different value types can share one reduce.
         */
        public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter)
                throws IOException {

            ObjectWritable wrapped = new ObjectWritable();
            wrapped.set(value);
            output.collect(key, wrapped);
        }

        /**
         * Inverts the outlinks of the page identified by key. Each kept outlink
         * is emitted under its target url, with its url replaced by the source
         * url, its score replaced by the inlink score of the source node, and
         * its link type set to INLINK. Nothing is emitted when no Node value
         * was seen for the key or when no outlink was kept.
         */
        public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, LinkDatum> output,
                Reporter reporter) throws IOException {

            Node sourceNode = null;
            List<LinkDatum> keptLinks = new ArrayList<LinkDatum>();

            // split the mixed values into the node and its usable outlinks
            while (values.hasNext()) {
                Object payload = values.next().get();
                if (payload instanceof LinkDatum) {
                    LinkDatum link = (LinkDatum) payload;
                    String anchorText = link.getAnchor();
                    // the trimmed text is only used to decide whether the anchor is blank
                    boolean blankAnchor = (anchorText == null) || anchorText.trim().length() == 0;
                    if (!ignoreEmptyAnchors || !blankAnchor) {
                        keptLinks.add(link);
                    }
                } else if (payload instanceof Node) {
                    sourceNode = (Node) payload;
                }
            }

            // without a node or without outlinks there is nothing to invert
            if (sourceNode == null || keptLinks.isEmpty()) {
                return;
            }

            String sourceUrl = key.toString();
            float sourceScore = sourceNode.getInlinkScore();
            for (LinkDatum link : keptLinks) {
                // remember the target before the url is overwritten with the source
                String targetUrl = link.getUrl();
                link.setUrl(sourceUrl);
                link.setScore(sourceScore);
                link.setLinkType(LinkDatum.INLINK);
                output.collect(new Text(targetUrl), link);
            }
        }

        /**
         * No resources to release.
         */
        public void close() {
        }
    }

    /**
     * Mapper and reducer that turn inlinks into anchor FieldWritable objects.
     * For each url the inlinks are sorted by descending score and at most
     * "link.max.inlinks" of them are emitted.
     */
    public static class Collector extends Configured implements Mapper<Text, Writable, Text, ObjectWritable>,
            Reducer<Text, ObjectWritable, Text, FieldWritable> {

        private int maxInlinks = 1000;
        private boolean tokenize = true;
        private boolean stored = false;
        private Comparator<LinkDatum> descLinkComp = new DescendinLinkDatumScoreComparator();

        /**
         * Reads the job settings: "link.max.inlinks" (default 1000),
         * "indexer.anchor.tokenize" (default true) and
         * "indexer.anchor.stored" (default false).
         */
        public void configure(JobConf conf) {
            this.stored = conf.getBoolean("indexer.anchor.stored", false);
            this.tokenize = conf.getBoolean("indexer.anchor.tokenize", true);
            this.maxInlinks = conf.getInt("link.max.inlinks", 1000);
        }

        /**
         * No resources to release.
         */
        public void close() {
        }

        /**
         * Passes each record through under its original key, wrapped in an
         * ObjectWritable so that different value types can share one reduce.
         */
        public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter)
                throws IOException {

            ObjectWritable wrapped = new ObjectWritable();
            wrapped.set(value);
            output.collect(key, wrapped);
        }

        /**
         * Gathers the inlinks and the basic fields for a url. If basic fields
         * are present and at least one inlink exists, the inlinks are sorted by
         * descending score and up to the configured maximum are emitted as
         * anchor fields under the same key.
         */
        public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, FieldWritable> output,
                Reporter reporter) throws IOException {

            FieldsWritable basicFields = null;
            List<LinkDatum> inlinks = new ArrayList<LinkDatum>();

            // split the mixed values into inlinks and the basic fields marker
            while (values.hasNext()) {
                Object payload = values.next().get();
                if (payload instanceof LinkDatum) {
                    inlinks.add((LinkDatum) payload);
                } else if (payload instanceof FieldsWritable) {
                    basicFields = (FieldsWritable) payload;
                }
            }

            // urls without basic fields would end up as orphan entries that are
            // indexed only under their anchor text, so skip them
            if (basicFields == null || inlinks.isEmpty()) {
                return;
            }

            // best scoring inlinks first
            Collections.sort(inlinks, descLinkComp);

            // emit no more than the configured maximum
            int limit = Math.min(maxInlinks, inlinks.size());
            int emitted = 0;
            while (emitted < limit) {
                String anchorText = inlinks.get(emitted).getAnchor();
                output.collect(key, new FieldWritable(Fields.ANCHOR, anchorText, FieldType.CONTENT, true,
                        stored, tokenize));
                emitted++;
            }
        }
    }

    /**
     * Creates the anchor fields output by running the extractor job followed
     * by the collector job.
     * 
     * The extractor writes to a temporary directory whose name is the output
     * path plus a random numeric suffix. That directory is deleted once the
     * collector has finished. If either job fails, a best-effort delete is
     * still attempted so that a failed run does not leave intermediate data
     * behind; a failure of that cleanup is logged and the original error is
     * the one that propagates.
     * 
     * @param webGraphDb The WebGraph from which to pull outlinks.
     * @param basicFields The BasicFields that must be present to avoid orphan
     * anchor fields.
     * @param output The AnchorFields output.
     * 
     * @throws IOException If an error occurs while creating the fields, or if
     * the temporary directory cannot be deleted after both jobs succeeded.
     */
    public void createFields(Path webGraphDb, Path basicFields, Path output) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("AnchorFields: starting at " + sdf.format(start));

        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        Path tempLinks = new Path(output + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        boolean jobsSucceeded = false;
        try {
            runExtractor(webGraphDb, tempLinks);
            runCollector(basicFields, tempLinks, output);
            jobsSucceeded = true;
        } finally {
            if (!jobsSucceeded) {
                // a job failed: clean up without masking the exception in flight
                try {
                    fs.delete(tempLinks, true);
                } catch (IOException cleanupFailure) {
                    LOG.warn("AnchorFields: could not delete temporary directory " + tempLinks + ": "
                            + StringUtils.stringifyException(cleanupFailure));
                }
            }
        }

        // normal path: a failing delete is reported to the caller as before
        fs.delete(tempLinks, true);

        long end = System.currentTimeMillis();
        LOG.info("AnchorFields: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    }

    /**
     * Command line entry point. Runs AnchorFields through ToolRunner with a
     * Nutch configuration and exits the JVM with the returned status code.
     */
    public static void main(String[] args) throws Exception {
        Configuration nutchConf = NutchConfiguration.create();
        int exitCode = ToolRunner.run(nutchConf, new AnchorFields(), args);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs the AnchorFields job.
     * 
     * Usage is printed and -1 is returned when help is requested or when any
     * of the webgraphdb, output or basicfields options is missing. Otherwise
     * the fields are created and 0 is returned. Any exception raised while
     * parsing or creating the fields is logged and reported as -2.
     */
    public int run(String[] args) throws Exception {

        Options cliOptions = new Options();
        cliOptions.addOption(OptionBuilder.withArgName("help").withDescription("show this help message")
                .create("help"));
        cliOptions.addOption(OptionBuilder.withArgName("webgraphdb").hasArg()
                .withDescription("the webgraphdb to use").create("webgraphdb"));
        cliOptions.addOption(OptionBuilder.withArgName("basicfields").hasArgs()
                .withDescription("the basicfields to use").create("basicfields"));
        cliOptions.addOption(OptionBuilder.withArgName("output").hasArg()
                .withDescription("the output index directory").create("output"));

        CommandLineParser cliParser = new GnuParser();
        try {

            CommandLine parsed = cliParser.parse(cliOptions, args);

            // all three paths are mandatory, anything less gets the usage text
            boolean hasAllPaths = parsed.hasOption("webgraphdb") && parsed.hasOption("output")
                    && parsed.hasOption("basicfields");
            if (parsed.hasOption("help") || !hasAllPaths) {
                new HelpFormatter().printHelp("AnchorFields", cliOptions);
                return -1;
            }

            Path webGraphDbPath = new Path(parsed.getOptionValue("webgraphdb"));
            Path basicFieldsPath = new Path(parsed.getOptionValue("basicfields"));
            Path outputPath = new Path(parsed.getOptionValue("output"));

            createFields(webGraphDbPath, basicFieldsPath, outputPath);
            return 0;
        } catch (Exception failure) {
            LOG.fatal("AnchorFields: " + StringUtils.stringifyException(failure));
            return -2;
        }
    }
}