org.apache.nutch.indexer.field.CustomFields.java Source code


Introduction

Here is the source code for org.apache.nutch.indexer.field.CustomFields.java, a Nutch MapReduce tool that converts tab-separated text files of field data into FieldWritable objects for indexing.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer.field;

import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

/**
 * Creates custom FieldWritable objects from a text file containing field
 * information including field name, value, and optional boost and fields type
 * (as needed by FieldWritable objects).
 * 
 * An input text file to CustomFields is tab-separated and would look similar
 * to this:
 * 
 * <pre> 
 * http://www.apache.org\tlang\ten\t5.0\tCONTENT
 * http://lucene.apache.org\tlang\tde
 * </pre>
 * 
 * The only required fields are url, name, and value. Custom fields are
 * configured through the custom-fields.xml file in the classpath. The config
 * file allows you to set defaults for whether a field is indexed, stored, and
 * tokenized, per-field boosts, and whether a field can output multiple values
 * under the same key.
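 * 
 * As a minimal sketch, assuming the java.util.Properties XML format that
 * loadFromXML reads and the property keys consumed by the Converter below
 * (the "lang" prefix is an arbitrary grouping name), an entry in
 * custom-fields.xml might look like:
 * 
 * <pre>
 * <?xml version="1.0" encoding="UTF-8"?>
 * <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
 * <properties>
 *   <entry key="lang.name">lang</entry>
 *   <entry key="lang.indexed">true</entry>
 *   <entry key="lang.stored">true</entry>
 *   <entry key="lang.tokenized">false</entry>
 *   <entry key="lang.multi">false</entry>
 * </properties>
 * </pre>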
 * 
 * The purpose of the CustomFields job is to allow better integration with
 * technologies such as Hadoop Streaming. Streaming jobs can be created in any
 * programming language, can output the text file needed by the CustomFields
 * job, and those fields can then be included in the index.
 * 
 * The concept of custom fields requires two separate pieces: an indexing piece
 * and a query piece. The indexing piece is handled by the CustomFields job.
 * The query piece is handled by the query-custom plugin.
 * 
 * <b>Important:</b><br> <i>Currently, because of the way the query plugin
 * architecture works, custom field names must be added to the fields parameter
 * in the plugin.xml file of the query-custom plugin in order to be queried.</i>
 * 
 * The CustomFields tool accepts one or more directories containing text files
 * in the appropriate custom field format. These files are then turned into
 * FieldWritable objects to be included in the index.
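 * 
 * A sketch of a command-line invocation, assuming the tool is launched through
 * the bin/nutch script (the directory paths are hypothetical):
 * 
 * <pre>
 * bin/nutch org.apache.nutch.indexer.field.CustomFields \
 *   -basicfields crawl/basicfields \
 *   -input crawl/customfields/txt \
 *   -output crawl/customfields/fields
 * </pre>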
 */
public class CustomFields extends Configured implements Tool {

    public static final Log LOG = LogFactory.getLog(CustomFields.class);

    /**
     * MapReduce job that converts text values into FieldWritable objects.
     * 
     * @param inputs The directories with text files to convert.
     * @param output The converter output directory.
     * 
     * @throws IOException If an error occurs while converting.
     */
    private void runConverter(Path[] inputs, Path output) throws IOException {

        JobConf converter = new NutchJob(getConf());
        converter.setJobName("CustomFields Converter");
        for (int i = 0; i < inputs.length; i++) {
            FileInputFormat.addInputPath(converter, inputs[i]);
        }
        FileOutputFormat.setOutputPath(converter, output);
        converter.setInputFormat(TextInputFormat.class);
        converter.setMapperClass(Converter.class);
        converter.setReducerClass(Converter.class);
        converter.setMapOutputKeyClass(Text.class);
        converter.setMapOutputValueClass(FieldWritable.class);
        converter.setOutputKeyClass(Text.class);
        converter.setOutputValueClass(FieldWritable.class);
        converter.setOutputFormat(SequenceFileOutputFormat.class);

        LOG.info("Starting converter job");
        try {
            JobClient.runJob(converter);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        LOG.info("Finished converter job.");
    }

    /**
     * Aggregates multiple FieldWritable objects with the same name. Depending on
     * settings in the custom-fields.xml file, a field may allow one or more
     * values. This job aggregates fields and then collects them based on the
     * configuration settings.
     * 
     * @param basicFields The basicfields FieldWritable objects.
     * @param converted The converted custom field objects.
     * @param output The final output directory for custom field objects.
     * 
     * @throws IOException If an error occurs while converting.
     */
    private void runCollector(Path basicFields, Path converted, Path output) throws IOException {

        JobConf collector = new NutchJob(getConf());
        collector.setJobName("CustomFields Collector");
        FileInputFormat.addInputPath(collector, converted);
        FileInputFormat.addInputPath(collector, basicFields);
        FileOutputFormat.setOutputPath(collector, output);
        collector.setInputFormat(SequenceFileInputFormat.class);
        collector.setMapOutputKeyClass(Text.class);
        collector.setMapOutputValueClass(ObjectWritable.class);
        collector.setMapperClass(Collector.class);
        collector.setReducerClass(Collector.class);
        collector.setOutputKeyClass(Text.class);
        collector.setOutputValueClass(FieldWritable.class);
        collector.setOutputFormat(SequenceFileOutputFormat.class);

        LOG.info("Starting collector job");
        try {
            JobClient.runJob(collector);
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
        LOG.info("Finished collector job.");
    }

    /**
     * Converts text values into FieldWritable objects.
     */
    public static class Converter extends Configured implements Mapper<LongWritable, Text, Text, FieldWritable>,
            Reducer<Text, FieldWritable, Text, FieldWritable> {

        private JobConf conf;
        private Map<String, boolean[]> flagMap = new HashMap<String, boolean[]>();
        private Set<String> multiFields = new HashSet<String>();

        public Converter() {
        }

        public Converter(Configuration conf) {
            setConf(conf);
        }

        public void configure(JobConf conf) {

            try {

                // get the configuration file from the classpath
                this.conf = conf;
                String configFile = conf.get("custom.fields.config", "custom-fields.xml");
                LOG.info("Reading configuration field configuration from " + configFile);
                Properties customFieldProps = new Properties();
                InputStream fis = conf.getConfResourceAsInputStream(configFile);
                if (fis == null) {
                    throw new IOException("Was unable to open " + configFile);
                }

                // load the configuration file as properties
                customFieldProps.loadFromXML(fis);

                // loop through the properties setting field flags
                Enumeration<Object> propKeys = customFieldProps.keys();
                while (propKeys.hasMoreElements()) {
                    String prop = (String) propKeys.nextElement();
                    if (prop.endsWith(".name")) {
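                        // e.g. the key "lang.name" yields propName "lang", which
                        // prefixes the .indexed, .stored, .tokenized, and .multi
                        // keys read below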
                        String propName = prop.substring(0, prop.length() - 5);
                        String name = customFieldProps.getProperty(prop);

                        // missing flag properties default to "" and therefore false
                        String indexedProp = customFieldProps.getProperty(propName + ".indexed", "");
                        String storedProp = customFieldProps.getProperty(propName + ".stored", "");
                        String tokProp = customFieldProps.getProperty(propName + ".tokenized", "");
                        boolean indexed = (indexedProp.equalsIgnoreCase("yes")
                                || indexedProp.equalsIgnoreCase("true") || indexedProp.equalsIgnoreCase("on"));
                        boolean stored = (storedProp.equalsIgnoreCase("yes") || storedProp.equalsIgnoreCase("true")
                                || storedProp.equalsIgnoreCase("on"));
                        boolean tokenized = (tokProp.equalsIgnoreCase("yes") || tokProp.equalsIgnoreCase("true")
                                || tokProp.equalsIgnoreCase("on"));
                        boolean[] flags = { indexed, stored, tokenized };
                        flagMap.put(name, flags);

                        String multiProp = customFieldProps.getProperty(propName + ".multi", "");
                        boolean multi = (multiProp.equalsIgnoreCase("yes") || multiProp.equalsIgnoreCase("true")
                                || multiProp.equalsIgnoreCase("on"));
                        if (multi) {
                            multiFields.add(name);
                        }
                    }
                }
            } catch (Exception e) {
                LOG.error("Error loading custom field properties:\n" + StringUtils.stringifyException(e));
            }
        }

        public void map(LongWritable key, Text value, OutputCollector<Text, FieldWritable> output,
                Reporter reporter) throws IOException {

            // split the file on tabs
            String line = value.toString();
            String[] fields = line.split("\t");
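            // e.g. "http://www.apache.org\tlang\ten\t5.0\tCONTENT" splits into
            // url, name, value, and the optional boost and type columns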
            if (fields.length >= 3) {

                // fields must be in a specific order, default values for optional fields
                String url = fields[0];
                String fieldName = fields[1];
                String fieldVal = fields[2];
                String fieldScore = (fields.length > 3 ? fields[3] : null);
                String fieldType = (fields.length > 4 ? fields[4] : "CONTENT").toUpperCase();

                // creates the FieldWritable objects and collects
                boolean[] flags = flagMap.get(fieldName);
                if (flags != null) {
                    FieldWritable field = null;
                    if (fieldScore != null) {
                        field = new FieldWritable(fieldName, fieldVal, FieldType.valueOf(fieldType),
                                Float.parseFloat(fieldScore), flags[0], flags[1], flags[2]);
                    } else {
                        field = new FieldWritable(fieldName, fieldVal, FieldType.valueOf(fieldType), flags[0],
                                flags[1], flags[2]);
                    }
                    output.collect(new Text(url), field);
                }
            }
        }

        public void reduce(Text key, Iterator<FieldWritable> values, OutputCollector<Text, FieldWritable> output,
                Reporter reporter) throws IOException {

            // if multiple fields are allowed collect all of them; if not allowed,
            // only the first value is collected and later duplicates are ignored
            Set<String> multiSet = new HashSet<String>();
            while (values.hasNext()) {
                FieldWritable field = values.next();
                String name = field.getName();
                boolean isMulti = multiFields.contains(name);
                if (isMulti || !multiSet.contains(name)) {
                    output.collect(key, field);
                    multiSet.add(name);
                } else {
                    LOG.info("Ignoring multiple " + name + " fields for " + key.toString());
                }
            }
        }

        public void close() {
        }
    }

    /**
     * Aggregates FieldWritable objects with the same name for the same URL. These
     * objects are then filtered for multiple values against the configuration
     * settings.
     */
    public static class Collector extends Configured implements Mapper<Text, Writable, Text, ObjectWritable>,
            Reducer<Text, ObjectWritable, Text, FieldWritable> {

        private JobConf conf;

        public void configure(JobConf conf) {
            this.conf = conf;
        }

        public void close() {
        }

        public void map(Text key, Writable value, OutputCollector<Text, ObjectWritable> output, Reporter reporter)
                throws IOException {

            ObjectWritable objWrite = new ObjectWritable();
            objWrite.set(value);
            output.collect(key, objWrite);
        }

        public void reduce(Text key, Iterator<ObjectWritable> values, OutputCollector<Text, FieldWritable> output,
                Reporter reporter) throws IOException {

            FieldsWritable basicFields = null;
            List<FieldWritable> customFields = new ArrayList<FieldWritable>();

            while (values.hasNext()) {
                ObjectWritable objWrite = values.next();
                Object obj = objWrite.get();
                if (obj instanceof FieldWritable) {
                    customFields.add((FieldWritable) obj);
                } else if (obj instanceof FieldsWritable) {
                    basicFields = (FieldsWritable) obj;
                }
            }

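            // custom fields are emitted only for urls that also have basic
            // fields; custom fields for unknown urls are dropped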
            if (basicFields != null && customFields.size() > 0) {
                for (int i = 0; i < customFields.size(); i++) {
                    output.collect(key, customFields.get(i));
                }
            }
        }
    }

    /**
     * Runs the converter and collector jobs to create the final custom field
     * output.
     * 
     * @param basicFields The directory with basicfields FieldWritable objects.
     * @param inputs The directories with text files to convert.
     * @param output The final output directory for custom field objects.
     * 
     * @throws IOException If an error occurs while running the jobs.
     */
    void createFields(Path basicFields, Path[] inputs, Path output) throws IOException {

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("CustomerFields: starting at " + sdf.format(start));

        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        Path tempFields = new Path(output + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        runConverter(inputs, tempFields);
        runCollector(basicFields, tempFields, output);
        fs.delete(tempFields, true);
        long end = System.currentTimeMillis();
        LOG.info("CommonFields: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new CustomFields(), args);
        System.exit(res);
    }

    /**
     * Runs the CustomFields job.
     */
    public int run(String[] args) throws Exception {

        Options options = new Options();
        Option helpOpts = OptionBuilder.withArgName("help").withDescription("show this help message")
                .create("help");
        Option outputOpts = OptionBuilder.withArgName("output").hasArg()
                .withDescription("the output index directory").create("output");
        Option inputOpts = OptionBuilder.withArgName("input").hasArgs()
                .withDescription("the input directories with text field files").create("input");
        Option basicFieldOpts = OptionBuilder.withArgName("basicfields").hasArg()
                .withDescription("the basicfields to use").create("basicfields");
        options.addOption(helpOpts);
        options.addOption(inputOpts);
        options.addOption(basicFieldOpts);
        options.addOption(outputOpts);

        CommandLineParser parser = new GnuParser();
        try {

            CommandLine line = parser.parse(options, args);
            if (line.hasOption("help") || !line.hasOption("input") || !line.hasOption("output")
                    || !line.hasOption("basicfields")) {
                HelpFormatter formatter = new HelpFormatter();
                formatter.printHelp("CustomFields", options);
                return -1;
            }

            String[] inputs = line.getOptionValues("input");
            Path[] inputPaths = new Path[inputs.length];
            for (int i = 0; i < inputs.length; i++) {
                inputPaths[i] = new Path(inputs[i]);
            }
            String output = line.getOptionValue("output");
            String basicFields = line.getOptionValue("basicfields");

            createFields(new Path(basicFields), inputPaths, new Path(output));
            return 0;
        } catch (Exception e) {
            LOG.fatal("CustomFields: " + StringUtils.stringifyException(e));
            return -2;
        }
    }
}