sa.edu.kaust.fwindex.BuildIntDocVectorsForwardIndex.java Source code

Introduction

Here is the source code for sa.edu.kaust.fwindex.BuildIntDocVectorsForwardIndex.java, a Hadoop MapReduce tool that builds a forward index over an inverted index: for each term, it records which postings file holds the term's postings list and at which byte offset it starts.

Source

/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package sa.edu.kaust.fwindex;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import sa.edu.kaust.io.PostingWritable;
import sa.edu.kaust.io.TermDF;

import edu.umd.cloud9.io.array.ArrayListWritable;

/**
 * <p>
 * A Hadoop MapReduce job that creates a forward index for an inverted index for a 
 * collection of documents.
 * </p>
 * 
 * <ul>
 * <li>[inv-index] path to the inverted index directory
 * <li>[output-file] path to the output file
 * </ul>
 * 
 * <p>
 * Here's a sample invocation:
 * </p>
 * 
 * <blockquote>
 * 
 * <pre>
 * hadoop jar cloud9.jar sa.edu.kaust.fwindex.BuildIntDocVectorsForwardIndex \
 * ./inv_index_output/ \
 * ./output_dir/forward
 * </pre>
 * 
 * </blockquote>
 * 
 * @author Abdurrahman Ahmed (adapted)
 */
@SuppressWarnings("deprecation")
public class BuildIntDocVectorsForwardIndex extends Configured implements Tool {

    private static final Logger sLogger = Logger.getLogger(BuildIntDocVectorsForwardIndex.class);

    protected enum Dictionary {
        Size
    }

    private static class MyMapRunner
            implements MapRunnable<TermDF, ArrayListWritable<PostingWritable>, TermDF, Text> {

        private String mInputFile;
        private Text outputValue = new Text();

        public void configure(JobConf job) {
            mInputFile = job.get("map.input.file"); // get the filename of the split
        }

        public void run(RecordReader<TermDF, ArrayListWritable<PostingWritable>> input,
                OutputCollector<TermDF, Text> output, Reporter reporter) throws IOException {
            TermDF key = input.createKey();
            ArrayListWritable<PostingWritable> value = input.createValue();
            // Part files are named like part-00000; the numeric suffix after
            // the last '-' identifies the postings file.
            int fileNo = Integer.parseInt(mInputFile.substring(mInputFile.lastIndexOf("-") + 1));

            long pos = input.getPos(); // byte offset at which the next record starts
            while (input.next(key, value)) {
                outputValue.set(fileNo + "\t" + pos);

                output.collect(key, outputValue);
                reporter.incrCounter(Dictionary.Size, 1);

                pos = input.getPos();
            }
            sLogger.info("last termid: " + key + "(" + fileNo + ", " + pos + ")");
        }
    }
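
    /*
     * MyMapRunner emits one (term, "fileNo \t byteOffset") pair per postings
     * list; the single reducer below packs each pair into one long and writes
     * the final index records.
     */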

    // Positions are packed into a single long: pos = BigNumber * fileNo + byteOffset.
    // The packing is only unambiguous while every postings file stays under
    // BigNumber (10^9) bytes.
    public static final long BigNumber = 1000000000;

    private static class MyReducer extends MapReduceBase implements Reducer<TermDF, Text, Text, Text> {

        FSDataOutputStream mOut;

        public void configure(JobConf job) {
            FileSystem fs;
            try {
                fs = FileSystem.get(job);
            } catch (Exception e) {
                throw new RuntimeException("Error opening the FileSystem!", e);
            }

            String forwardIndexPath = job.get("ForwardIndexPath");

            try {
                mOut = fs.create(new Path(forwardIndexPath), true);
            } catch (Exception e) {
                throw new RuntimeException("Error creating the forward index file!", e);
            }
        }

        public void reduce(TermDF key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            String[] s = values.next().toString().split("\\s+");

            // Each postings list appears exactly once across the input files,
            // so every term must map to exactly one (fileNo, offset) value.
            if (values.hasNext())
                throw new RuntimeException("There shouldn't be more than one value, key=" + key);

            int fileNo = Integer.parseInt(s[0]);
            long filePos = Long.parseLong(s[1]);
            long pos = BigNumber * fileNo + filePos; // pack file number and offset (see BigNumber above)

            // Write term and packed position as a single record; with one
            // reducer and one output stream, records can never interleave.
            mOut.writeUTF(key.getK_gram()[0] + "\t" + pos);
        }

        public void close() throws IOException {
            mOut.close();
        }
    }
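
    /*
     * The resulting forward index is a flat file of writeUTF records, one per
     * term: "term \t pos", where pos = BigNumber * fileNo + byteOffset locates
     * the term's postings list in the part files of the inverted index.
     */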

    private static int printUsage() {
        System.out.println("usage: [input] [output-dir] [mappings-file]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * Runs this tool.
     */
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            printUsage();
            return -1;
        }
        String inPath = args[0];
        String outPath = args[1];

        JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
        FileSystem fs = FileSystem.get(conf);

        int mapTasks = 10;
        sLogger.info("Tool: BuildIntDocVectorsIndex");

        String intDocVectorsPath = inPath;
        String forwardIndexPath = outPath;

        if (!fs.exists(new Path(intDocVectorsPath))) {
            sLogger.error("Error: IntDocVectors don't exist!");
            return -1;
        }

        if (fs.exists(new Path(forwardIndexPath))) {
            sLogger.info("IntDocVectorsForwardIndex already exists: skipping!");
            return 0;
        }

        conf.set("ForwardIndexPath", forwardIndexPath);

        conf.setJobName("BuildIntDocVectorsForwardIndex");

        Path inputPath = new Path(intDocVectorsPath);
        FileInputFormat.setInputPaths(conf, inputPath);

        conf.setNumMapTasks(mapTasks);
        conf.setNumReduceTasks(1);

        conf.set("mapred.child.java.opts", "-Xmx2048m");

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapOutputKeyClass(TermDF.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputFormat(NullOutputFormat.class);

        conf.setMapRunnerClass(MyMapRunner.class);
        conf.setReducerClass(MyReducer.class);

        JobClient.runJob(conf);

        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new BuildIntDocVectorsForwardIndex(), args);
        System.exit(res);
    }
}
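
Reading the index back

To make the packed-position format concrete, here is a minimal sketch of how a consumer might look up a term in the forward index and recover the location of its postings list. This is not part of the tool above: the class name ForwardIndexReader, the findTerm helper, the linear scan, and the part-00000 naming are all illustrative assumptions; only the writeUTF record format and the BigNumber decoding follow from the source.

import java.io.EOFException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Hypothetical reader for the forward index written by MyReducer above. */
public class ForwardIndexReader {

    private static final long BIG_NUMBER = 1000000000L; // must match BigNumber in the writer

    /**
     * Scans the forward index for a term and returns its packed position, or -1
     * if the term is absent. A real reader would load the records into a map
     * once instead of re-scanning the file per lookup.
     */
    public static long findTerm(FileSystem fs, Path forwardIndex, String term) throws IOException {
        FSDataInputStream in = fs.open(forwardIndex);
        try {
            while (true) {
                // Each record was written with writeUTF as "term \t pos".
                String[] rec = in.readUTF().split("\t");
                if (rec[0].equals(term)) {
                    return Long.parseLong(rec[1]);
                }
            }
        } catch (EOFException eof) {
            return -1; // reached end of index without finding the term
        } finally {
            in.close();
        }
    }

    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        long pos = findTerm(fs, new Path(args[0]), args[1]);
        if (pos >= 0) {
            // Undo the packing done in MyReducer.reduce().
            int fileNo = (int) (pos / BIG_NUMBER); // which postings part file
            long offset = pos % BIG_NUMBER;        // byte offset within that file
            System.out.printf("postings in part-%05d at byte offset %d%n", fileNo, offset);
        } else {
            System.out.println("term not found: " + args[1]);
        }
    }
}

A real deployment would keep the decoded records in memory (the index holds one short record per term), then open the matching SequenceFile part file and seek to the decoded offset to read the postings list.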