redpoll.text.TfIdfDriver.java Source code

Introduction

Here is the source code for redpoll.text.TfIdfDriver.java
Source

/** 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package redpoll.text;

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.GenericsUtil;

/**
 * The Driver which drives the Tf-Idf based vector space model generation.
 *
 * <p>The driver expects two things beneath the input directory:
 * <ul>
 *   <li>{@code <input>/tf} - sequence files used as the input of the job;</li>
 *   <li>{@code <input>/df/part-00000} - a sequence file of
 *       ({@link Text}, {@link IntWritable}) records. The record whose key is
 *       {@code redpoll.docs.num} carries the document count; every other
 *       record's key is treated as a term.</li>
 * </ul>
 *
 * <p>Before the job is submitted, the following properties are stored in the
 * job configuration:
 * <ul>
 *   <li>{@code redpoll.docs.num} - value of the document-count record
 *       (only set if such a record exists);</li>
 *   <li>{@code redpoll.text.terms.num} - number of terms read;</li>
 *   <li>{@code redpoll.text.terms} - the stringified term-to-index map.</li>
 * </ul>
 *
 * @author Jeremy Chow(coderplay@gmail.com)
 */
public class TfIdfDriver {

    /** Configuration key (and df-file record key) of the document count. */
    private static final String DOCS_NUM_KEY = "redpoll.docs.num";

    /** Configuration key holding the number of distinct terms. */
    private static final String TERMS_NUM_KEY = "redpoll.text.terms.num";

    /** Configuration key holding the stringified term-to-index map. */
    private static final String TERMS_KEY = "redpoll.text.terms";

    /**
     * Configures and runs the Tf-Idf job, blocking until
     * {@link JobClient#runJob(JobConf)} returns.
     *
     * @param input the input pathname String; {@code /tf} and
     *        {@code /df/part-00000} are appended to it to locate the term
     *        frequency input and the document frequency file
     * @param output the output pathname String
     * @throws IOException if the file system cannot be obtained, the document
     *         frequency file cannot be read, or the job submission fails
     */
    public static void runJob(String input, String output) throws IOException {
        JobConf conf = new JobConf(TfIdfDriver.class);

        FileSystem fs = FileSystem.get(conf);
        FileInputFormat.setInputPaths(conf, new Path(input + "/tf"));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        conf.setMapperClass(TfIdfMapper.class);
        conf.setReducerClass(TfIdfReducer.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(TfIdfWritable.class);
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(TfIdfOutputFormat.class);

        // Must be set before the DefaultStringifier below is created: it is
        // built from this configuration and has to serialize a plain
        // java.util.HashMap, which is Serializable but not a Writable.
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        // NOTE(review): only the first partition of the df output is read;
        // presumably the df job runs with a single reducer - confirm.
        Path dfPath = new Path(input + "/df/part-00000");
        HashMap<String, Integer> termMap = readTermIndex(fs, dfPath, conf);

        // Serialize the term map. Its key is the term, its value is the index
        // of that term in the term vector.
        DefaultStringifier<HashMap<String, Integer>> mapStringifier =
                new DefaultStringifier<HashMap<String, Integer>>(
                        conf, GenericsUtil.getClass(termMap));
        conf.setInt(TERMS_NUM_KEY, termMap.size()); // number of terms
        conf.set(TERMS_KEY, mapStringifier.toString(termMap));

        // runJob is static, so no JobClient instance is needed.
        JobClient.runJob(conf);
    }

    /**
     * Scans the document frequency file and assigns each term an index.
     * The record keyed {@code redpoll.docs.num} is not a term: its value is
     * copied into {@code conf} under the same key instead.
     *
     * @param fs the file system holding {@code dfPath}
     * @param dfPath the document frequency sequence file
     * @param conf the job configuration; used to open the reader and updated
     *        with the document count when that record is present
     * @return a map from term to its index, numbered from 0 in read order
     * @throws IOException if the file cannot be opened or read
     */
    private static HashMap<String, Integer> readTermIndex(FileSystem fs, Path dfPath,
            JobConf conf) throws IOException {
        HashMap<String, Integer> termMap = new HashMap<String, Integer>();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dfPath, conf);
        try {
            Text key = new Text();
            IntWritable value = new IntWritable();
            int index = 0;
            while (reader.next(key, value)) {
                String termString = key.toString();
                if (DOCS_NUM_KEY.equals(termString)) {
                    conf.setInt(DOCS_NUM_KEY, value.get());
                } else {
                    termMap.put(termString, index);
                    // Advance for every term record, exactly one per record read.
                    index++;
                }
            }
            // A repeated term would make termMap.size() smaller than index, so
            // register the count the same way the records were numbered.
            if (index != termMap.size()) {
                throw new IOException("Duplicate terms in " + dfPath + ": read " + index
                        + " term records but only " + termMap.size() + " distinct terms");
            }
        } finally {
            // Close the reader even when reading fails part-way through.
            reader.close();
        }
        return termMap;
    }

    /**
     * Command-line entry point, for test.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *        output path; any further arguments are ignored
     * @throws IOException if {@link #runJob(String, String)} fails
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("Usage: TfIdfDriver <input> <output>");
        }
        runJob(args[0], args[1]);
    }
}