edu.umd.cloud9.webgraph.BuildReverseWebGraph.java Source code

Java tutorial

Introduction

Here is the source code for edu.umd.cloud9.webgraph.BuildReverseWebGraph.java

Source

/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.umd.cloud9.webgraph;

import java.io.IOException;
import java.util.Collections;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.array.ArrayListOfInts;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;

/**
 *
 * @author Nima Asadi
 *
 */
public class BuildReverseWebGraph extends PowerTool {
    private static final Logger LOG = Logger.getLogger(BuildReverseWebGraph.class);

    public static class Reduce extends MapReduceBase
            implements Reducer<Text, ArrayListWritable<AnchorText>, IntWritable, ArrayListWritable<AnchorText>> {
        private static final IntWritable keyWord = new IntWritable();
        private static final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<AnchorText>();
        private static final ArrayListOfInts docnos = new ArrayListOfInts();
        private static ArrayListWritable<AnchorText> packet;
        private static boolean pushed;
        private int indegree;

        public void reduce(Text key, Iterator<ArrayListWritable<AnchorText>> values,
                OutputCollector<IntWritable, ArrayListWritable<AnchorText>> output, Reporter reporter)
                throws IOException {
            docnos.clear();
            arrayList.clear();
            indegree = 0;

            while (values.hasNext()) {
                packet = values.next();

                for (AnchorText data : packet) {
                    //docno field data
                    if (data.isDocnoField()) {
                        //in theory, there must be only one "docno" packet.
                        //Unless there are duplicate pages.
                        for (int docno : data) {
                            docnos.add(docno);
                        }
                        continue;
                    }

                    pushed = false;
                    indegree += data.getSize();

                    for (int i = 0; i < arrayList.size(); i++) {
                        if (arrayList.get(i).equalsIgnoreSources(data)) {
                            arrayList.get(i).addDocumentsFrom(data);
                            pushed = true;
                            break;
                        }
                    }

                    if (!pushed) {
                        arrayList.add(data.clone());
                    }
                }
            }

            arrayList.add(new AnchorText(AnchorTextConstants.Type.IN_DEGREE.val, null, indegree));
            arrayList.add(new AnchorText(AnchorTextConstants.Type.URL_FIELD.val, key.toString()));
            Collections.sort(arrayList);

            //if there was no document number detected,
            //this record would not be emitted.
            for (int docno : docnos) {
                keyWord.set(docno);
                output.collect(keyWord, arrayList);
            }
        }
    }

    public static final String[] RequiredParameters = { "Cloud9.InputPath", "Cloud9.OutputPath", "Cloud9.Mappers",
            "Cloud9.Reducers" };

    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    public BuildReverseWebGraph(Configuration conf) {
        super(conf);
    }

    public int runTool() throws Exception {
        JobConf conf = new JobConf(getConf(), BuildReverseWebGraph.class);
        FileSystem fs = FileSystem.get(conf);

        int numMappers = conf.getInt("Cloud9.Mappers", 1);
        int numReducers = conf.getInt("Cloud9.Reducers", 200);

        String inputPath = conf.get("Cloud9.InputPath");
        String outputPath = conf.get("Cloud9.OutputPath");

        conf.setJobName("ReverseWebGraph");
        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setInt("mapred.task.timeout", 60000000);
        conf.set("mapreduce.map.memory.mb", "2048");
        conf.set("mapreduce.map.java.opts", "-Xmx2048m");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
        conf.set("mapreduce.task.timeout", "60000000");

        conf.setNumMapTasks(numMappers);
        conf.setNumReduceTasks(numReducers);
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(Reduce.class);
        conf.setOutputKeyClass(IntWritable.class);
        conf.setOutputValueClass(ArrayListWritable.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(ArrayListWritable.class);

        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setCompressOutput(conf, true);
        SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

        SequenceFileInputFormat.setInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        LOG.info("BuildReverseWebGraph");
        LOG.info(" - input path: " + inputPath);
        LOG.info(" - output path: " + outputPath);

        if (!fs.exists(new Path(outputPath))) {
            JobClient.runJob(conf);
        } else {
            LOG.info(outputPath + " already exists! Skipping this step...");
        }

        return 0;
    }
}