DistribCountingDriver.java Source code

Java tutorial

Introduction

Here is the source code for DistribCountingDriver.java

Source

/*
 * Copyright 2012-14 Justin A. Debrabant <debrabant@cs.brown.edu> and Matteo Riondato <matteo@cs.brown.edu>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class DistribCountingDriver extends Configured implements Tool {

    public static void main(String args[]) throws Exception {
        if (args.length < 4) {
            System.out.println(
                    "usage: java DistribCountingDriver <minFreqPercent> <datasetSize> <path to input database> <path to output global FIs>");
            System.exit(1);
        }

        int res = ToolRunner.run(new DistribCountingDriver(), args);

        System.exit(res);
    }

    public int run(String args[]) throws Exception {
        long job_start_time, job_end_time;
        long job_runtime;

        JobConf conf = new JobConf(getConf());

        int minFreqPercent = Integer.parseInt(args[0]);
        int datasetSize = Integer.parseInt(args[1]);
        conf.setInt("DISTRCOUNT.datasetSize", datasetSize);
        conf.setInt("DISTRCOUNT.minFreqPercent", minFreqPercent);

        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        conf.setInt("mapred.task.timeout", 60000000);

        conf.setJarByClass(DistribCountingDriver.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(DistribCountingMapper.class);
        conf.setCombinerClass(DistribCountingCombiner.class);
        conf.setReducerClass(DistribCountingReducer.class);

        conf.setInputFormat(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(conf, new Path(args[2]));
        FileOutputFormat.setOutputPath(conf, new Path(args[3]));

        job_start_time = System.currentTimeMillis();
        JobClient.runJob(conf);
        job_end_time = System.currentTimeMillis();

        job_runtime = (job_end_time - job_start_time) / 1000;

        System.out.println("total job runtime (seconds): " + job_runtime);

        return 0;
    }

}