Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page collects example usages of the setCombinerClass method of org.apache.hadoop.mapreduce.Job.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException

Source Link

Document

Set the combiner class for the job.

Usage

From source file:nl.utwente.bigdata.shouting.ShoutingExtactor.java

License:Apache License

/**
 * Entry point: configures and runs the "Extract Shouting Words" job.
 *
 * <p>After the generic Hadoop options have been stripped, every remaining argument except
 * the last is added as an input path and the last one is used as the output path. The same
 * {@code CounterReducer} class is registered as both combiner and reducer.
 *
 * <p>The process exits with status 2 when fewer than two paths are given, 0 when the job
 * succeeds and 1 when it fails.
 *
 * @param args generic Hadoop options followed by {@code <in> [<in>...] <out>}
 * @throws Exception if the job cannot be configured or run
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: exampleTwitter <in> [<in>...] <out>");
        System.exit(2);
    }

    // Use the factory method instead of the deprecated Job(Configuration, String) constructor.
    Job job = Job.getInstance(conf, "Extract Shouting Words");
    job.setJarByClass(ShoutingExtactor.class);
    job.setMapperClass(MapReducers.ShoutingWordsMapper.class);
    job.setCombinerClass(MapReducers.CounterReducer.class);
    job.setReducerClass(MapReducers.CounterReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // All but the last remaining argument are inputs; the last one is the output directory.
    int outputIndex = otherArgs.length - 1;
    for (int i = 0; i < outputIndex; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:nl.utwente.bigdata.TemplateTool.java

License:Apache License

/**
 * Configures a MapReduce job from this tool's configuration and runs it to completion.
 *
 * <p>The job is named after this class together with the input and output paths. If the
 * output directory already exists, the method prints a message to stderr and terminates the
 * JVM with status 1, because the "already exists" option is hard-coded to {@code "exit"}.
 * Input and output formats are likewise selected by hard-coded option strings (both
 * {@code "text"}).
 *
 * <p>NOTE: the boolean result of {@code waitForCompletion} is not inspected, so a failed job
 * is not reported to the caller.
 *
 * @param inputPath path added as the job's input
 * @param outPath   path used as the job's output directory
 * @throws Exception if configuring or running the job fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(TemplateTool.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // Previously this branch tested "text" a second time and could never be reached.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    // job.setNumReduceTasks(100);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}

From source file:nl.utwente.bigdata.WordCount.java

License:Apache License

/**
 * Entry point: configures and runs the "word count" job.
 *
 * <p>{@code IntSumReducer} is registered as both combiner and reducer. The process exits
 * with status 2 when fewer than two arguments are supplied, 0 when the job succeeds and 1
 * when it fails.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @throws Exception if the job cannot be configured or run
 */
public static void main(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException below.
    if (args.length < 2) {
        System.err.println("Usage: WordCount <in> <out>");
        System.exit(2);
    }

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);

    job.setMapperClass(CountMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:nl.utwente.mirex.AnchorExtract.java

License:Open Source License

/**
 * Runs the MapReduce job "anchor text extraction".
 *
 * <p>Arguments are validated before any job is created. Output is written as gzip-compressed
 * text. If the job does not complete successfully the process exits with status 1.
 *
 * @param args 0: path to web collection on HDFS; 1: (non-existing) path that will contain anchor texts
 * @throws Exception if the job cannot be configured or run
 * @usage.
 * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/&#x2a;/ /user/hadoop/ClueWeb09_Anchors </code>
 */
public static void main(String[] args) throws Exception {
    // Check the command line first so a bad invocation never touches the cluster configuration
    if (args.length != 2) {
        System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName());
        System.out.println("          inputFiles: path to data");
        System.out.println("          outputFile: directory where anchor text is stored");
        System.exit(1);
    }
    String inputFiles = args[0];
    String outputFile = args[1];

    // Set job configuration
    Configuration conf = new Configuration();
    conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout
    Job job = new Job(conf, "AnchorExtract");
    job.setJarByClass(AnchorExtract.class);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(Combine.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(WarcFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputFiles)); // '(conf, args[0])' to accept comma-separated list.
    FileOutputFormat.setOutputPath(job, new Path(outputFile));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Report a failed job through the exit status instead of silently returning
    if (!job.waitForCompletion(true)) {
        System.exit(1);
    }
}

From source file:nl.utwente.mirex.QueryTermCount.java

License:Open Source License

/**
 * Configure the Hadoop job.
 *
 * <p>Creates a job with {@code Text} keys and {@code LongWritable} values for both the
 * intermediate and the final output, a single reduce task, and {@code Reduce} registered as
 * combiner as well as reducer. The topic file is added to the distributed cache. The job is
 * returned without being submitted.
 *
 * @param jobName   name given to the job
 * @param format    input format selector; must be exactly {@code "KEYVAL"} or {@code "WARC"}
 * @param inputFile input path of the job
 * @param tempOut   output path of the job
 * @param topicFile query file added to the distributed cache
 * @return the configured job
 * @throws IOException               if the job cannot be created
 * @throws InvalidParameterException if {@code format} is neither {@code "KEYVAL"} nor {@code "WARC"}
 */
public static Job configureJob(String jobName, String format, Path inputFile, Path tempOut, Path topicFile)
        throws IOException, InvalidParameterException {
    // Set job configuration
    Job job = new Job();
    job.setJobName(jobName);
    job.setJarByClass(QueryTermCount.class);

    // Set intermediate output (override defaults)
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    // Set output (override defaults)
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set map-reduce classes
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set input-output format
    if (format.equals("KEYVAL")) {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    } else if (format.equals("WARC")) {
        job.setInputFormatClass(WarcTextConverterInputFormat.class);
    } else {
        throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL");
    }
    job.setOutputFormatClass(TextOutputFormat.class);
    // also works without
    //conf.set("mapred.output.compress", false);
    job.setNumReduceTasks(1);

    // Set input-output paths
    FileInputFormat.setInputPaths(job, inputFile);
    FileOutputFormat.setOutputPath(job, tempOut);

    // Set job specific distributed cache file (query file)
    DistributedCache.addCacheFile(topicFile.toUri(), job.getConfiguration());

    return job;
}

From source file:nl.utwente.mirex.TrecRun.java

License:Open Source License

/**
 * Runs the MapReduce job "trec run"/*  w ww.  ja  v  a 2s  .  c om*/
 * @param args 0: path to parsed document collection (use AnchorExtract); 1: (non-existing) path that will contain run results; 2: TREC query file
 * @usage. see README.html 
 */
public static void main(String[] args) throws Exception {

    if (args.length != 3 && args.length != 4) {
        System.out.printf("Usage: %s [inputFormat] inputFiles outputFile topicFile \n",
                TrecRun.class.getSimpleName());
        System.out.println("          inputFormat: either WARC or KEYVAL; default WARC");
        System.out.println("          inputFiles: the WARC files");
        System.out.println("          outputFiles: output directory");
        System.out.println("          topicFile: topic descriptions (one query per line)");
        System.exit(1);
    }
    int argc = 0;
    String inputFormat = "WARC";
    if (args.length > 3) {
        inputFormat = args[argc++].toUpperCase();
    }
    String inputFiles = args[argc++];
    String outputFile = args[argc++];
    String topicFile = args[argc++];

    // Set job configuration
    Job job = new Job();
    job.setJobName("MirexTrecRun");
    job.setJarByClass(TrecRun.class);

    // Set intermediate output (override defaults)
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Set output (override defaults)
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Set map-reduce classes
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set input-output format
    if (inputFormat.equals("KEYVAL")) {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    } else if (inputFormat.equals("WARC")) {
        job.setInputFormatClass(WarcTextConverterInputFormat.class);
    } else {
        throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL");
    }
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set input-output paths
    FileInputFormat.setInputPaths(job, new Path(inputFiles));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    // Set job specific distributed cache file (query file)
    DistributedCache.addCacheFile(new Path(topicFile).toUri(), job.getConfiguration());

    // Run the job
    job.waitForCompletion(true);
}

From source file:nl.utwente.mirex.TrecRunBaselines.java

License:Open Source License

/**
 * Runs the MapReduce job "trec baseline runs".
 *
 * <p>With four arguments the first one selects the input format (upper-cased before it is
 * compared); with three arguments the format defaults to WARC. The topic file is added to
 * the distributed cache. If the job does not complete successfully the process exits with
 * status 1.
 *
 * @param args 0: path to parsed document collection (use AnchorExtract); 1: (non-existing) path that will contain run results; 2: MIREX query file
 * @throws Exception if the job cannot be configured or run; an
 *         {@code InvalidParameterException} is thrown when the input format is neither
 *         WARC nor KEYVAL
 * @usage.
 * <code> % hadoop jar mirex-0.2.jar nl.utwente.mirex.TrecRunBaselines /user/hadoop/ClueWeb09_Anchors/&#x2a; /user/hadoop/BaselineOut /user/hadoop/wt09-topics-stats.txt </code>
 */
public static void main(String[] args) throws Exception {
    if (args.length != 3 && args.length != 4) {
        // The argument order shown here must match the parsing below: input, output, topics.
        System.out.printf("Usage: %s [inputFormat] inputFiles outputFile topicFile\n",
                TrecRunBaselines.class.getSimpleName());
        System.out.println("          inputFormat: either WARC or KEYVAL; default WARC");
        System.exit(1);
    }

    int argc = 0;
    String inputFormat = "WARC";
    if (args.length > 3) {
        // Optional leading argument overrides the default input format
        inputFormat = args[argc++].toUpperCase();
    }
    String inputFiles = args[argc++];
    String outputFile = args[argc++];
    String topicFile = args[argc++];

    // Set job configuration
    Job job = new Job();
    job.setJobName("MirexBaselineRuns");
    job.setJarByClass(TrecRunBaselines.class);

    // Set intermediate output (override defaults)
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Set output (override defaults)
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Set map-reduce classes
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set input-output format
    if (inputFormat.equals("KEYVAL")) {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    } else if (inputFormat.equals("WARC")) {
        job.setInputFormatClass(WarcTextConverterInputFormat.class);
    } else {
        throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL");
    }
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set input-output paths
    FileInputFormat.setInputPaths(job, new Path(inputFiles));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    // Set job specific distributed cache file (query file)
    DistributedCache.addCacheFile(new Path(topicFile).toUri(), job.getConfiguration());

    // Run the job; report a failure through the exit status instead of silently returning
    if (!job.waitForCompletion(true)) {
        System.exit(1);
    }
}

From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDay.java

License:Apache License

/**
 * Configures a MapReduce job from this tool's configuration and runs it to completion.
 *
 * <p>The job is named after this class together with the input and output paths and uses a
 * single reduce task. If the output directory already exists, the method prints a message to
 * stderr and terminates the JVM with status 1, because the "already exists" option is
 * hard-coded to {@code "exit"}. Input and output formats are likewise selected by hard-coded
 * option strings (both {@code "text"}).
 *
 * <p>NOTE: the boolean result of {@code waitForCompletion} is not inspected, so a failed job
 * is not reported to the caller.
 *
 * @param inputPath path added as the job's input
 * @param outPath   path used as the job's output directory
 * @throws Exception if configuring or running the job fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(CarCountPerRoadPerDay.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // Previously this branch tested "text" a second time and could never be reached.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}

From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDayIncreasedValidity.java

License:Apache License

/**
 * Configures a MapReduce job from this tool's configuration and runs it to completion.
 *
 * <p>The job is named after this class together with the input and output paths and uses a
 * single reduce task. If the output directory already exists, the method prints a message to
 * stderr and terminates the JVM with status 1, because the "already exists" option is
 * hard-coded to {@code "exit"}. Input and output formats are likewise selected by hard-coded
 * option strings (both {@code "text"}).
 *
 * <p>NOTE: the boolean result of {@code waitForCompletion} is not inspected, so a failed job
 * is not reported to the caller.
 *
 * @param inputPath path added as the job's input
 * @param outPath   path used as the job's output directory
 * @throws Exception if configuring or running the job fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(CarCountPerRoadPerDayIncreasedValidity.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // Previously this branch tested "text" a second time and could never be reached.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);
    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}

From source file:nl.utwente.trafficanalyzer.ReadingsPerSensor.java

License:Apache License

/**
 * Configures a MapReduce job from this tool's configuration and runs it to completion.
 *
 * <p>The job is named after this class together with the input and output paths and uses a
 * single reduce task. If the output directory already exists, the method prints a message to
 * stderr and terminates the JVM with status 1, because the "already exists" option is
 * hard-coded to {@code "exit"}. Input and output formats are likewise selected by hard-coded
 * option strings (both {@code "text"}).
 *
 * <p>NOTE: the boolean result of {@code waitForCompletion} is not inspected, so a failed job
 * is not reported to the caller.
 *
 * @param inputPath path added as the job's input
 * @param outPath   path used as the job's output directory
 * @throws Exception if configuring or running the job fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(ReadingsPerSensor.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // Previously this branch tested "text" a second time and could never be reached.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}