Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Source Link

Documentation

Set the combiner class for the job.

Usage

From source file:nl.utwente.bigdata.shouting.ShoutingExtactor.java

License:Apache License

/**
 * Configures and runs the "Extract Shouting Words" MapReduce job.
 *
 * <p>Generic Hadoop options are consumed by {@link GenericOptionsParser}; of the
 * remaining arguments, every one but the last is added as an input path and the
 * last one is used as the output path. The JVM exits with status 2 when fewer
 * than two arguments remain, with 0 when the job succeeds and with 1 otherwise.
 *
 * @param args generic Hadoop options followed by {@code <in> [<in>...] <out>}
 * @throws Exception if option parsing, job configuration or job execution fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: exampleTwitter <in> [<in>...] <out>");
        System.exit(2);
    }

    // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
    Job job = Job.getInstance(conf, "Extract Shouting Words");
    job.setJarByClass(ShoutingExtactor.class);
    job.setMapperClass(MapReducers.ShoutingWordsMapper.class);
    // The same reducer class is registered as combiner and as reducer.
    job.setCombinerClass(MapReducers.CounterReducer.class);
    job.setReducerClass(MapReducers.CounterReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // All remaining arguments except the last one are inputs; the last is the output.
    int outputIndex = otherArgs.length - 1;
    for (int i = 0; i < outputIndex; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:nl.utwente.bigdata.TemplateTool.java

License:Apache License

/**
 * Configures the MapReduce job of this tool and waits for it to complete.
 *
 * <p>If the output directory already exists, a message is printed to
 * {@code System.err} and the JVM is terminated with status 1, because the
 * "output already exists" option is fixed to {@code "exit"}; with
 * {@code "delete"} the directory would be removed recursively instead.
 *
 * @param inputPath path that is added as the job's input
 * @param outPath   directory the job writes its output to
 * @throws Exception if the file system check, the job setup or the job run fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(TemplateTool.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch used to test for "text" a second time and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    // job.setNumReduceTasks(100);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}

From source file:nl.utwente.bigdata.WordCount.java

License:Apache License

/**
 * Configures and runs the "word count" MapReduce job.
 *
 * <p>The JVM exits with status 2 when fewer than two arguments are given, with 0
 * when the job succeeds and with 1 otherwise.
 *
 * @param args 0: input path; 1: output path
 * @throws Exception if job configuration or job execution fails
 */
public static void main(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
        System.err.println("Usage: " + WordCount.class.getSimpleName() + " <in> <out>");
        System.exit(2);
    }

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(CountMapper.class);
    // The same reducer class is registered as combiner and as reducer.
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:nl.utwente.mirex.AnchorExtract.java

License:Open Source License

/**
 * Runs the MapReduce job "anchor text extraction".
 *
 * <p>The JVM exits with status 1 when the argument count is not exactly two or
 * when the job fails, and with status 0 when the job succeeds.
 *
 * @param args 0: path to web collection on HDFS; 1: (non-existing) path that will contain anchor texts
 * @throws Exception if job configuration or job execution fails
 * @usage.
 * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/&#x2a;/ /user/hadoop/ClueWeb09_Anchors </code>
 */
public static void main(String[] args) throws Exception {
    // Validate the command line before any job object is built.
    if (args.length != 2) {
        System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName());
        System.out.println("          inputFiles: path to data");
        System.out.println("          outputFile: directory where anchor text is stored");
        System.exit(1);
    }
    String inputFiles = args[0];
    String outputFile = args[1];

    // Set job configuration
    Configuration conf = new Configuration();
    conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout
    Job job = new Job(conf, "AnchorExtract");
    job.setJarByClass(AnchorExtract.class);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(Combine.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(WarcFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputFiles)); // '(conf, args[0])' to accept comma-separated list.
    FileOutputFormat.setOutputPath(job, new Path(outputFile));
    // Output is written gzip-compressed.
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Report job failure through the process exit status instead of dropping the result.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:nl.utwente.mirex.QueryTermCount.java

License:Open Source License

/**
 * Configure the Hadoop job.
 *
 * <p>The job uses {@code Map} as mapper and {@code Reduce} as both combiner and
 * reducer, emits {@code Text}/{@code LongWritable} pairs, runs with a single
 * reduce task and has {@code topicFile} registered as a distributed cache file.
 * The job is only configured here, not submitted.
 *
 * @param jobName   name given to the job
 * @param format    input format selector, either {@code "KEYVAL"} or {@code "WARC"}
 * @param inputFile input path of the job
 * @param tempOut   output path of the job
 * @param topicFile query file that is added to the distributed cache
 * @return the configured job
 * @throws IOException if creating or configuring the job fails
 * @throws InvalidParameterException if {@code format} is neither {@code "KEYVAL"} nor {@code "WARC"}
 */
public static Job configureJob(String jobName, String format, Path inputFile, Path tempOut, Path topicFile)
        throws IOException, InvalidParameterException {
    // Set job configuration
    Job job = new Job();
    job.setJobName(jobName);
    job.setJarByClass(QueryTermCount.class);

    // Set intermediate output (override defaults)
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    // Set output (override defaults)
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set map-reduce classes
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set input-output format; constant-first equals makes a null format take the
    // documented InvalidParameterException path instead of a NullPointerException.
    if ("KEYVAL".equals(format)) {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    } else if ("WARC".equals(format)) {
        job.setInputFormatClass(WarcTextConverterInputFormat.class);
    } else {
        throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL");
    }
    job.setOutputFormatClass(TextOutputFormat.class);
    // also works without
    //conf.set("mapred.output.compress", false);
    job.setNumReduceTasks(1);

    // Set input-output paths
    FileInputFormat.setInputPaths(job, inputFile);
    FileOutputFormat.setOutputPath(job, tempOut);

    // Set job specific distributed cache file (query file)
    DistributedCache.addCacheFile(topicFile.toUri(), job.getConfiguration());

    return job;
}

From source file:nl.utwente.mirex.TrecRun.java

License:Open Source License

/**
 * Runs the MapReduce job "trec run"/*  w ww.  ja  v  a 2s  .  c om*/
 * @param args 0: path to parsed document collection (use AnchorExtract); 1: (non-existing) path that will contain run results; 2: TREC query file
 * @usage. see README.html 
 */
public static void main(String[] args) throws Exception {

    if (args.length != 3 && args.length != 4) {
        System.out.printf("Usage: %s [inputFormat] inputFiles outputFile topicFile \n",
                TrecRun.class.getSimpleName());
        System.out.println("          inputFormat: either WARC or KEYVAL; default WARC");
        System.out.println("          inputFiles: the WARC files");
        System.out.println("          outputFiles: output directory");
        System.out.println("          topicFile: topic descriptions (one query per line)");
        System.exit(1);
    }
    int argc = 0;
    String inputFormat = "WARC";
    if (args.length > 3) {
        inputFormat = args[argc++].toUpperCase();
    }
    String inputFiles = args[argc++];
    String outputFile = args[argc++];
    String topicFile = args[argc++];

    // Set job configuration
    Job job = new Job();
    job.setJobName("MirexTrecRun");
    job.setJarByClass(TrecRun.class);

    // Set intermediate output (override defaults)
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Set output (override defaults)
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Set map-reduce classes
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set input-output format
    if (inputFormat.equals("KEYVAL")) {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    } else if (inputFormat.equals("WARC")) {
        job.setInputFormatClass(WarcTextConverterInputFormat.class);
    } else {
        throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL");
    }
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set input-output paths
    FileInputFormat.setInputPaths(job, new Path(inputFiles));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    // Set job specific distributed cache file (query file)
    DistributedCache.addCacheFile(new Path(topicFile).toUri(), job.getConfiguration());

    // Run the job
    job.waitForCompletion(true);
}

From source file:nl.utwente.mirex.TrecRunBaselines.java

License:Open Source License

/**
 * Runs the MapReduce job "trec baseline runs".
 *
 * <p>An optional leading argument selects the input format (WARC or KEYVAL,
 * default WARC). The JVM exits with status 1 on a usage error or when the job
 * fails, and with status 0 when the job succeeds.
 *
 * @param args 0: path to parsed document collection (use AnchorExtract); 1: (non-existing) path that will contain run results; 2: MIREX query file
 * @throws Exception if job configuration or job execution fails
 * @usage.
 * <code> % hadoop jar mirex-0.2.jar nl.utwente.mirex.TrecRunBaselines /user/hadoop/ClueWeb09_Anchors/&#x2a; /user/hadoop/BaselineOut /user/hadoop/wt09-topics-stats.txt </code>
 */
public static void main(String[] args) throws Exception {
    if (args.length != 3 && args.length != 4) {
        // The usage text names this class and lists the arguments in the order they are parsed below.
        System.out.printf("Usage: %s [inputFormat] inputFiles outputFile topicFile\n",
                TrecRunBaselines.class.getSimpleName());
        System.out.println("          inputFormat: either WARC or KEYVAL; default WARC");
        System.exit(1);
    }

    // With four arguments the first one is the input format and the rest shift by one.
    int argc = 0;
    String inputFormat = "WARC";
    if (args.length > 3) {
        inputFormat = args[argc++].toUpperCase();
    }
    String inputFiles = args[argc++];
    String outputFile = args[argc++];
    String topicFile = args[argc++];

    // Set job configuration
    Job job = new Job();
    job.setJobName("MirexBaselineRuns");
    job.setJarByClass(TrecRunBaselines.class);

    // Set intermediate output (override defaults)
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Set output (override defaults)
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Set map-reduce classes
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set input-output format
    if (inputFormat.equals("KEYVAL")) {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    } else if (inputFormat.equals("WARC")) {
        job.setInputFormatClass(WarcTextConverterInputFormat.class);
    } else {
        throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL");
    }
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set input-output paths
    FileInputFormat.setInputPaths(job, new Path(inputFiles));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    // Set job specific distributed cache file (query file)
    DistributedCache.addCacheFile(new Path(topicFile).toUri(), job.getConfiguration());

    // Run the job and report failure through the process exit status
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDay.java

License:Apache License

/**
 * Configures the car-count-per-road-per-day MapReduce job and waits for it to complete.
 *
 * <p>If the output directory already exists, a message is printed to
 * {@code System.err} and the JVM is terminated with status 1, because the
 * "output already exists" option is fixed to {@code "exit"}. The job is
 * configured with a single reduce task and uncompressed text output.
 *
 * @param inputPath path that is added as the job's input
 * @param outPath   directory the job writes its output to
 * @throws Exception if the file system check, the job setup or the job run fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(CarCountPerRoadPerDay.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch used to test for "text" a second time and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}

From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDayIncreasedValidity.java

License:Apache License

/**
 * Sets up this tool's MapReduce job and blocks until the job has finished.
 *
 * <p>An already existing output directory makes the method print a message to
 * {@code System.err} and terminate the JVM with status 1, since the
 * "output already exists" option is fixed to {@code "exit"}. The job is
 * configured with a single reduce task and uncompressed text output.
 *
 * @param inputPath path that is added as the job's input
 * @param outPath   directory the job writes its output to
 * @throws Exception if the file system check, the job setup or the job run fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(CarCountPerRoadPerDayIncreasedValidity.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch used to test for "text" a second time and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}

From source file:nl.utwente.trafficanalyzer.ReadingsPerSensor.java

License:Apache License

/**
 * Configures the readings-per-sensor MapReduce job and waits for it to complete.
 *
 * <p>When the output directory is already present, a message goes to
 * {@code System.err} and the JVM is terminated with status 1, as the
 * "output already exists" option is fixed to {@code "exit"}. The job is
 * configured with a single reduce task and uncompressed text output.
 *
 * @param inputPath path that is added as the job's input
 * @param outPath   directory the job writes its output to
 * @throws Exception if the file system check, the job setup or the job run fails
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(ReadingsPerSensor.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch used to test for "text" a second time and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    job.waitForCompletion(true);
}