List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
From source file:nl.utwente.bigdata.shouting.ShoutingExtactor.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: exampleTwitter <in> [<in>...] <out>"); System.exit(2);// w ww . ja v a2 s.com } Job job = new Job(conf, "Extract Shouting Words"); job.setJarByClass(ShoutingExtactor.class); job.setMapperClass(MapReducers.ShoutingWordsMapper.class); job.setCombinerClass(MapReducers.CounterReducer.class); job.setReducerClass(MapReducers.CounterReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:nl.utwente.bigdata.TemplateTool.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf); job.setJarByClass(TemplateTool.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else {//from ww w .ja v a 2 s . c om System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files // job.setNumReduceTasks(100); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { 
job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }
From source file:nl.utwente.bigdata.WordCount.java
License:Apache License
/**
 * Entry point of the "word count" MapReduce job.
 *
 * <p>{@code args[0]} is added as the input path and {@code args[1]} is set as the
 * output path. The JVM exits with status 0 when the job succeeds and 1 otherwise.
 *
 * @param args command-line arguments: input path followed by output path
 * @throws Exception propagated from job configuration or execution
 */
public static void main(String[] args) throws Exception {
    final Job wordCountJob = Job.getInstance(new Configuration(), "word count");
    wordCountJob.setJarByClass(WordCount.class);

    // Processing classes: the summing reducer also acts as the combiner.
    wordCountJob.setMapperClass(CountMapper.class);
    wordCountJob.setCombinerClass(IntSumReducer.class);
    wordCountJob.setReducerClass(IntSumReducer.class);

    // Types of the emitted key/value pairs.
    wordCountJob.setOutputKeyClass(Text.class);
    wordCountJob.setOutputValueClass(IntWritable.class);

    // Input and output locations come straight from the command line.
    FileInputFormat.addInputPath(wordCountJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(wordCountJob, new Path(args[1]));

    final boolean succeeded = wordCountJob.waitForCompletion(true);
    final int exitStatus = succeeded ? 0 : 1;
    System.exit(exitStatus);
}
From source file:nl.utwente.mirex.AnchorExtract.java
License:Open Source License
/** * Runs the MapReduce job "anchor text extraction" * @param args 0: path to web collection on HDFS; 1: (non-existing) path that will contain anchor texts * @usage. //from w w w . jav a 2 s .c o m * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/*/ /user/hadoop/ClueWeb09_Anchors </code> */ public static void main(String[] args) throws Exception { // Set job configuration Configuration conf = new Configuration(); conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout Job job = new Job(conf, "AnchorExtract"); job.setJarByClass(AnchorExtract.class); if (args.length != 2) { System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName()); System.out.println(" inputFiles: path to data"); System.out.println(" outputFile: directory where anchor text is stored"); System.exit(1); } int argc = 0; String inputFiles = args[argc++]; String outputFile = args[argc++]; job.setMapperClass(Map.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setCombinerClass(Combine.class); job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(WarcFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(inputFiles)); // '(conf, args[0])' to accept comma-separated list. FileOutputFormat.setOutputPath(job, new Path(outputFile)); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); job.waitForCompletion(true); }
From source file:nl.utwente.mirex.QueryTermCount.java
License:Open Source License
/** * Configure the Hadoop job// w w w .j av a 2 s .co m * @throws IOException */ public static Job configureJob(String jobName, String format, Path inputFile, Path tempOut, Path topicFile) throws IOException, InvalidParameterException { // Set job configuration Job job = new Job(); job.setJobName(jobName); job.setJarByClass(QueryTermCount.class); // Set intermediate output (override defaults) job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); // Set output (override defaults) job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // Set map-reduce classes job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); // Set input-output format if (format.equals("KEYVAL")) { job.setInputFormatClass(KeyValueTextInputFormat.class); } else if (format.equals("WARC")) { job.setInputFormatClass(WarcTextConverterInputFormat.class); } else { throw new InvalidParameterException("inputFormat must bei either WARC or KEYVAL"); } job.setOutputFormatClass(TextOutputFormat.class); // also works withoput //conf.set("mapred.output.compress", false); job.setNumReduceTasks(1); // Set input-output paths FileInputFormat.setInputPaths(job, inputFile); FileOutputFormat.setOutputPath(job, tempOut); // Set job specific distributed cache file (query file) DistributedCache.addCacheFile(topicFile.toUri(), job.getConfiguration()); return job; }
From source file:nl.utwente.mirex.TrecRun.java
License:Open Source License
/** * Runs the MapReduce job "trec run"/* w ww. ja v a 2s . c om*/ * @param args 0: path to parsed document collection (use AnchorExtract); 1: (non-existing) path that will contain run results; 2: TREC query file * @usage. see README.html */ public static void main(String[] args) throws Exception { if (args.length != 3 && args.length != 4) { System.out.printf("Usage: %s [inputFormat] inputFiles outputFile topicFile \n", TrecRun.class.getSimpleName()); System.out.println(" inputFormat: either WARC or KEYVAL; default WARC"); System.out.println(" inputFiles: the WARC files"); System.out.println(" outputFiles: output directory"); System.out.println(" topicFile: topic descriptions (one query per line)"); System.exit(1); } int argc = 0; String inputFormat = "WARC"; if (args.length > 3) { inputFormat = args[argc++].toUpperCase(); } String inputFiles = args[argc++]; String outputFile = args[argc++]; String topicFile = args[argc++]; // Set job configuration Job job = new Job(); job.setJobName("MirexTrecRun"); job.setJarByClass(TrecRun.class); // Set intermediate output (override defaults) job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // Set output (override defaults) job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Set map-reduce classes job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); // Set input-output format if (inputFormat.equals("KEYVAL")) { job.setInputFormatClass(KeyValueTextInputFormat.class); } else if (inputFormat.equals("WARC")) { job.setInputFormatClass(WarcTextConverterInputFormat.class); } else { throw new InvalidParameterException("inputFormat must be either WARC or KEYVAL"); } job.setOutputFormatClass(TextOutputFormat.class); // Set input-output paths FileInputFormat.setInputPaths(job, new Path(inputFiles)); FileOutputFormat.setOutputPath(job, new Path(outputFile)); // Set job specific distributed cache file (query file) 
DistributedCache.addCacheFile(new Path(topicFile).toUri(), job.getConfiguration()); // Run the job job.waitForCompletion(true); }
From source file:nl.utwente.mirex.TrecRunBaselines.java
License:Open Source License
/** * Runs the MapReduce job "trec baseline runs" * @param args 0: path to parsed document collection (use AnchorExtract); 1: (non-existing) path that will contain run resutls; 2: MIREX query file * @usage. //from w ww.j a v a 2 s . c om * <code> % hadoop jar mirex-0.2.jar nl.utwente.mirex.TrecRunBaselines /user/hadoop/ClueWeb09_Anchors/* /user/hadoop/BaselineOut /user/hadoop/wt09-topics-stats.txt </code> */ public static void main(String[] args) throws Exception { if (args.length != 3 && args.length != 4) { System.out.printf("Usage: %s [inputFormat] inputFiles topicFile outputFile\n", TrecRun.class.getSimpleName()); System.out.println(" inputFormat: either WARC or KEYVAL; default WARC"); System.exit(1); } int argc = 0; String inputFormat = "WARC"; if (args.length > 3) { inputFormat = args[argc++].toUpperCase(); } String inputFiles = args[argc++]; String outputFile = args[argc++]; String topicFile = args[argc++]; // Set job configuration Job job = new Job(); job.setJobName("MirexBaselineRuns"); job.setJarByClass(TrecRunBaselines.class); // Set intermediate output (override defaults) job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // Set output (override defaults) job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Set map-reduce classes job.setMapperClass(Map.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); // Set input-output format if (inputFormat.equals("KEYVAL")) { job.setInputFormatClass(KeyValueTextInputFormat.class); } else if (inputFormat.equals("WARC")) { job.setInputFormatClass(WarcTextConverterInputFormat.class); } else { throw new InvalidParameterException("inputFormat must bei either WARC or KEYVAL"); } job.setOutputFormatClass(TextOutputFormat.class); // Set input-output paths FileInputFormat.setInputPaths(job, new Path(inputFiles)); FileOutputFormat.setOutputPath(job, new Path(outputFile)); // Set job specific distributed cache file (query file) 
DistributedCache.addCacheFile(new Path(topicFile).toUri(), job.getConfiguration()); // Run the job job.waitForCompletion(true); }
From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDay.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf); job.setJarByClass(CarCountPerRoadPerDay.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else {/* w w w .j a v a 2 s.c o m*/ System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files job.setNumReduceTasks(1); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { 
job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }
From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDayIncreasedValidity.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf); job.setJarByClass(CarCountPerRoadPerDayIncreasedValidity.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else {//from w w w . j a va 2s .c o m System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files job.setNumReduceTasks(1); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if 
(outputFormat.equalsIgnoreCase("null")) { job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }
From source file:nl.utwente.trafficanalyzer.ReadingsPerSensor.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf); job.setJarByClass(ReadingsPerSensor.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else {/*w ww. ja va 2 s.c o m*/ System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files job.setNumReduceTasks(1); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { 
job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }