Usage examples for org.apache.hadoop.mapred.JobConf#setNumReduceTasks
public void setNumReduceTasks(int n)
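The argument sets how many reduce tasks the framework runs for the job: 0 makes the job map-only (mapper output is written straight to the output path), 1 funnels everything through a single reducer (one output file), and larger values spread the reduce work across the cluster. Before the real-world examples below, here is a minimal self-contained sketch of the call in a map-only job; NumReduceTasksDemo and its identity mapping are illustrative placeholders, not taken from any of the source files that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.IdentityMapper;

// Hypothetical demo class, not from the examples below.
public class NumReduceTasksDemo {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(NumReduceTasksDemo.class);
        conf.setJobName("num-reduce-tasks-demo");

        // IdentityMapper passes the (offset, line) pairs produced by the
        // default TextInputFormat straight through.
        conf.setMapperClass(IdentityMapper.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // args[0] = input dir, args[1] = output dir.
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // 0 reducers makes the job map-only; 1 would force a single
        // output file; higher values parallelize the reduce phase.
        conf.setNumReduceTasks(0);

        JobClient.runJob(conf);
    }
}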
From source file:com.trace.hadoop.examples.Grep.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {
        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setOutputKeyComparatorClass(LongWritable.DecreasingComparator.class); // sort by decreasing freq

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}
From source file:com.unstruct.demo.WordCount.java
License:Apache License
/**
 * The main driver for the word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting redirects (phase 0)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper0.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from Wikipedia article to (srcID, (targetID, anchor)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    conf.setNumReduceTasks(10);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(PairOfStrings.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfStrings.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from (srcID, (targetID, anchor)) to (targetID, (anchor, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task2(String inputPath, String outputPath, String redirPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Random r = new Random();
    //String tmpOutput = "tmp-" + this.getClass().getCanonicalName() + "-" + r.nextInt(10000);
    //LOG.info("intermediate folder for merge " + tmpOutput);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    try {
        DistributedCache.addCacheFile(new URI(redirPath + "/part-00000" + "#" + "redirs.dat"), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    //FileOutputFormat.setOutputPath(conf, new Path(tmpOutput));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);

    /*
    //merge
    String finalO = outputPath + "/part-00000/data";
    FileSystem.get(conf).mkdirs(new Path(outputPath + "part-00000"));
    getMergeInHdfs(tmpOutput, finalO, conf);
    FileSystem.get(conf).delete(new Path(tmpOutput), true);
    */
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Extracts CF for each found anchor.
 *
 * @param inputPath
 * @param mapPath
 * @param outputPath
 * @throws IOException
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 3)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);
    LOG.info(" - mapping: " + mapPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    String location = "map.dat";
    try {
        DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
        //DistributedCache.addCacheFile(new URI(mapPath + "/singleentitymap.data" + "#" + location), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper3.class);
    conf.setCombinerClass(MyReducer3.class);
    conf.setReducerClass(MyReducer3.class);

    JobClient.runJob(conf);
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task4(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 4)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    //FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-00000/data"));
    FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(HMapSIW.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper4.class);
    conf.setReducerClass(MyReducer4.class);

    JobClient.runJob(conf);
}
From source file:com.yolodata.tbana.hadoop.mapred.csv.CSVInputFormatTest.java
License:Open Source License
public int run(String[] args) throws Exception {
    getConf().set(CSVLineRecordReader.FORMAT_DELIMITER, "\"");
    getConf().set(CSVLineRecordReader.FORMAT_SEPARATOR, ",");
    getConf().setInt(CSVNLineInputFormat.LINES_PER_MAP, 40000);
    getConf().setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);

    JobConf jobConf = new JobConf(getConf());
    jobConf.setJarByClass(CSVTestRunner.class);
    jobConf.setNumReduceTasks(0); // zero reducers: a map-only job
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setInputFormat(CSVNLineInputFormat.class);
    jobConf.setOutputKeyClass(NullWritable.class);
    jobConf.setOutputValueClass(Text.class);

    CSVNLineInputFormat.setInputPaths(jobConf, new Path(args[0]));
    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
    return 0;
}
From source file:com.yolodata.tbana.hadoop.mapred.shuttl.TestMapper.java
License:Open Source License
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(TestConfigurations.getConfigurationWithShuttlSearch());
    jobConf.setJarByClass(ShuttlTestJob.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setReducerClass(TestReducer.class);
    jobConf.setInputFormat(ShuttlCSVInputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);

    ShuttlCSVInputFormat.addInputPath(jobConf, new Path(args[0]));
    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
    return 0;
}
From source file:com.yolodata.tbana.hadoop.mapred.splunk.inputformat.TestMapper.java
License:Open Source License
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf());
    jobConf.set(SplunkInputFormat.INPUTFORMAT_MODE, args[0]);
    jobConf.setJarByClass(SplunkTestRunner.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setReducerClass(TestReducer.class);
    jobConf.setInputFormat(SplunkInputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);

    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
    return 0;
}