List of usage examples for org.apache.hadoop.mapreduce.Job.setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
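setNumReduceTasks sets the number of reduce tasks for the job and must be called before the job is submitted; calling it on a submitted job throws IllegalStateException. Passing 0 makes the job map-only. Before the per-project examples below, here is a minimal hedged sketch of the call in a driver class (NumReduceTasksDemo is an invented name, and the identity Mapper/Reducer stand in for real classes; this is an illustration, not taken from the examples that follow):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class NumReduceTasksDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "num-reduce-tasks-demo");
        job.setJarByClass(NumReduceTasksDemo.class);

        // Identity mapper/reducer keep the sketch self-contained.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Size the reduce phase. Must be set before submission;
        // calling it on a running job throws IllegalStateException.
        //   0 -> map-only job, no shuffle, one output file per mapper
        //   1 -> single reducer, one globally sorted output file
        //   N -> N parallel reducers, N output files
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input dir
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The examples below show all three common settings in real projects: a configurable count, a single reducer for globally sorted output, and zero reducers for map-only jobs.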
From source file:com.pagerankcalculator.TwitterPageRank.java
public int calculatePagerank(String in, String out, int iteration)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#2 Iteration-" + iteration + " Calculating Page Rank");
    job.setJarByClass(TwitterPageRank.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(PageRankCalculationMapper.class);
    job.setReducerClass(PageRankCalculationReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(TwitterPageRank.NUM_REDUCE_TASKS);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    FileSystem fs = FileSystem.newInstance(getConf());
    if (fs.exists(outputFilePath)) {
        fs.delete(outputFilePath, true);
    }

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.pagerankcalculator.TwitterPageRank.java
public int sortPagerank(String in, String out)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#3 Sorting Page Rank");
    job.setJarByClass(TwitterPageRank.class);

    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(PageRankSortingMapper.class);
    job.setReducerClass(PageRankSortingReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(1);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    job.setSortComparatorClass(DoubleSortDescComparator.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    FileSystem fs = FileSystem.newInstance(getConf());
    if (fs.exists(outputFilePath)) {
        fs.delete(outputFilePath, true);
    }

    return job.waitForCompletion(true) ? 0 : 1;
}
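Note: the single reducer here is deliberate. With setNumReduceTasks(1), all map output flows through one reduce task, so the descending DoubleWritable sort comparator yields one globally ordered result file; with multiple reducers the ordering would only hold within each partition.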
From source file:com.peer2gear.nutch.xquery.ParseResult.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("Usage: %s [generic options] (<segment> ... | -dir <segments>) <output>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Job job = new Job(getConf());
    for (int i = 0; i < args.length - 1; i++) {
        if ("-dir".equals(args[i])) {
            Path dir = new Path(args[++i]);
            FileSystem fs = dir.getFileSystem(getConf());
            FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
            Path[] segments = HadoopFSUtil.getPaths(fstats);
            for (Path segment : segments) {
                FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
            }
        } else {
            FileInputFormat.addInputPath(job, new Path(args[i], ParseData.DIR_NAME));
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(GetResultMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
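Note: with setNumReduceTasks(0) this is a map-only job. The shuffle and sort phases are skipped entirely, and each mapper's output is written directly through the configured output format.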
From source file:com.phantom.hadoop.examples.BaileyBorweinPlouffe.java
License:Apache License
/** Create and set up a job. */
private static Job createJob(String name, Configuration conf) throws IOException {
    final Job job = new Job(conf, NAME + "_" + name);
    final Configuration jobconf = job.getConfiguration();
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // setup mapper
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // setup reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // setup input
    job.setInputFormatClass(BbpInputFormat.class);

    // disable task timeout
    jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // do not use speculative execution
    jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    return job;
}
From source file:com.phantom.hadoop.examples.dancing.DistributedPentomino.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    if (args.length == 0) {
        System.out.println("Usage: pentomino <output> [-depth #] [-height #] [-width #]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    // check for passed parameters, otherwise use defaults
    int width = conf.getInt(Pentomino.WIDTH, PENT_WIDTH);
    int height = conf.getInt(Pentomino.HEIGHT, PENT_HEIGHT);
    int depth = conf.getInt(Pentomino.DEPTH, PENT_DEPTH);
    for (int i = 0; i < args.length; i++) {
        if (args[i].equalsIgnoreCase("-depth")) {
            depth = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-height")) {
            height = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-width")) {
            width = Integer.parseInt(args[++i].trim());
        }
    }
    // now set the values within conf for M/R tasks to read, this
    // will ensure values are set, preventing MAPREDUCE-4678
    conf.setInt(Pentomino.WIDTH, width);
    conf.setInt(Pentomino.HEIGHT, height);
    conf.setInt(Pentomino.DEPTH, depth);
    Class<? extends Pentomino> pentClass = conf.getClass(Pentomino.CLASS, OneSidedPentomino.class,
            Pentomino.class);
    int numMaps = conf.getInt(MRJobConfig.NUM_MAPS, DEFAULT_MAPS);
    Path output = new Path(args[0]);
    Path input = new Path(output + "_input");
    FileSystem fileSys = FileSystem.get(conf);
    try {
        Job job = new Job(conf);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        job.setJarByClass(PentMap.class);

        job.setJobName("dancingElephant");
        Pentomino pent = ReflectionUtils.newInstance(pentClass, conf);
        pent.initialize(width, height);
        long inputSize = createInputDirectory(fileSys, input, pent, depth);
        // for forcing the number of maps
        FileInputFormat.setMaxInputSplitSize(job, (inputSize / numMaps));

        // the keys are the prefix strings
        job.setOutputKeyClass(Text.class);
        // the values are puzzle solutions
        job.setOutputValueClass(Text.class);

        job.setMapperClass(PentMap.class);
        job.setReducerClass(Reducer.class);
        job.setNumReduceTasks(1);
        return (job.waitForCompletion(true) ? 0 : 1);
    } finally {
        fileSys.delete(input, true);
    }
}
From source file:com.phantom.hadoop.examples.Grep.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = getConf();
    conf.set(RegexMapper.PATTERN, args[2]);
    if (args.length == 4)
        conf.set(RegexMapper.GROUP, args[3]);

    Job grepJob = new Job(conf);

    try {
        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        grepJob.waitForCompletion(true);

        Job sortJob = new Job(conf);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormatClass(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setSortComparatorClass( // sort by decreasing freq
                LongWritable.DecreasingComparator.class);

        sortJob.waitForCompletion(true);
    } finally {
        FileSystem.get(conf).delete(tempDir, true);
    }
    return 0;
}
From source file:com.phantom.hadoop.examples.Join.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String join_reduces = conf.get(REDUCES_PER_HOST);
    if (join_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(join_reduces);
    }
    Job job = new Job(conf);
    job.setJobName("join");
    job.setJarByClass(Sort.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = TupleWritable.class;
    String op = "inner";
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-joinOp".equals(args[i])) {
                op = args[++i];
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job.setNumReduceTasks(num_reduces);

    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }

    FileOutputFormat.setOutputPath(job, new Path(otherArgs.remove(otherArgs.size() - 1)));
    List<Path> plist = new ArrayList<Path>(otherArgs.size());
    for (String s : otherArgs) {
        plist.add(new Path(s));
    }

    job.setInputFormatClass(CompositeInputFormat.class);
    job.getConfiguration().set(CompositeInputFormat.JOIN_EXPR,
            CompositeInputFormat.compose(op, inputFormatClass, plist.toArray(new Path[0])));
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
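Note: this example sizes the reduce phase from cluster capacity rather than a constant. It defaults to 90% of the cluster's maximum reduce tasks, can be scaled per task tracker through REDUCES_PER_HOST, and can be overridden explicitly with -r before the result is applied via setNumReduceTasks.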
From source file:com.phantom.hadoop.examples.QuasiMonteCarlo.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimatePi(int numMaps, long numPoints, Path tmpDir, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    // setup job conf
    job.setJobName(QuasiMonteCarlo.class.getSimpleName());
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);

    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    // setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(tmpDir) + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        // generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                    LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            System.out.println("Wrote input for Map #" + i);
        }

        // start a map/reduce job
        System.out.println("Starting Job");
        final long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Job Finished in " + duration + " seconds");

        // read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        // compute estimated value
        final BigDecimal numTotal = BigDecimal.valueOf(numMaps).multiply(BigDecimal.valueOf(numPoints));
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get())).divide(numTotal,
                RoundingMode.HALF_UP);
    } finally {
        fs.delete(tmpDir, true);
    }
}
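Note: the single reducer lets the driver read back one well-known output file, reduce-out, to collect the inside/outside counts; speculative execution is disabled because, as the source comments note, DFS does not handle multiple writers to the same file.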
From source file:com.phantom.hadoop.examples.RandomTextWriter.java
License:Apache License
/**
 * This is the main routine for launching a distributed random write job. It
 * runs 10 maps/node and each node writes 1 gig of data to a DFS file. The
 * reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        return printUsage();
    }

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

    Job job = new Job(conf);

    job.setJarByClass(RandomTextWriter.class);
    job.setJobName("random-text-writer");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
    job.setMapperClass(RandomTextMapper.class);

    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));

    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
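Note: as in the ParseResult example above, setNumReduceTasks(0) makes this a map-only job. The "// reducer NONE" comment in the source flags that no reduce phase runs; each RandomTextMapper writes its generated text straight to the output format, which is exactly what a pure data-generation job wants.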
From source file:com.phantom.hadoop.examples.RandomWriter.java
License:Apache License
/**
 * This is the main routine for launching a distributed random write job. It
 * runs 10 maps/node and each node writes 1 gig of data to a DFS file. The
 * reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.out.println("Usage: writer <out-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path outDir = new Path(args[0]);
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
    long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
        return -2;
    }
    long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
    }
    conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

    Job job = new Job(conf);

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormatClass(RandomInputFormat.class);
    job.setMapperClass(RandomMapper.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}