List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
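Before the examples, a minimal sketch of the typical wiring (a hypothetical word-count driver; the class and path names are illustrative, not taken from any of the sources below). The class passed to setCombinerClass must be a Reducer whose input and output key/value types both match the map output types; that constraint is what lets a reducer double as its own combiner when its operation is associative and commutative, as with the sum here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
    public static class TokenMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Emit (token, 1) for every whitespace-delimited token.
            for (String token : value.toString().split("\\s+")) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    // Sums counts; usable as both combiner and reducer because its input
    // types (Text, IntWritable) equal its output types.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenMapper.class);
        job.setCombinerClass(SumReducer.class); // local pre-aggregation on the map side
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}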
From source file:edu.umd.honghongie.BuildInvertedIndexCompressed.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

    LOG.info("Tool name: " + BuildInvertedIndexCompressed.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(BuildInvertedIndexCompressed.class.getSimpleName());
    job.setJarByClass(BuildInvertedIndexCompressed.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(PairOfStringLong.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfWritables.class);
    job.setOutputFormatClass(MapFileOutputFormat.class); // why mapfileoutputformat?
    // job.setOutputFormatClass(SequenceFileOutputFormat);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:edu.umd.honghongie.PairsPMI.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    // options.addOption(OptionBuilder.withArgName("num").hasArg()
    //     .withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    // int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;

    LOG.info("Tool: " + PairsPMI.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    // LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);

    // First job.
    Configuration conf1 = getConf();
    Job job1 = Job.getInstance(conf1);
    job1.setJobName(PairsPMI.class.getSimpleName());
    job1.setJarByClass(PairsPMI.class);
    job1.setNumReduceTasks(1); // ensure output goes to a single file

    // Delete the intermediate directory if it exists.
    Path dir = new Path("temp");
    FileSystem.get(getConf()).delete(dir, true);

    FileInputFormat.setInputPaths(job1, new Path(inputPath));
    FileOutputFormat.setOutputPath(job1, new Path("temp"));

    job1.setMapperClass(Map_First.class);
    job1.setCombinerClass(MyCombiner.class);
    job1.setReducerClass(Reduce_First.class);

    job1.setMapOutputKeyClass(Text.class); // map output key
    job1.setMapOutputValueClass(IntWritable.class); // map output value
    job1.setOutputKeyClass(Text.class); // reduce output key
    job1.setOutputValueClass(IntWritable.class); // reduce output value

    // ControlledJob ctrljob1 = new ControlledJob(conf);
    // ctrljob1.setJob(job1);

    long startTime1 = System.currentTimeMillis();
    job1.waitForCompletion(true);
    System.out.println("First Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");

    // Second job.
    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PairsPMI.class.getSimpleName());
    job2.setJarByClass(PairsPMI.class);
    job2.setNumReduceTasks(reduceTasks);

    // Delete the output directory if it exists.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));
    job2.addCacheFile(new URI("temp/part-r-00000"));

    job2.setMapperClass(Map_Second.class);
    job2.setCombinerClass(MyCombiner_Second.class);
    job2.setReducerClass(Reduce_Second.class);

    job2.setMapOutputKeyClass(PairOfStrings.class); // map output key
    job2.setMapOutputValueClass(FloatWritable.class); // map output value
    job2.setOutputKeyClass(PairOfStrings.class); // reduce output key
    job2.setOutputValueClass(FloatWritable.class); // reduce output value

    long startTime2 = System.currentTimeMillis();
    job2.waitForCompletion(true);
    System.out.println("Second Job Finished in " + (System.currentTimeMillis() - startTime2) / 1000.0 + " seconds");
    System.out.println("Total Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");
    System.out.println("Total number of lines: " + lines);

    return 0;
}
From source file:edu.umd.honghongie.RunPersonalizedPageRankBasic.java
License:Apache License
private ArrayListOfFloats phase1(int i, int j, String basePath, int numNodes, ArrayListOfInts sourceids,
        boolean useCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPersonalizedPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions
    // (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    // job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);
    // The reducer uses the source nodes.
    job.getConfiguration().set("SourceNode", sourceids.toString());

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapClass.class);
    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }
    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // Initialize one mass slot per source node with log(0) = -infinity.
    ArrayListOfFloats mass = new ArrayListOfFloats();
    int length = sourceids.size();
    for (int k = 0; k < length; k++) {
        mass.add(Float.NEGATIVE_INFINITY);
    }

    // Accumulate the mass contributions written by each reducer to the mass path.
    FileSystem fs = FileSystem.get(getConf());
    ArrayListOfFloatsWritable invalue = new ArrayListOfFloatsWritable();
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        invalue.readFields(fin);
        for (int k = 0; k < invalue.size(); k++) {
            mass.set(k, sumLogProbs(mass.get(k), invalue.get(k)));
        }
        fin.close();
    }

    return mass;
}
From source file:edu.umd.honghongie.StripesPMI.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    // options.addOption(OptionBuilder.withArgName("num").hasArg()
    //     .withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
    // int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;

    LOG.info("Tool: " + StripesPMI.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    // LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);

    // First job.
    Configuration conf1 = getConf();
    Job job1 = Job.getInstance(conf1);
    job1.setJobName(StripesPMI.class.getSimpleName());
    job1.setJarByClass(StripesPMI.class);
    job1.setNumReduceTasks(1);

    // Delete the intermediate directory if it exists.
    Path dir = new Path("temp");
    FileSystem.get(getConf()).delete(dir, true);

    FileInputFormat.setInputPaths(job1, new Path(inputPath));
    FileOutputFormat.setOutputPath(job1, new Path("temp"));

    job1.setMapperClass(Map_First.class);
    job1.setCombinerClass(MyCombiner.class);
    job1.setReducerClass(Reduce_First.class);

    job1.setMapOutputKeyClass(Text.class); // map output key
    job1.setMapOutputValueClass(IntWritable.class); // map output value
    job1.setOutputKeyClass(Text.class); // reduce output key
    job1.setOutputValueClass(IntWritable.class); // reduce output value

    // ControlledJob ctrljob1 = new ControlledJob(conf);
    // ctrljob1.setJob(job1);

    long startTime1 = System.currentTimeMillis();
    job1.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");

    // Second job.
    Job job2 = Job.getInstance(getConf());
    job2.setJobName(StripesPMI.class.getSimpleName());
    job2.setJarByClass(StripesPMI.class);
    job2.setNumReduceTasks(reduceTasks);

    // Delete the output directory if it exists.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));
    job2.addCacheFile(new URI("temp/part-r-00000"));

    job2.setMapperClass(Map_Second.class);
    job2.setReducerClass(Reduce_Second.class);

    job2.setMapOutputKeyClass(Text.class); // map output key
    job2.setMapOutputValueClass(HMapStIW.class); // map output value
    job2.setOutputKeyClass(PairOfStrings.class); // reduce output key
    job2.setOutputValueClass(FloatWritable.class); // reduce output value

    long startTime2 = System.currentTimeMillis();
    job2.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime2) / 1000.0 + " seconds");
    System.out.println("Total Job Finished in " + (System.currentTimeMillis() - startTime1) / 1000.0 + " seconds");
    System.out.println("Total number of lines: " + lines);

    return 0;
}
From source file:edu.umd.shrawanraina.RunPageRankBasic.java
License:Apache License
private float phase1(int i, int j, String basePath, int numNodes, boolean useCombiner,
        boolean useInMapperCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions
    // (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    // job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(useInMapperCombiner ? MapWithInMapperCombiningClass.class : MapClass.class);
    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }
    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:edu.umd.windmemory.PMIPairs.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

    LOG.info("Tool: " + PMIPairs.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIPairs.class.getSimpleName());
    job.setJarByClass(PMIPairs.class);

    // Delete the intermediate directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIPairs.class.getSimpleName());
    job2.setJarByClass(PMIPairs.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", "temp");
    // job2.getConfiguration().setInt("num", reduceTasks);
    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(PairOfStrings.class);
    job2.setMapOutputValueClass(IntWritable.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:edu.umd.windmemory.PMIStripes.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

    LOG.info("Tool: " + PMIStripes.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PMIStripes.class.getSimpleName());
    job.setJarByClass(PMIStripes.class);

    // Delete the intermediate directory if it exists already.
    Path interDir = new Path("temp");
    FileSystem.get(getConf()).delete(interDir, true);

    // job.setNumMapTasks(reduceTasks);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, interDir);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(MyFirstMapper.class);
    job.setCombinerClass(MyFirstReducer.class);
    job.setReducerClass(MyFirstReducer.class);
    job.setPartitionerClass(MyFirstPartitioner.class);

    Job job2 = Job.getInstance(getConf());
    job2.setJobName(PMIStripes.class.getSimpleName());
    job2.setJarByClass(PMIStripes.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    // job2.getConfiguration().set("path", interDir.toString());
    // job2.getConfiguration().setInt("num", reduceTasks);
    job2.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job2, new Path(inputPath));
    FileOutputFormat.setOutputPath(job2, new Path(outputPath));

    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(HMapStIW.class);
    job2.setOutputKeyClass(PairOfStrings.class);
    job2.setOutputValueClass(DoubleWritable.class);

    job2.setMapperClass(MySecondMapper.class);
    job2.setCombinerClass(MySecondCombiner.class);
    job2.setReducerClass(MySecondReducer.class);
    job2.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job2.addCacheFile(new URI("temp/part-r-00000"));
    job.waitForCompletion(true);
    job2.waitForCompletion(true);
    // FileSystem.get(getConf()).delete(interDir, true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:edu.umn.cs.spatialHadoop.visualization.SingleLevelPlot.java
License:Open Source License
/**
 * Generates a single level using a MapReduce job and returns the created job.
 * @param inFiles
 * @param outFile
 * @param plotterClass
 * @param params
 * @return
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static Job plotMapReduce(Path[] inFiles, Path outFile, Class<? extends Plotter> plotterClass,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Plotter plotter;
    try {
        plotter = plotterClass.newInstance();
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating rasterizer", e);
    }

    Job job = new Job(params, "SingleLevelPlot");
    job.setJarByClass(SingleLevelPlot.class);
    job.setJobName("SingleLevelPlot");

    // Set plotter
    Configuration conf = job.getConfiguration();
    Plotter.setPlotter(conf, plotterClass);

    // Set input file MBR
    Rectangle inputMBR = (Rectangle) params.getShape("mbr");
    Rectangle drawRect = (Rectangle) params.getShape("rect");
    if (inputMBR == null)
        inputMBR = drawRect != null ? drawRect : FileMBR.fileMBR(inFiles, params);
    OperationsParams.setShape(conf, InputMBR, inputMBR);
    if (drawRect != null)
        OperationsParams.setShape(conf, SpatialInputFormat3.InputQueryRange, drawRect);

    // Adjust width and height if aspect ratio is to be kept
    int imageWidth = conf.getInt("width", 1000);
    int imageHeight = conf.getInt("height", 1000);
    if (params.getBoolean("keepratio", true)) {
        // Adjust width and height to maintain aspect ratio
        if (inputMBR.getWidth() / inputMBR.getHeight() > (double) imageWidth / imageHeight) {
            // Fix width and change height
            imageHeight = (int) (inputMBR.getHeight() * imageWidth / inputMBR.getWidth());
            // Make divisible by two for compatibility with ffmpeg
            if (imageHeight % 2 == 1)
                imageHeight--;
            conf.setInt("height", imageHeight);
        } else {
            imageWidth = (int) (inputMBR.getWidth() * imageHeight / inputMBR.getHeight());
            conf.setInt("width", imageWidth);
        }
    }

    boolean merge = conf.getBoolean("merge", true);
    // Set input and output
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    if (conf.getBoolean("output", true)) {
        if (merge) {
            job.setOutputFormatClass(CanvasOutputFormat.class);
            conf.setClass("mapred.output.committer.class", CanvasOutputFormat.ImageWriterOld.class,
                    org.apache.hadoop.mapred.OutputCommitter.class);
        } else {
            job.setOutputFormatClass(ImageOutputFormat.class);
        }
        CanvasOutputFormat.setOutputPath(job, outFile);
    } else {
        job.setOutputFormatClass(NullOutputFormat.class);
    }

    // Set mapper and reducer based on the partitioning scheme
    String partition = conf.get("partition", "none");
    ClusterStatus clusterStatus = new JobClient(new JobConf()).getClusterStatus();
    if (partition.equalsIgnoreCase("none")) {
        LOG.info("Using no-partition plot");
        job.setMapperClass(NoPartitionPlotMap.class);
        job.setCombinerClass(NoPartitionPlotCombine.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(plotter.getCanvasClass());
        if (merge) {
            int numSplits = new SpatialInputFormat3().getSplits(job).size();
            job.setReducerClass(NoPartitionPlotReduce.class);
            // Set number of reduce tasks according to cluster status
            int maxReduce = Math.max(1, clusterStatus.getMaxReduceTasks() * 7 / 8);
            job.setNumReduceTasks(Math.max(1, Math.min(maxReduce, numSplits / maxReduce)));
        } else {
            job.setNumReduceTasks(0);
        }
    } else {
        LOG.info("Using repartition plot");
        Partitioner partitioner;
        if (partition.equals("pixel")) {
            // Special case for pixel level partitioning as it depends on the
            // visualization parameters
            partitioner = new GridPartitioner(inputMBR, imageWidth, imageHeight);
        } else if (partition.equals("grid")) {
            int numBlocks = 0;
            for (Path in : inFiles) {
                FileSystem fs = in.getFileSystem(params);
                long size = FileUtil.getPathSize(fs, in);
                long blockSize = fs.getDefaultBlockSize(in);
                numBlocks += Math.ceil(size / (double) blockSize);
            }
            int numPartitions = numBlocks * 1000;
            int gridSize = (int) Math.ceil(Math.sqrt(numPartitions));
            partitioner = new GridPartitioner(inputMBR, gridSize, gridSize);
        } else {
            // Use a standard partitioner as created by the indexer
            partitioner = Indexer.createPartitioner(inFiles, outFile, conf, partition);
        }
        Shape shape = params.getShape("shape");
        job.setMapperClass(RepartitionPlotMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setReducerClass(RepartitionPlotReduce.class);
        // Set number of reducers according to cluster size
        job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks() * 9 / 10));
        Partitioner.setPartitioner(conf, partitioner);
    }

    // Use multithreading in case the job is running locally
    conf.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors());

    // Start the job
    if (params.getBoolean("background", false)) {
        // Run in background
        job.submit();
    } else {
        job.waitForCompletion(params.getBoolean("verbose", false));
    }
    return job;
}
From source file:eu.scape_project.tb.wc.archd.hadoop.HadoopArcReaderJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser gop = new GenericOptionsParser(conf, args);
    HadoopJobCliConfig pc = new HadoopJobCliConfig();
    CommandLineParser cmdParser = new PosixParser();
    CommandLine cmd = cmdParser.parse(HadoopJobOptions.OPTIONS, gop.getRemainingArgs());
    if ((args.length == 0) || (cmd.hasOption(HadoopJobOptions.HELP_OPT))) {
        HadoopJobOptions.exit("Usage", 0);
    } else {
        HadoopJobOptions.initOptions(cmd, pc);
    }
    String dir = pc.getDirStr();

    String name = pc.getHadoopJobName();
    if (name == null || name.equals("")) {
        name = "webarc_reader"; // default job name
    }

    Job job = new Job(conf);

    // For debugging in local mode, uncomment the two lines below;
    // comment them out again before switching to pseudo-distributed or fully-distributed mode.
    // job.getConfiguration().set("mapred.job.tracker", "local");
    // job.getConfiguration().set("fs.default.name", "local");

    FileInputFormat.setInputPaths(job, new Path(dir));
    String outpath = "output/" + System.currentTimeMillis() + "wcr";
    logger.info("Output directory: " + outpath);
    FileOutputFormat.setOutputPath(job, new Path(outpath));

    job.setJarByClass(HadoopArcReaderJob.class);
    job.setJobName(name);

    // Set interface data types.
    // We use LONG because this value can become very large on huge archives.
    // In order to use the combiner function, the map output also needs to be a LONG.
    // job.setMapOutputKeyClass(Text.class);
    // job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set up the mapper, combiner, and reducer.
    job.setMapperClass(Map.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set the MAP output compression.
    // job.getConfiguration().set("mapred.compress.map.output", "true");

    // Set input / output format.
    job.setInputFormatClass(ArcInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Start the job and wait for it.
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:eu.scape_project.tb.wc.archd.mapreduce.FileCharacterisation.java
License:Apache License
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    System.out.println(getConf().get("mapreduce.job.user.classpath.first"));
    for (int i = 0; i < args.length; i++) {
        System.out.println("Arg" + i + ": " + args[i]);
    }

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(FileCharacterisation.class);
    job.setJobName(name);

    // Set interface data types.
    // We use LONG because this value can become very large on huge archives.
    // In order to use the combiner function, the map output also needs to be a LONG.
    // job.setMapOutputKeyClass(Text.class);
    // job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Set up the mapper, combiner, and reducer.
    job.setMapperClass(TikaMap.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    // Set the MAP output compression.
    // job.getConfiguration().set("mapred.compress.map.output", "true");

    // Set input / output format.
    job.setInputFormatClass(ArcInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Start the job and wait for it.
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}