List of usage examples for the org.apache.hadoop.mapreduce.Job constructor
Job(Configuration conf) throws IOException
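Before the individual sources, a minimal sketch of the pattern they all follow: build a Job around a Configuration, wire up the mapper/reducer classes and the input/output paths, then submit and wait for completion. MyMapper and MyReducer are hypothetical placeholders rather than classes from the examples below, and on newer Hadoop releases this constructor is deprecated in favor of Job.getInstance(conf).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ExampleDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The constructor documented on this page; Job.getInstance(conf) is preferred on newer Hadoop versions.
        Job job = new Job(conf);
        job.setJobName("example");
        job.setJarByClass(ExampleDriver.class);
        job.setMapperClass(MyMapper.class);   // hypothetical Mapper subclass
        job.setReducerClass(MyReducer.class); // hypothetical Reducer subclass
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}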
From source file:at.illecker.hama.rootbeer.examples.util.RandomGraphGenerator.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length != 4) {
        System.out.println(
                "USAGE: <Number of vertices> <Number of edges per vertex> <Number of partitions> <Outpath>");
        return;
    }
    System.out.println(Arrays.toString(args));
    Configuration conf = new Configuration();
    conf.setInt("hama.num.vertices", Integer.parseInt(args[0]));
    conf.setInt("hama.num.partitions", Integer.parseInt(args[2]));
    conf.setInt("number.edges", Integer.parseInt(args[1]));
    Job job = new Job(conf);

    Path generated = new Path(new Path(args[3]).getParent(), "generated");
    FileOutputFormat.setOutputPath(job, generated);
    FileSystem.get(conf).delete(generated, true);

    job.setJobName("RangeWriter");
    job.setJarByClass(SortGenMapper.class);
    job.setMapperClass(SortGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.waitForCompletion(true);

    conf.setInt("max.id", Integer.valueOf(args[0]));
    job = new Job(conf);

    FileOutputFormat.setOutputPath(job, new Path(args[3]));
    FileSystem.get(conf).delete(new Path(args[3]), true);

    job.setJobName("Random Vertex Writer");
    FileInputFormat.addInputPath(job, generated);
    job.setJarByClass(RandomMapper.class);
    job.setMapperClass(RandomMapper.class);
    job.setReducerClass(Reducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(conf.getInt("hama.num.partitions", 2));
    job.setPartitionerClass(HashPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);
}
From source file:averagerating_youtube.AverageRating_Youtube.java
/**
 * @param args the command line arguments
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJobName("AverageRating_Youtube");
    job.setJarByClass(AverageRating_Youtube.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AvgRating_CommCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(AverageRating_CommentCountTuple.class);

    job.setCombinerClass(AvgRating_CommCountCombiner.class);
    job.setReducerClass(AvgRating_CommCountReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AverageRating_CommentCountTuple.class);

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:bdss.cmu.edu.Sort.java
License:Apache License
/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setNumReduceTasks(num_reduces);

    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
From source file:be.uantwerpen.adrem.hadoop.util.Tools.java
License:Apache License
@SuppressWarnings("rawtypes")
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
        Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
        Class<? extends OutputFormat> outputFormat) throws IOException {
    Job job = new Job(new Configuration());
    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
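A minimal sketch of how this helper might be called. The mapper/reducer classes, key/value types, and paths below are hypothetical stand-ins, not part of the Tools source:

// Hypothetical call to the prepareJob helper above; class names and paths are placeholders.
Job job = Tools.prepareJob(
        new Path("input"), new Path("output"),
        TextInputFormat.class,
        TokenizerMapper.class,            // hypothetical Mapper subclass
        Text.class, IntWritable.class,    // map output key/value classes
        CountReducer.class,               // hypothetical Reducer subclass
        Text.class, IntWritable.class,    // final output key/value classes
        TextOutputFormat.class);
job.waitForCompletion(true);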
From source file:boa.runtime.BoaRunner.java
License:Apache License
/**
 * Create a {@link Job} describing the work to be done by this Boa job.
 *
 * @param ins
 *            An array of {@link Path} containing the locations of the input
 *            files
 *
 * @param out
 *            A {@link Path} containing the location of the output file
 *
 * @param robust
 *            A boolean representing whether the job should ignore most
 *            exceptions
 *
 * @return A {@link Job} describing the work to be done by this Boa job
 * @throws IOException
 */
public Job job(final Path[] ins, final Path out, final boolean robust) throws IOException {
    final Configuration configuration = getConf();

    configuration.setBoolean("boa.runtime.robust", robust);

    // faster local reads
    configuration.setBoolean("dfs.client.read.shortcircuit", true);
    configuration.setBoolean("dfs.client.read.shortcircuit.skip.checksum", true);

    // by default our MapFiles index every key, which takes up
    // a lot of memory - this lets you skip keys in the index and
    // control the memory requirements (as a tradeoff of slower gets)
    //configuration.setLong("io.map.index.skip", 128);

    // map output compression
    configuration.setBoolean("mapred.compress.map.output", true);
    configuration.set("mapred.map.output.compression.type", "BLOCK");
    configuration.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);

    configuration.setBoolean("mapred.map.tasks.speculative.execution", false);
    configuration.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    configuration.setLong("mapred.job.reuse.jvm.num.tasks", -1);

    final Job job = new Job(configuration);

    if (ins != null)
        for (final Path in : ins)
            FileInputFormat.addInputPath(job, in);

    FileOutputFormat.setOutputPath(job, out);

    job.setPartitionerClass(BoaPartitioner.class);

    job.setMapOutputKeyClass(EmitKey.class);
    job.setMapOutputValueClass(EmitValue.class);

    job.setOutputFormatClass(BoaOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    return job;
}
From source file:boostingPL.driver.AdaBoostPLDriver.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    int status = commandAnalysis(args);
    if (status != 0) {
        return status;
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(getConf());
    job.setJobName("AdaBoostPL:" + runModel + " " + dataPath.toString() + " " + modelPath.toString() + " "
            + numLinesPerMap + " " + numIterations);
    job.setJarByClass(AdaBoostPLDriver.class);

    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, dataPath);
    NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap);

    if (runModel.equals("train")) {
        job.setMapperClass(AdaBoostPLMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(ClassifierWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(ClassifierWritable.class);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, modelPath);
    } else {
        job.setMapperClass(AdaBoostPLTestMapper.class);
        job.setReducerClass(AdaBoostPLTestReducer.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
    }

    Configuration conf = job.getConfiguration();
    conf.set("BoostingPL.boostingName", "AdaBoost");
    conf.set("BoostingPL.numIterations", String.valueOf(numIterations));
    conf.set("BoostingPL.modelPath", modelPath.toString());
    if (metadataPath == null) {
        conf.set("BoostingPL.metadata", dataPath.toString() + ".metadata");
    } else {
        conf.set("BoostingPL.metadata", metadataPath.toString());
    }
    if (outputFolder != null) {
        conf.set("BoostingPL.outputFolder", outputFolder.toString());
    }

    LOG.info(StringUtils.arrayToString(args));
    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:boostingPL.driver.SAMMEPLDriver.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    int status = commandAnalysis(args);
    if (status != 0) {
        return status;
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(getConf());
    job.setJobName("SAMMEPL:" + runModel + " " + dataPath.toString() + " " + modelPath.toString() + " "
            + numLinesPerMap + " " + numIterations);
    job.setJarByClass(SAMMEPLDriver.class);

    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, dataPath);
    NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap);

    FileSystem fs = modelPath.getFileSystem(getConf());
    if (fs.exists(modelPath)) {
        fs.delete(modelPath, true);
    }
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, modelPath);

    if (runModel.equals("train")) {
        job.setMapperClass(AdaBoostPLMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(ClassifierWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(ClassifierWritable.class);
    } else {
        job.setMapperClass(AdaBoostPLTestMapper.class);
        job.setReducerClass(AdaBoostPLTestReducer.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
    }

    Configuration conf = job.getConfiguration();
    conf.set("BoostingPL.boostingName", "SAMME");
    conf.set("BoostingPL.numIterations", String.valueOf(numIterations));
    conf.set("BoostingPL.modelPath", modelPath.toString());
    if (metadataPath == null) {
        conf.set("BoostingPL.metadata", dataPath.toString() + ".metadata");
    } else {
        conf.set("BoostingPL.metadata", metadataPath.toString());
    }
    if (outputFolder != null) {
        conf.set("BoostingPL.outputFolder", outputFolder.toString());
    }

    LOG.info(StringUtils.arrayToString(args));
    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:cascading.flow.hadoop.MapReduceFlow.java
License:Open Source License
protected Map<String, Tap> createSources(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    if (paths.length == 0) {
        try {
            paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(new Job(jobConf));
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for (Path path : paths)
        taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

    return taps;
}
From source file:cascading.flow.hadoop.MapReduceFlow.java
License:Open Source License
protected Map<String, Tap> createSinks(JobConf jobConf) {
    Map<String, Tap> taps = new HashMap<String, Tap>();

    Path path = FileOutputFormat.getOutputPath(jobConf);

    if (path == null) {
        try {
            path = org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.getOutputPath(new Job(jobConf));
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    taps.put(path.toString(),
            new Hfs(new NullScheme(), path.toString(), deleteSinkOnInit ? SinkMode.REPLACE : SinkMode.KEEP));

    return taps;
}
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}