List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override public int run(String[] strings) throws Exception { Configuration conf = getConf(); String dir = conf.get(LindenJobConfig.INPUT_DIR, null); logger.info("input dir:" + dir); Path inputPath = new Path(StringUtils.unEscapeString(dir)); Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR)); String indexPath = conf.get(LindenJobConfig.INDEX_PATH); FileSystem fs = FileSystem.get(conf); if (fs.exists(outputPath)) { fs.delete(outputPath, true);//from w ww .j a v a2 s. c o m } if (fs.exists(new Path(indexPath))) { fs.delete(new Path(indexPath), true); } int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1); Shard[] shards = createShards(indexPath, numShards); Shard.setIndexShards(conf, shards); //empty trash; (new Trash(conf)).expunge(); Job job = Job.getInstance(conf, "linden-hadoop-indexing"); job.setJarByClass(LindenJob.class); job.setMapperClass(LindenMapper.class); job.setCombinerClass(LindenCombiner.class); job.setReducerClass(LindenReducer.class); job.setMapOutputKeyClass(Shard.class); job.setMapOutputValueClass(IntermediateForm.class); job.setOutputKeyClass(Shard.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(IndexUpdateOutputFormat.class); job.setReduceSpeculativeExecution(false); job.setNumReduceTasks(numShards); String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL); if (lindenSchemaFile == null) { throw new IOException("no schema file is found"); } logger.info("Adding schema file: " + lindenSchemaFile); job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema")); String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL); if (lindenPropertiesFile == null) { throw new IOException("no linden properties file is found"); } logger.info("Adding linden properties file: " + lindenPropertiesFile); job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties")); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); Path[] inputs = FileInputFormat.getInputPaths(job); StringBuilder buffer = new StringBuilder(inputs[0].toString()); for (int i = 1; i < inputs.length; i++) { buffer.append(","); buffer.append(inputs[i].toString()); } logger.info("mapreduce.input.dir = " + buffer.toString()); logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString()); logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks()); logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS)); logger.info("mapreduce.input.format.class = " + job.getInputFormatClass()); logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass()); logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR)); job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed"); } return 0; }
From source file:com.yahoo.semsearch.fastlinking.utils.RunFELOntheGrid.java
License:Apache License
public int run(String[] args) throws Exception { Configuration conf = getConf(); Job job = new Job(conf); //Job job = Job.getInstance( conf ); job.setJarByClass(RunFELOntheGrid.class); // Process custom command-line options Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // Specify various job-specific parameters job.setJobName("Entity Linker"); job.setNumReduceTasks(100);//from w w w . j av a 2 s . co m job.setJarByClass(RunFELOntheGrid.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(FELMapper.class); job.setReducerClass(FELReducer.class); job.setCombinerClass(FELReducer.class); job.waitForCompletion(true); return 0; }
From source file:com.yahoo.semsearch.fastlinking.w2v.EntityEmbeddings.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); Job job = new Job(conf); //Job job = Job.getInstance( conf ); job.setJarByClass(EntityEmbeddings.class); // Process custom command-line options Path in = new Path(args[0]); Path out = new Path(args[1]); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // Specify various job-specific parameters job.setJobName("Entity embeddings"); job.setNumReduceTasks(1);//from w ww. j a va 2s. co m job.setJarByClass(EntityEmbeddings.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(EntityEMapper.class); job.setReducerClass(EntityEReducer.class); job.setCombinerClass(EntityEReducer.class); job.waitForCompletion(true); return 0; }
From source file:com.yassergonzalez.pagerank.PageRank.java
License:Apache License
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception { // This job performs an iteration of the power iteration method to // compute PageRank. The map task processes each block M_{i,j}, loads // the corresponding stripe j of the vector v_{k-1} and produces the // partial result of the stripe i of the vector v_k. The reduce task // sums all the partial results of v_k and adds the teleportation factor // (the combiner only sums all the partial results). See Section 5.2 // (and 5.2.3 in particular) of Mining of Massive Datasets // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The // output is written in a "vk" subdir of the output dir, where k is the // iteration number. MapFileOutputFormat is used to keep an array of the // stripes of v. Job job = Job.getInstance(conf, "PageRank:Iteration"); job.setJarByClass(PageRank.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(PageRankIterationMapper.class); job.setMapOutputKeyClass(ShortWritable.class); job.setMapOutputValueClass(FloatArrayWritable.class); job.setCombinerClass(PageRankIterationCombiner.class); job.setReducerClass(PageRankIterationReducer.class); job.setOutputFormatClass(MapFileOutputFormat.class); job.setOutputKeyClass(ShortWritable.class); job.setOutputValueClass(FloatArrayWritable.class); FileInputFormat.addInputPath(job, new Path(outputDir, "M")); FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter)); job.waitForCompletion(true);//from w w w . j a va 2 s . c om }
From source file:com.zjy.mongo.util.MongoTool.java
License:Apache License
private int runMapReduceJob(final Configuration conf) throws IOException { final Job job = Job.getInstance(conf, getJobName()); /**/* www. j a va 2 s . com*/ * Any arguments specified with -D <property>=<value> * on the CLI will be picked up and set here * They override any XML level values * Note that -D<space> is important - no space will * not work as it gets picked up by Java itself */ // TODO - Do we need to set job name somehow more specifically? // This may or may not be correct/sane job.setJarByClass(getClass()); final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf); if (LOG.isDebugEnabled()) { LOG.debug("Mapper Class: " + mapper); LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI)); } job.setMapperClass(mapper); Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf); if (combiner != null) { job.setCombinerClass(combiner); } job.setReducerClass(MongoConfigUtil.getReducer(conf)); job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf)); job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf)); job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf)); job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf)); Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf); Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf); if (mapOutputKeyClass != null) { job.setMapOutputKeyClass(mapOutputKeyClass); } if (mapOutputValueClass != null) { job.setMapOutputValueClass(mapOutputValueClass); } /** * Determines if the job will run verbosely e.g. print debug output * Only works with foreground jobs */ final boolean verbose = MongoConfigUtil.isJobVerbose(conf); /** * Run job in foreground aka wait for completion or background? */ final boolean background = MongoConfigUtil.isJobBackground(conf); try { if (background) { LOG.info("Setting up and running MapReduce job in background."); job.submit(); return 0; } else { LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? " + verbose + "}"); return job.waitForCompletion(true) ? 0 : 1; } } catch (final Exception e) { LOG.error("Exception while executing job... ", e); return 1; } }
From source file:counting.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); final int NUMBER_OF_NODES = 31; final int MAX_NUMBER_OF_TASKS = 1000; final double REDUCER_CONSTANT = 0.95; // or 1.75 if (otherArgs.length < 5) { System.err.println(//from w w w .j a v a2s. c o m "Usage: wordcount <in> [<in>...] <out> <ngram> <combiner:yes/no> <custom partioner:yes/no>"); System.exit(2); } Job job = Job.getInstance(conf, "Word count"); // Setting map and reduce tasks //conf.setNumMapTasks(5); // Not possible with code in line? int NUMBER_OF_REDUCERS = (int) REDUCER_CONSTANT * NUMBER_OF_NODES * MAX_NUMBER_OF_TASKS; //System.out.println("Number of Reducers: " + NUMBER_OF_REDUCERS); job.setNumReduceTasks(12); // Placeholder job.setJarByClass(WordCount.class); job.setMapperClass(nGramMapper.class); nGramMapper.setN(Integer.parseInt(otherArgs[otherArgs.length - 3])); // Set ngram length System.out.println("n = " + nGramMapper.getN()); System.out.println("Combiner = " + otherArgs[otherArgs.length - 2]); System.out.println("Custom Partitioner = " + otherArgs[otherArgs.length - 1]); System.out.println("Number of reducers = " + NUMBER_OF_NODES); if (otherArgs[otherArgs.length - 2].equals("yes")) { job.setCombinerClass(IntSumReducer.class); } if (otherArgs[otherArgs.length - 1].equals("yes")) { job.setPartitionerClass(CustomPartitioner.class); //CustomPartitioner.setNumberOfReducers(NUMBER_OF_REDUCERS); } job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // Input paths for (int i = 0; i < otherArgs.length - 4; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } // Output paths FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 4])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:cp_a.CP_A.java
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "word count"); job.setJarByClass(CP_A.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: MaxTemperatureWithCombiner <input path> " + "<output path>"); System.exit(-1);//from ww w .ja v a 2s . c o m } Job job = new Job(); job.setJarByClass(MaxTemperatureWithCombiner.class); job.setJobName("Max temperature"); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(MaxTemperatureMapper.class); /*[*/job.setCombinerClass(MaxTemperatureReducer.class)/*]*/; job.setReducerClass(MaxTemperatureReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: MaxTemperatureWithCompression <input path> " + "<output path>"); System.exit(-1);//from ww w . j a v a2 s. c o m } Job job = new Job(); job.setJarByClass(MaxTemperature.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); /*[*/FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);/*]*/ job.setMapperClass(MaxTemperatureMapper.class); job.setCombinerClass(MaxTemperatureReducer.class); job.setReducerClass(MaxTemperatureReducer.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: MaxTemperatureWithMapOutputCompression " + "<input path> <output path>"); System.exit(-1);//from w w w.ja v a2 s .c o m } // vv MaxTemperatureWithMapOutputCompression Configuration conf = new Configuration(); conf.setBoolean("mapred.compress.map.output", true); conf.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class); Job job = new Job(conf); // ^^ MaxTemperatureWithMapOutputCompression job.setJarByClass(MaxTemperature.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(MaxTemperatureMapper.class); job.setCombinerClass(MaxTemperatureReducer.class); job.setReducerClass(MaxTemperatureReducer.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }