List of usage examples for org.apache.hadoop.mapreduce.Job.setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
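setCombinerClass registers a Reducer that Hadoop may run on the map-side output before the shuffle, so its input and output key/value types must match the map output types, and its logic must tolerate being applied zero or more times (e.g. summing partial counts). It throws IllegalStateException if the job has already been submitted. A minimal sketch of the typical word-count-style setup (MyDriver, MyMapper and IntSumReducer are illustrative names, echoing the examples below, not a specific project's classes):

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "combiner example");
job.setJarByClass(MyDriver.class);            // hypothetical driver class
job.setMapperClass(MyMapper.class);           // emits <Text, IntWritable> pairs
job.setCombinerClass(IntSumReducer.class);    // local map-side aggregation; may run zero or more times
job.setReducerClass(IntSumReducer.class);     // final aggregation
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

Reusing the reducer as the combiner, as above, only works when the reducer's input and output types are identical and its operation is commutative and associative.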
From source file:io.fluo.stress.trie.Init.java
License:Apache License
private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Init.class);
    job.setJobName(Init.class.getName() + "_load");

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
    job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));

    job.setMapperClass(InitMapper.class);
    job.setCombinerClass(InitCombiner.class);
    job.setReducerClass(InitReducer.class);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    job.setPartitionerClass(RangePartitioner.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Connector conn = AccumuloUtil.getConnector(props);

    Path splitsPath = new Path(tmp, "splits.txt");
    Collection<Text> splits1 = writeSplits(props, fs, conn, splitsPath);
    RangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits1.size() + 1);

    Path outPath = new Path(tmp, "out");
    AccumuloFileOutputFormat.setOutputPath(job, outPath);

    boolean success = job.waitForCompletion(true);

    if (success) {
        Path failPath = new Path(tmp, "failures");
        fs.mkdirs(failPath);
        conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(),
                failPath.toString(), false);
    }
    return success ? 0 : 1;
}
From source file:io.gzinga.hadoop.TestSplittableGZipCodec.java
License:Apache License
@Test
public void testSplittableGZipCodec() {
    try {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(new Path("target/test"));
        GZipOutputStreamRandomAccess gzip = new GZipOutputStreamRandomAccess(
                fs.create(new Path("target/test/testfile1.gz")));
        String str = "This is line\n";
        for (int i = 1; i <= 10000; i++) {
            gzip.write(str.getBytes());
            if (i % 100 == 0) {
                gzip.addOffset(i / 100l);
            }
        }
        Assert.assertEquals(gzip.getOffsetMap().size(), 100);
        gzip.close();

        conf.set("mapreduce.framework.name", "local");
        conf.set("io.compression.codecs", "io.gzinga.hadoop.SplittableGZipCodec");
        conf.set("mapreduce.input.fileinputformat.split.maxsize", "20000");

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("target/test/testfile1.gz"));
        FileOutputFormat.setOutputPath(job, new Path("target/test/testfile2"));
        job.waitForCompletion(true);

        BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path("target/test/testfile2/part-r-00000"))));
        Assert.assertEquals("This\t10000", br.readLine());
        Assert.assertEquals("is\t10000", br.readLine());
        Assert.assertEquals("line\t10000", br.readLine());
        br.close();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail();
    } finally {
        FileUtil.fullyDelete(new File("target/test/testfile2"));
        FileUtil.fullyDelete(new File("target/test/testfile1.gz"));
    }
}
From source file:io.ssc.trackthetrackers.extraction.hadoop.HadoopJob.java
License:Open Source License
protected Job mapReduce(Path input, Path output, Class inputFormatClass, Class outputFormatClass,
        Class mapperClass, Class mapperKeyClass, Class mapperValueClass, Class reducerClass,
        Class reducerKeyClass, Class reducerValueClass, boolean combinable) throws IOException {

    Job job = map(input, output, inputFormatClass, outputFormatClass, mapperClass, mapperKeyClass,
            mapperValueClass);

    job.setReducerClass(reducerClass);
    job.setOutputKeyClass(reducerKeyClass);
    job.setOutputValueClass(reducerValueClass);

    if (combinable) {
        job.setCombinerClass(reducerClass);
    }

    return job;
}
From source file:it.crs4.seal.recab.RecabTable.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    RecabTableOptionParser parser = new RecabTableOptionParser();
    parser.parse(getConf(), args);
    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeVariantsFile(parser);

    // Create a Job using the processed conf
    Job job = new Job(getConf(), "RecabTable " + parser.getInputPaths().get(0));

    job.setJarByClass(RecabTable.class);

    job.setInputFormatClass(FormatNameMap
            .getInputFormat(job.getConfiguration().get(RecabTableOptionParser.INPUT_FORMAT_CONF, "sam")));
    LOG.info("Using input format " + job.getInputFormatClass().getName());

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObservationCount.class);

    job.setCombinerClass(Combiner.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Count the frequencies of items
 *
 * @param params
 * @param conf
 */
public static void startParallelCounting(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input;
    Integer enableDiscretization = new Integer(params.get(ENABLE_DISCRETIZATION));
    if (enableDiscretization.compareTo(new Integer(1)) == 0) {
        input = new Path(params.get(OUTPUT), DISC);
    } else {
        input = new Path(params.get(INPUT));
    }

    Job job = new Job(conf, "Parallel Counting driver running over input: " + input);
    job.setJarByClass(ARM.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), ITEM_FREQ);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Run the Parallel FPGrowth Map/Reduce job to calculate the Top K features
 * of group dependent shards
 */
public static void startParallelFPGrowth(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input;
    Integer enableDiscretization = new Integer(params.get(ENABLE_DISCRETIZATION));
    if (enableDiscretization.compareTo(new Integer(1)) == 0) {
        input = new Path(params.get(OUTPUT), DISC);
    } else {
        input = new Path(params.get(INPUT));
    }

    Job job = new Job(conf, "PFP Growth driver running over input" + input);
    job.setJarByClass(ARM.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(TransactionTree.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelFPGrowthMapper.class);
    job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPGrowthReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:ivory.core.preprocess.ComputeGlobalTermStatistics.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = 10;

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't existing: skipping!");
        return 0;
    }

    if (!fs.exists(new Path(termDocVectorsPath))) {
        LOG.info("term doc vectors path doesn't existing: skipping!");
        return 0;
    }

    LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exist: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), ComputeGlobalTermStatistics.class.getSimpleName() + ":" + collectionName);

    job.setJarByClass(ComputeGlobalTermStatistics.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out number of postings. NOTE: this value is not the same as
    // number of postings, because postings for non-English terms are
    // discarded, or as result of df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());

    return 0;
}
From source file:ivory.preprocess.GetTermCount2.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't existing: skipping!");
        return 0;
    }

    LOG.info("PowerTool: GetTermCount2");
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exist: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), "GetTermCount2:" + collectionName);

    job.setJarByClass(GetTermCount2.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out number of postings. NOTE: this value is not the same as
    // number of postings, because postings for non-English terms are
    // discarded, or as result of df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());

    return 0;
}
From source file:javaapplication1.Object1.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:jdamasceno.hadoop.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost/tweets.tweets");
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/tweets.count");
    MongoConfigUtil.setSplitSize(conf, 4);
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count");

    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}