Usage examples for org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass
public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
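The class passed here controls which map output keys the framework treats as equal when it groups values for a single reduce() call: keys that the grouping comparator reports as equal are handed to one reduce() invocation, even if the sort comparator orders them differently. Before the project examples below, here is a minimal illustrative sketch (not taken from any of those projects) of a grouping comparator for a hypothetical CompositeKey writable with a getNaturalKey() accessor:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical sketch: group reducer input by the natural part of a composite key.
public class NaturalKeyGroupingComparator extends WritableComparator {

    protected NaturalKeyGroupingComparator() {
        // 'true' tells WritableComparator to create key instances so the
        // object-based compare(WritableComparable, WritableComparable) can be used.
        super(CompositeKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CompositeKey left = (CompositeKey) a;
        CompositeKey right = (CompositeKey) b;
        // Only the natural (grouping) key is compared; any secondary field is
        // ignored, so all values sharing the natural key reach one reduce() call.
        return left.getNaturalKey().compareTo(right.getNaturalKey());
    }
}

A job would register it with job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class), usually alongside a matching partitioner so that keys with the same natural key also land on the same reducer.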
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase4RemoveDuplicatesUsingReduceSideJoins.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJarByClass(Phase4RemoveDuplicatesUsingReduceSideJoins.class);
    job.setJobName(Phase4RemoveDuplicatesUsingReduceSideJoins.class.getName());

    // paths
    // text files of ids to be deleted
    String textFilePath = args[0];
    // corpus with *.warc.gz
    String commaSeparatedInputFiles = args[1];
    // output
    String outputPath = args[2];

    // second input: the look-up text file
    MultipleInputs.addInputPath(job, new Path(textFilePath), TextInputFormat.class, JoinTextMapper.class);
    // first input: the data set (check comma separated availability)
    MultipleInputs.addInputPath(job, new Path(commaSeparatedInputFiles), WARCInputFormat.class,
            JoinWARCMapper.class);

    job.setPartitionerClass(SourceJoiningKeyPartitioner.class);
    job.setGroupingComparatorClass(SourceJoiningGroupingComparator.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(WARCWritable.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputFormatClass(WARCOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:demo.SsJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}
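This is the classic secondary-sort wiring: CompositeKeyComparator orders the full StockKey (natural key plus secondary field), NaturalKeyGroupingComparator groups on the natural key alone, and NaturalKeyPartitioner keeps each natural key on a single reducer. None of those helper classes appear in the snippet; the following sketch shows what the sort-comparator side could look like, assuming hypothetical getSymbol() and getTimestamp() accessors on StockKey:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical sketch of the full-key sort comparator; accessor names are assumed,
// not taken from the source project.
public class CompositeKeyComparator extends WritableComparator {

    protected CompositeKeyComparator() {
        super(StockKey.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        StockKey left = (StockKey) a;
        StockKey right = (StockKey) b;
        // Primary order: the natural key; secondary order: timestamp within each group.
        int cmp = left.getSymbol().compareTo(right.getSymbol());
        return cmp != 0 ? cmp : Long.compare(left.getTimestamp(), right.getTimestamp());
    }
}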
From source file:edu.rosehulman.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
@SuppressWarnings("deprecation")
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file:edu.umd.shrawanraina.UserLocation.java
License:Apache License
private void runJob2(String basePath, boolean useCombiner, boolean useInMapperCombiner) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(UserLocation.class.getSimpleName());
    job.setJarByClass(UserLocation.class);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(basePath))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    job.setNumReduceTasks(numPartitions);

    FileInputFormat.setInputPaths(job, new Path(basePath));
    String outputPath = basePath + "-out";
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(PairOfStringInt.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(PairOfStringInt.class);
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(MapClass2.class);
    //job.setCombinerClass(ReduceClass2.class);
    job.setReducerClass(ReduceClass2.class);

    //job.setPartitionerClass(CustomKeyPartitioner.class);
    job.setGroupingComparatorClass(CustomGroupingComparator.class);
    job.setSortComparatorClass(CustomKeyComparator.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    //return 0;
}
From source file:flink.applications.model.fraud.prepare.Projection.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Projection and grouping MR";
    job.setJobName(jobName);
    job.setJarByClass(Projection.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());
    String operation = job.getConfiguration().get("projection.operation", "project");

    if (operation.startsWith("grouping")) {
        // group by
        job.setMapperClass(Projection.ProjectionMapper.class);
        job.setReducerClass(Projection.ProjectionReducer.class);

        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

        // order by
        boolean doOrderBy = job.getConfiguration().getInt("orderBy.field", -1) >= 0;
        if (doOrderBy) {
            job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
            job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);
        }
    } else {
        // simple projection
        job.setMapperClass(Projection.SimpleProjectionMapper.class);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
From source file:hamr.core.general.job.GeneralJob.java
License:Open Source License
public static void generalization(Class<? extends AnnotedBean> abClass, Job job) {
    job.setMapperClass(GeneralMapper.class);
    job.setPartitionerClass(GeneralPartitioner.class);
    job.setMapOutputKeyClass(abClass);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(GeneralReducer.class);
    job.setGroupingComparatorClass(GeneralGroupComparator.class);
}
From source file:hk.newsRecommender.TFIDF2.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    // part0----------------------------------------------------
    Job job0 = Job.getInstance(conf, "sfitSingleNews");
    Path output0Path = new Path(hdfsUrl + "/data/recommend/tfidf0");
    HadoopUtil.delete(conf, output0Path);
    job0.setJarByClass(TFIDF.class);
    job0.setMapperClass(Mapper_Part0.class);
    // job1.setCombinerClass(Combiner_Part1.class); // combiner?
    job0.setReducerClass(Reduce_Part0.class);
    job0.setMapOutputKeyClass(Text.class);
    job0.setMapOutputValueClass(Text.class);
    job0.setOutputKeyClass(Text.class);
    job0.setOutputValueClass(Text.class);
    // job1.setNumReduceTasks(p.length);
    FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/data2.txt"));
    FileOutputFormat.setOutputPath(job0, output0Path);
    job0.waitForCompletion(true);

    // part1----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDF.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // MyPartitoner
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf0"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    // part2----------------------------------------
    Job job2 = Job.getInstance(conf, "computeTFIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDF.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    // part3----------------------------------------
    Configuration conf3 = new Configuration();
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    Job job3 = Job.getInstance(conf3, "My_tdif_part3");
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class); // MyPartitoner
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);
}
From source file:hk.newsRecommender.TFIDFClassify.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    // part1----------------------------------------------------
    Job job1 = Job.getInstance(conf, "computeTF");
    Path outputPath1 = new Path(hdfsUrl + "/data/recommend/class1/tfidf1");
    HadoopUtil.delete(conf, outputPath1);
    job1.setJarByClass(TFIDFClassify.class);
    job1.setMapperClass(Mapper_Part1.class);
    job1.setReducerClass(Reduce_Part1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(Text.class);
    job1.setPartitionerClass(MyPartitoner.class); // MyPartitoner
    FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/data3.txt"));
    FileOutputFormat.setOutputPath(job1, outputPath1);
    job1.waitForCompletion(true);

    // part2----------------------------------------
    Job job2 = Job.getInstance(conf, "computIDF");
    Path outputPath2 = new Path(hdfsUrl + "/data/recommend/class1/tfidf2");
    HadoopUtil.delete(conf, outputPath2);
    job2.setJarByClass(TFIDFClassify.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    job2.setMapperClass(Mapper_Part2.class);
    job2.setReducerClass(Reduce_Part2.class);
    FileInputFormat.setInputPaths(job2, new Path(hdfsUrl + "/data/recommend/class1/tfidf1"));
    FileOutputFormat.setOutputPath(job2, outputPath2);
    job2.waitForCompletion(true);

    // part3----------------------------------------
    Job job3 = Job.getInstance(conf, "sortByTFIDFDec");
    Path outputPath3 = new Path(hdfsUrl + "/data/recommend/class1/tfidf3");
    HadoopUtil.delete(conf, outputPath3);
    job3.setMapperClass(Mapper_Part3.class);
    job3.setReducerClass(Reduce_Part3.class);
    job3.setMapOutputKeyClass(CustomKey.class);
    job3.setMapOutputValueClass(NullWritable.class);
    job3.setOutputKeyClass(CustomKey.class);
    job3.setOutputValueClass(NullWritable.class);
    job3.setGroupingComparatorClass(CustomGroupComparator.class);
    job3.setPartitionerClass(CustomPartitioner.class); // MyPartitoner
    FileInputFormat.addInputPath(job3, new Path(hdfsUrl + "/data/recommend/class1/tfidf2"));
    FileOutputFormat.setOutputPath(job3, outputPath3);
    job3.waitForCompletion(true);

    // part4----------------------------------------
    // Job job4 = Job.getInstance(conf, "siftKeywords");
    // Path outputPath4 = new Path(hdfsUrl + "/data/recommend/class1/matrix1");
    // HadoopUtil.delete(conf, outputPath4);
    // job4.setJarByClass(TFIDF.class);
    // job4.setMapperClass(Mapper_Part4.class);
    // job4.setReducerClass(Reduce_Part4.class);
    // job4.setMapOutputKeyClass(Text.class);
    // job4.setMapOutputValueClass(Text.class);
    // job4.setOutputKeyClass(Text.class);
    // job4.setOutputValueClass(Text.class);
    // job4.setPartitionerClass(CustomPartitioner.class);
    // FileInputFormat.addInputPath(job4, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    // FileOutputFormat.setOutputPath(job4, outputPath4);
    // job4.waitForCompletion(true);

    // part5----------------------------------------
    FileSystem fsopen = FileSystem.get(conf);
    FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000"));
    Scanner scan = new Scanner(in);
    List<String> keywordList = new ArrayList<String>();
    while (scan.hasNext()) {
        keywordList.add(scan.next());
    }
    // must be set before the job is created
    conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()]));

    Job job5 = Job.getInstance(conf, "generateMatrix");
    Path outputPath5 = new Path(hdfsUrl + "/data/recommend/class1/matrix2");
    HadoopUtil.delete(conf, outputPath5);
    job5.setJarByClass(TFIDF.class);
    job5.setMapperClass(Mapper_Part5.class);
    job5.setReducerClass(Reduce_Part5.class);
    job5.setMapOutputKeyClass(Text.class);
    job5.setMapOutputValueClass(Text.class);
    job5.setOutputKeyClass(Text.class);
    job5.setOutputValueClass(NullWritable.class);
    FileInputFormat.addInputPath(job5, new Path(hdfsUrl + "/data/recommend/class1/tfidf3"));
    FileOutputFormat.setOutputPath(job5, outputPath5);
    job5.waitForCompletion(true);
}
From source file:io.apigee.lembos.mapreduce.LembosMapReduceRunner.java
License:Apache License
/**
 * Returns a properly configured, ready to run Hadoop {@link Job}.
 *
 * @param args the command line arguments as supported by {@link GenericOptionsParser}
 *
 * @return the configured job
 *
 * @throws IOException if there is a problem creating the job
 * @throws ExecutionException if there is an issue running the Node.js module
 * @throws InterruptedException if the execution of the Node.js module gets interrupted
 * @throws NodeException if there is an issue with the Node.js module
 */
public Job initJob(final String[] args)
        throws ExecutionException, InterruptedException, IOException, NodeException {
    final GenericOptionsParser gop = new GenericOptionsParser(args);

    // If ran from ToolRunner, conf should already be set but if not, set it manually
    if (conf == null) {
        setConf(gop.getConfiguration());
    }

    // Load the Hadoop FS URL handler
    RunnerUtils.loadFsUrlStreamHandler(getConf());

    // Persist the non-Runner CLI arguments
    conf.setStrings(LembosConstants.MR_MODULE_ARGS, gop.getRemainingArgs());

    // Package the Node.js module and prepare it to be submitted with the Job
    RunnerUtils.prepareModuleForJob(conf);

    // Add "-libjars" to the current ClassLoader if necessary
    RunnerUtils.addLibJarsToClassLoader(conf);

    // Create Node.js environment for local use
    mrEnv = LembosMapReduceEnvironment.fromConf(conf);

    if (JavaScriptUtils.isDefined(mrEnv.getConfiguration())) {
        for (final Map.Entry<Object, Object> propertyEntry : mrEnv.getConfiguration().entrySet()) {
            final String key = propertyEntry.getKey().toString();
            final Writable value = ConversionUtils.jsToWritable(propertyEntry.getValue(), mrEnv.getModule());

            // Do not set these as we'll be setting them later from values we were passed from the CLI
            if (key.equals(LembosConstants.MR_MODULE_NAME)) {
                continue;
            }

            if (value instanceof BooleanWritable) {
                conf.setBoolean(key, ((BooleanWritable) value).get());
            } else if (value instanceof DoubleWritable || value instanceof FloatWritable) {
                conf.setFloat(key, Float.valueOf(value.toString()));
            } else if (value instanceof IntWritable) {
                conf.setInt(key, ((IntWritable) value).get());
            } else if (value instanceof LongWritable) {
                conf.setLong(key, ((LongWritable) value).get());
            } else if (value instanceof Text) {
                conf.set(key, value.toString());
            } else {
                System.err.println("Cannot convert JavaScript (" + value.getClass().getName()
                        + ") to Configuration, using String");
                conf.set(key, value.toString());
            }
        }
    }

    // Create Job
    final String jobName = "LembosMapReduceJob-" + mrEnv.getModuleName();
    final Job job = new Job(conf, jobName);

    jobWrapper = JobWrap.getInstance(mrEnv.getRuntime(), job);

    if (JavaScriptUtils.isDefined(mrEnv.getJobSetupFunction())) {
        mrEnv.callFunctionSync(mrEnv.getJobSetupFunction(), new Object[] { jobWrapper });
    }

    // Always set the mapper
    job.setMapperClass(LembosMapper.class);

    // Conditionally set the combiner
    if (JavaScriptUtils.isDefined(mrEnv.getCombineFunction())) {
        job.setCombinerClass(LembosCombiner.class);
    }

    // Conditionally set the group comparator
    if (JavaScriptUtils.isDefined(mrEnv.getGroupFunction())) {
        job.setGroupingComparatorClass(LembosGroupComparator.class);
    }

    // Conditionally set the partitioner
    if (JavaScriptUtils.isDefined(mrEnv.getPartitionFunction())) {
        job.setPartitionerClass(LembosPartitioner.class);
    }

    // Conditionally set the reducer
    if (JavaScriptUtils.isDefined(mrEnv.getReduceFunction())) {
        job.setReducerClass(LembosReducer.class);
    } else {
        job.setNumReduceTasks(0);
    }

    // Conditionally set the sort comparator
    if (JavaScriptUtils.isDefined(mrEnv.getSortFunction())) {
        job.setSortComparatorClass(LembosSortComparator.class);
    }

    // This could potentially be unsafe but for testing, we need to set this based on the path to the built JAR
    if (job.getJar() == null) {
        job.setJarByClass(LembosMapReduceRunner.class);
    }

    // MapReduce configuration reference:
    //
    // http://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // org.apache.hadoop.mapreduce.MRConfig
    // org.apache.hadoop.mapreduce.MRJobConfig

    return job;
}
From source file:it.crs4.seal.demux.Demux.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();
    DemuxOptionParser parser = new DemuxOptionParser();
    parser.parse(conf, args);

    conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
    conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
    if (parser.getNoIndexReads())
        LOG.info("Not expecting to find any index reads. Will demultiplex based only on lane.");

    // load sample sheet to fail early in case of problems
    DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeSampleSheet(parser.getSampleSheetPath());

    // Create a Job using the processed conf
    Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));
    job.setJarByClass(Demux.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(SequenceIdLocationPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);
    job.setSortComparatorClass(TwoOneThreeSortComparator.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    // output
    job.setOutputFormatClass(DemuxOutputFormat.class);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);

    if (result) {
        LOG.info("done");
        if (parser.getCreateLaneContent())
            createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}