List of usage examples for org.apache.hadoop.mapreduce.Job#setSortComparatorClass
public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
From source file:CountJob.java
License:Apache License
public static void doJob(String param, String args[], String msgs) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration(); conf.set(TokenizerMapper.PATTERN, args[2]); FileSystem hdfs = FileSystem.get(conf); Path tempOutput1 = new Path("/data/output/temp/" + param + "1"); Path tempOutput2 = new Path("/data/output/temp/" + param + "2"); if (hdfs.exists(tempOutput1) || hdfs.exists(tempOutput2)) { hdfs.delete(tempOutput1, true);/*from ww w . j ava2s . com*/ hdfs.delete(tempOutput2, true); } Job job = new Job(conf, "word count"); job.setJarByClass(CountJob.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(LongSumReducer.class); job.setReducerClass(LongSumReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, tempOutput1); job.waitForCompletion(true); Job sortJob1 = new Job(conf); sortJob1.setJobName("grep-sort"); FileInputFormat.setInputPaths(sortJob1, tempOutput1); sortJob1.setInputFormatClass(SequenceFileInputFormat.class); sortJob1.setMapperClass(InverseMapper.class); sortJob1.setNumReduceTasks(1); // write a single file FileOutputFormat.setOutputPath(sortJob1, tempOutput2); sortJob1.setSortComparatorClass( // sort by decreasing freq LongWritable.DecreasingComparator.class); sortJob1.waitForCompletion(true); hdfs.delete(tempOutput1, true); }
From source file:Edge.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2);/*from w w w.j a v a 2 s .c o m*/ } Path tempDir = new Path("/temp/edge"); Job job = new Job(conf, "word count"); job.setJarByClass(Edge.class); job.setMapperClass(SplitMapper.class); job.setCombinerClass(DuplicateCombiner.class); //job.setSortComparatorClass(DecentComparator.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, tempDir); if (job.waitForCompletion(true)) { Job job2 = new Job(conf, "edge"); job2.setJarByClass(Edge.class); job2.setMapperClass(SwitchMapper.class); job2.setSortComparatorClass(DecentComparator.class); job2.setReducerClass(SwitchReducer.class); job2.setOutputKeyClass(Text.class); job2.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job2, tempDir); FileOutputFormat.setOutputPath(job2, new Path(otherArgs[1])); System.exit(job2.waitForCompletion(true) ? 0 : 1); } System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:DescSorter.java
License:Apache License
/**
 * Configures and runs the "AvgDelays" job with a composite map-output key,
 * a custom partitioner, and separate sort and grouping comparators.
 *
 * <p>Every remaining argument except the last is added as an input path;
 * the last one is the output path. Exits with 2 when fewer than two
 * arguments remain, otherwise with 0 or 1 depending on the job result.
 *
 * @param args generic Hadoop options followed by one or more inputs and an output
 * @throws Exception if option parsing or job submission fails
 */
public static void main(String[] args) throws Exception {
    Configuration configuration = new Configuration();
    String[] paths = new GenericOptionsParser(configuration, args).getRemainingArgs();
    if (paths.length < 2) {
        System.err.println("Usage: flights <in> <in> <out>");
        System.exit(2);
    }

    Job avgDelays = new Job(configuration, "AvgDelays");
    avgDelays.setJarByClass(DescSorter.class);

    // Map side: composite key with an integer value.
    avgDelays.setMapperClass(FlightMapper.class);
    avgDelays.setMapOutputKeyClass(CompositeKey.class);
    avgDelays.setMapOutputValueClass(IntWritable.class);

    // Shuffle: partitioning, ordering and grouping of the composite key.
    avgDelays.setPartitionerClass(CompositeKeyPartitioner.class);
    avgDelays.setSortComparatorClass(SortComparator.class);
    avgDelays.setGroupingComparatorClass(GroupingComparator.class);

    // Reduce side.
    avgDelays.setReducerClass(AvgDelayReducer.class);
    avgDelays.setOutputKeyClass(IntWritable.class);
    avgDelays.setOutputValueClass(Text.class);

    final int outputIndex = paths.length - 1;
    int inputIndex = 0;
    while (inputIndex < outputIndex) {
        FileInputFormat.addInputPath(avgDelays, new Path(paths[inputIndex]));
        inputIndex++;
    }
    FileOutputFormat.setOutputPath(avgDelays, new Path(paths[outputIndex]));

    boolean succeeded = avgDelays.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}
From source file:GenIndex.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2);/* ww w. j a va 2 s . c o m*/ } String tmpPath = "/local_scratch/wordcount/tmp"; String stopWord = "/local_scratch/wordcount/stopword"; // Job to count the words Job count_job = new Job(conf, "word count"); count_job.setJarByClass(GenIndex.class); count_job.setMapperClass(Mapper1_Count.class); count_job.setCombinerClass(Reducer1_Count.class); count_job.setReducerClass(Reducer1_Count.class); count_job.setOutputKeyClass(Text.class); count_job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(count_job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(count_job, new Path(tmpPath)); count_job.waitForCompletion(true); Job sort_job = new Job(conf, "word sort"); sort_job.setJarByClass(GenIndex.class); sort_job.setMapperClass(Mapper2_Sort.class); sort_job.setCombinerClass(Reducer2_Sort.class); sort_job.setReducerClass(Reducer2_Sort.class); sort_job.setSortComparatorClass(SortReducerByValuesKeyComparator.class); sort_job.setOutputKeyClass(IntWritable.class); sort_job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(sort_job, new Path(tmpPath)); FileOutputFormat.setOutputPath(sort_job, new Path(stopWord)); sort_job.waitForCompletion(true); // job to generate the index Job index_job = new Job(conf, "word index"); index_job.setJarByClass(GenIndex.class); index_job.setMapperClass(Mapper3_index.class); index_job.setCombinerClass(Reducer3_index.class); index_job.setReducerClass(Reducer3_index.class); index_job.setOutputKeyClass(Text.class); index_job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(index_job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(index_job, new Path(otherArgs[1])); index_job.waitForCompletion(true); System.exit(0); }
From source file:Top20AZRestaurants.java
/**
 * Builds and runs a chained job: two mappers followed by one reducer, with
 * map-output keys ordered by {@code MyDecreasingDoubleComparator}.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if the job cannot be configured or submitted
 */
@Override
public int run(String[] args) throws Exception {
    Job chainJob = new Job(getConf());
    chainJob.setSortComparatorClass(MyDecreasingDoubleComparator.class);
    chainJob.setJobName("Top20 AZ Restaurants ChainJob");
    chainJob.setJarByClass(Top20AZRestaurants.class);

    // First mapper in the chain: (LongWritable, Text) -> (Text, Text).
    ChainMapper.addMapper(chainJob, Top20Mapper.class,
            LongWritable.class, Text.class,
            Text.class, Text.class,
            new JobConf(false));

    // Second mapper in the chain: (Text, Text) -> (DoubleWritable, Text).
    ChainMapper.addMapper(chainJob, Top20MapperRedo.class,
            Text.class, Text.class,
            DoubleWritable.class, Text.class,
            new JobConf(false));

    // Reducer: (DoubleWritable, Text) -> (Text, DoubleWritable).
    ChainReducer.setReducer(chainJob, Top20ReducerRedo.class,
            DoubleWritable.class, Text.class,
            Text.class, DoubleWritable.class,
            new JobConf(false));

    FileInputFormat.setInputPaths(chainJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(chainJob, new Path(args[1]));

    if (chainJob.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
From source file:adts.PopularKeywords.java
License:Open Source License
/**
 * Runs two jobs: the first reads the query and stop-word inputs through
 * separate mappers and writes to a fixed temporary directory; the second
 * re-reads that directory and sorts with
 * {@code LongWritable.DecreasingComparator} into a single reducer.
 *
 * <p>Exits with 2 when fewer than three arguments are given, 1 if either job
 * fails, and 0 when both succeed.
 *
 * @param args {@code args[0]} queries input, {@code args[1]} stop-words input,
 *             {@code args[2]} final output path
 * @throws Exception if job configuration or submission fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        System.err.println("Usage: PopularKeywords <queries> <stopwords> <out>");
        System.exit(2);
    }

    Configuration conf = new Configuration();
    // Shared by both jobs: output of the first, input of the second.
    Path temporaryPath = new Path("/root/temporary");

    Job job = new Job(conf, "PopularKeywords");
    job.setJarByClass(PopularKeywords.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    Path queriesInputPath = new Path(args[0]);
    Path stopWordsInputPath = new Path(args[1]);
    MultipleInputs.addInputPath(job, queriesInputPath, TextInputFormat.class, Map.class);
    MultipleInputs.addInputPath(job, stopWordsInputPath, TextInputFormat.class, StopwordsMap.class);
    FileOutputFormat.setOutputPath(job, temporaryPath);

    // The sorting job has nothing valid to read if this stage failed.
    if (!job.waitForCompletion(true)) {
        System.exit(1);
    }

    Job sortingJob = new Job(conf, "PopularKeywords");
    sortingJob.setJarByClass(PopularKeywords.class);
    sortingJob.setOutputKeyClass(Text.class);
    sortingJob.setOutputValueClass(LongWritable.class);
    sortingJob.setMapperClass(ReverseMap.class);
    sortingJob.setReducerClass(ReverseReduce.class);
    sortingJob.setInputFormatClass(TextInputFormat.class);
    sortingJob.setOutputFormatClass(TextOutputFormat.class);
    sortingJob.setSortComparatorClass(LongWritable.DecreasingComparator.class);
    sortingJob.setMapOutputKeyClass(LongWritable.class);
    sortingJob.setMapOutputValueClass(Text.class);
    FileInputFormat.addInputPath(sortingJob, temporaryPath);
    FileOutputFormat.setOutputPath(sortingJob, new Path(args[2]));
    sortingJob.setNumReduceTasks(1);

    System.exit(sortingJob.waitForCompletion(true) ? 0 : 1);
}
From source file:Assignment3_P5_Top25Movies.Top25MovieRatingDriver.java
/** * @param args the command line arguments *///from w w w .java2s . c o m public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); Job job1 = Job.getInstance(conf, "Movie Rating Count"); job1.setJarByClass(Top25MovieRatingDriver.class); // the usual - get basic mapred ready job1.setMapperClass(Top25MovieRating_Mapper.class); job1.setCombinerClass(Top25MovieRating_Reducer.class); job1.setReducerClass(Top25MovieRating_Reducer.class); // this will basically out -> movieId, average rating job1.setOutputKeyClass(IntWritable.class); job1.setOutputValueClass(FloatWritable.class); FileInputFormat.addInputPath(job1, new Path(args[0])); FileOutputFormat.setOutputPath(job1, new Path(args[1])); boolean complete = job1.waitForCompletion(true); // here's where we sort Configuration conf2 = new Configuration(); Job job2 = Job.getInstance(conf2, "Movie Rating Count"); if (complete) { job2.setJarByClass(Top25MovieRatingDriver.class); // namesake fellow, take it and go types - mostly useless job2.setMapperClass(Top25MovieRating_Mapper1.class); job2.setMapOutputKeyClass(FloatWritable.class); job2.setMapOutputValueClass(IntWritable.class); // this is where we would ideally sort descendingly job2.setSortComparatorClass(Top25MovieRating_SortComparator.class); // o/p top 25, man job2.setNumReduceTasks(1); job2.setReducerClass(Top25MovieRating_Reducer1.class); job2.setOutputKeyClass(FloatWritable.class); job2.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job2, new Path(args[1])); FileOutputFormat.setOutputPath(job2, new Path(args[2])); System.exit(job2.waitForCompletion(true) ? 0 : 1); } }
From source file:avro.mr.MapReduceAvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 2) { System.err.println("Usage: AvroWordCount <input path> <output path>"); return -1; }//ww w . j av a 2s. c o m Job job = Job.getInstance(getConf()); job.setJarByClass(MapReduceAvroWordCount.class); job.setJobName("wordcount"); // We call setOutputSchema first so we can override the configuration // parameters it sets AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT))); job.setOutputValueClass(NullWritable.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(TextInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setSortComparatorClass(Text.Comparator.class); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); return 0; }
From source file:be.ugent.intec.halvade.MapReduceRunner.java
License:Open Source License
protected int runPass1RNAJob(Configuration pass1Conf, String tmpOutDir) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException { HalvadeConf.setIsPass2(pass1Conf, false); HalvadeResourceManager.setJobResources(halvadeOpts, pass1Conf, HalvadeResourceManager.RNA_SHMEM_PASS1, true, halvadeOpts.useBamInput);//from w w w. j a va 2s. c om Job pass1Job = Job.getInstance(pass1Conf, "Halvade pass 1 RNA pipeline"); pass1Job.addCacheArchive(new URI(halvadeOpts.halvadeBinaries)); pass1Job.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class); FileSystem fs = FileSystem.get(new URI(halvadeOpts.in), pass1Conf); try { if (fs.getFileStatus(new Path(halvadeOpts.in)).isDirectory()) { // add every file in directory FileStatus[] files = fs.listStatus(new Path(halvadeOpts.in)); for (FileStatus file : files) { if (!file.isDirectory()) { FileInputFormat.addInputPath(pass1Job, file.getPath()); } } } else { FileInputFormat.addInputPath(pass1Job, new Path(halvadeOpts.in)); } } catch (IOException | IllegalArgumentException e) { Logger.EXCEPTION(e); } FileSystem outFs = FileSystem.get(new URI(tmpOutDir), pass1Conf); boolean skipPass1 = false; if (outFs.exists(new Path(tmpOutDir))) { // check if genome already exists skipPass1 = outFs.exists(new Path(tmpOutDir + "/_SUCCESS")); if (skipPass1) Logger.DEBUG("pass1 genome already created, skipping pass 1"); else { Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists."); Logger.INFO("ERROR: Please remove this directory before trying again."); System.exit(-2); } } if (!skipPass1) { FileOutputFormat.setOutputPath(pass1Job, new Path(tmpOutDir)); pass1Job.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class); pass1Job.setInputFormatClass(HalvadeTextInputFormat.class); pass1Job.setMapOutputKeyClass(GenomeSJ.class); pass1Job.setMapOutputValueClass(Text.class); pass1Job.setSortComparatorClass(GenomeSJSortComparator.class); 
pass1Job.setGroupingComparatorClass(GenomeSJGroupingComparator.class); pass1Job.setNumReduceTasks(1); pass1Job.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.class); pass1Job.setOutputKeyClass(LongWritable.class); pass1Job.setOutputValueClass(Text.class); return runTimedJob(pass1Job, "Halvade pass 1 Job"); } else return 0; }
From source file:be.ugent.intec.halvade.MapReduceRunner.java
License:Open Source License
protected int runHalvadeJob(Configuration halvadeConf, String tmpOutDir, int jobType) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException { String pipeline = ""; if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) { HalvadeConf.setIsPass2(halvadeConf, true); HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false, halvadeOpts.useBamInput); pipeline = RNA_PASS2;// ww w . j a va 2 s .c o m } else if (jobType == HalvadeResourceManager.DNA) { HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false, halvadeOpts.useBamInput); pipeline = DNA; } HalvadeConf.setOutDir(halvadeConf, tmpOutDir); FileSystem outFs = FileSystem.get(new URI(tmpOutDir), halvadeConf); if (outFs.exists(new Path(tmpOutDir))) { Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists."); Logger.INFO("ERROR: Please remove this directory before trying again."); System.exit(-2); } if (halvadeOpts.useBamInput) setHeaderFile(halvadeOpts.in, halvadeConf); Job halvadeJob = Job.getInstance(halvadeConf, "Halvade" + pipeline); halvadeJob.addCacheArchive(new URI(halvadeOpts.halvadeBinaries)); halvadeJob.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class); addInputFiles(halvadeOpts.in, halvadeConf, halvadeJob); FileOutputFormat.setOutputPath(halvadeJob, new Path(tmpOutDir)); if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) { halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class); halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RnaGATKReducer.class); } else if (jobType == HalvadeResourceManager.DNA) { halvadeJob.setMapperClass(halvadeOpts.alignmentTools[halvadeOpts.aln]); halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.DnaGATKReducer.class); } halvadeJob.setMapOutputKeyClass(ChromosomeRegion.class); halvadeJob.setMapOutputValueClass(SAMRecordWritable.class); 
halvadeJob.setInputFormatClass(HalvadeTextInputFormat.class); halvadeJob.setOutputKeyClass(Text.class); if (halvadeOpts.mergeBam) { halvadeJob.setSortComparatorClass(SimpleChrRegionComparator.class); halvadeJob.setOutputValueClass(SAMRecordWritable.class); } else { halvadeJob.setPartitionerClass(ChrRgPartitioner.class); halvadeJob.setSortComparatorClass(ChrRgSortComparator.class); halvadeJob.setGroupingComparatorClass(ChrRgGroupingComparator.class); halvadeJob.setOutputValueClass(VariantContextWritable.class); } if (halvadeOpts.justAlign) halvadeJob.setNumReduceTasks(0); else if (halvadeOpts.mergeBam) { halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.BamMergeReducer.class); halvadeJob.setNumReduceTasks(1); } else halvadeJob.setNumReduceTasks(halvadeOpts.reduces); if (halvadeOpts.useBamInput) { halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.AlignedBamMapper.class); halvadeJob.setInputFormatClass(BAMInputFormat.class); } return runTimedJob(halvadeJob, "Halvade Job"); }