List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
From source file:eu.scape_project.tb.wc.archd.mapreduce.TikaCharacterisation.java
License:Apache License
public int run(String[] args) throws Exception { Configuration conf = getConf(); Job job = new Job(conf); System.out.println(conf.get("mapreduce.job.user.classpath.first")); for (int i = 0; i < args.length; i++) { System.out.println("Arg" + i + ": " + args[i]); }//from w w w. j av a 2 s . c om FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJarByClass(TikaCharacterisation.class); job.setJobName(name); //*** Set interface data types // We are using LONG because this value can become very large on huge archives. // In order to use the combiner function, also the map output needs to be a LONG. //job.setMapOutputKeyClass(Text.class); //job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); //*** Set up the mapper, combiner and reducer job.setMapperClass(TikaMap.class); job.setCombinerClass(Reduce.class); job.setReducerClass(Reduce.class); //*** Set the MAP output compression //job.getConfiguration().set("mapred.compress.map.output", "true"); //*** Set input / output format job.setInputFormatClass(ArcInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); //*** Start the job and wait for it boolean success = job.waitForCompletion(true); return success ? 0 : 1; }
From source file:Example.WordCount.java
License:Apache License
/**
 * Entry point of the word-count job.
 *
 * <p>Expects exactly two arguments; otherwise a usage message is printed to
 * standard error and the JVM exits with status 2. The JVM exits with status 0
 * if the job succeeds and 1 if it fails.
 *
 * @param args {@code args[0]} is the HDFS input file, {@code args[1]} the HDFS output file
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length != 2) {
        System.err.println("Usage: wordcount <HDFS input file> <HDFS output file>");
        System.exit(2);
    }

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);

    // The summing reducer is also used as the combiner.
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setNumReduceTasks(2);
    job.setOutputValueClass(IntWritable.class);

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    boolean succeeded = job.waitForCompletion(true);
    int exitCode;
    if (succeeded) {
        exitCode = 0;
    } else {
        exitCode = 1;
    }
    System.exit(exitCode);
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.FixMate.java
License:Open Source License
@Override protected int run(CmdLineParser parser) { final List<String> args = parser.getRemainingArgs(); if (args.isEmpty()) { System.err.println("fixmate :: WORKDIR not given."); return 3; }//from w ww . j a v a 2 s . com if (args.size() == 1) { System.err.println("fixmate :: INPATH not given."); return 3; } if (!cacheAndSetProperties(parser)) return 3; final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue( stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "fixmate"); if (stringency == null) return 3; Path wrkDir = new Path(args.get(0)); final List<String> strInputs = args.subList(1, args.size()); final List<Path> inputs = new ArrayList<Path>(strInputs.size()); for (final String in : strInputs) inputs.add(new Path(in)); final Configuration conf = getConf(); // Used by Utils.getMergeableWorkFile() to name the output files. final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName(); conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName); if (stringency != null) conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString()); final boolean globalSort = parser.getBoolean(sortOpt); if (globalSort) Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname); conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0])); final Timer t = new Timer(); try { // Required for path ".", for example. 
wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir); if (globalSort) Utils.configureSampling(wrkDir, intermediateOutName, conf); final Job job = new Job(conf); job.setJarByClass(FixMate.class); job.setMapperClass(FixMateMapper.class); job.setReducerClass(FixMateReducer.class); if (!parser.getBoolean(noCombinerOpt)) job.setCombinerClass(FixMateReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(SAMRecordWritable.class); job.setInputFormatClass(AnySAMInputFormat.class); job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class); for (final Path in : inputs) FileInputFormat.addInputPath(job, in); FileOutputFormat.setOutputPath(job, wrkDir); if (globalSort) { job.setPartitionerClass(TotalOrderPartitioner.class); System.out.println("fixmate :: Sampling..."); t.start(); InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000, Math.max(100, reduceTasks))); System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms()); } job.submit(); System.out.println("fixmate :: Waiting for job completion..."); t.start(); if (!job.waitForCompletion(verbose)) { System.err.println("fixmate :: Job failed."); return 4; } System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms()); } catch (IOException e) { System.err.printf("fixmate :: Hadoop error: %s\n", e); return 4; } catch (ClassNotFoundException e) { throw new RuntimeException(e); } catch (InterruptedException e) { throw new RuntimeException(e); } if (outPath != null) try { Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate"); } catch (IOException e) { System.err.printf("fixmate :: Output merging failed: %s\n", e); return 5; } return 0; }
From source file:fire.util.fileformats.combinetextfileinputformat.MultiFileWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 2) { printUsage();/* w w w. j av a 2 s. c o m*/ return 2; } Job job = new Job(getConf()); job.setJobName("MultiFileWordCount"); job.setJarByClass(MultiFileWordCount.class); //set the InputFormat of the job to our InputFormat job.setInputFormatClass(CombineFileTextInputFormat.class); // the keys are words (strings) job.setOutputKeyClass(Text.class); // the values are counts (ints) job.setOutputValueClass(IntWritable.class); //use the defined mapper job.setMapperClass(MapClass.class); //use the WordCount Reducer job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : 1; }
From source file:FlightData.Airline.java
License:Apache License
/**
 * Entry point of the "Airline On Schedule" job.
 *
 * <p>After generic Hadoop options are stripped, every remaining argument
 * except the last is an input path and the last one is the output path. With
 * fewer than two remaining arguments a usage message is printed and the JVM
 * exits with status 2. Otherwise the JVM exits with 0 on success, 1 on failure.
 *
 * @param args raw command-line arguments
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: Airline <in> [<in>...] <out>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "Airline On Schedule");
    job.setJarByClass(Airline.class);

    job.setMapperClass(OnScheduleMapper.class);
    job.setCombinerClass(OnScheduleCombiner.class);
    job.setReducerClass(OnScheduleReducer.class);

    // Intermediate (map) records are Text/Text; final records are Text/IntWritable.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // The last argument is the output path; everything before it is input.
    final int outputIndex = otherArgs.length - 1;
    for (int idx = 0; idx < outputIndex; idx++) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[idx]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));

    boolean succeeded = job.waitForCompletion(true);
    if (succeeded) {
        System.exit(0);
    } else {
        System.exit(1);
    }
}
From source file:FlightData.FlightCancelled.java
License:Apache License
/**
 * Entry point of the "Flight Cancelled" job.
 *
 * <p>After generic Hadoop options are stripped, every remaining argument
 * except the last is an input path and the last one is the output path. With
 * fewer than two remaining arguments a usage message is printed and the JVM
 * exits with status 2. Otherwise the JVM exits with 0 on success, 1 on failure.
 *
 * @param args raw command-line arguments
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        // Spelling corrected from "Cancalled" in the user-facing message.
        System.err.println("Usage: Flight Cancelled <in> [<in>...] <out>");
        System.exit(2);
    }

    // Job name spelling corrected from "Flight Cancalled".
    Job job = Job.getInstance(conf, "Flight Cancelled");
    job.setJarByClass(FlightCancelled.class);

    // The reducer class is also used as the combiner.
    job.setMapperClass(FlightCancelledMapper.class);
    job.setCombinerClass(FlightCancelledReducer.class);
    job.setReducerClass(FlightCancelledReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // The last argument is the output path; everything before it is input.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:FlightData.Taxi.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: Taxi <in> [<in>...] <out>"); System.exit(2);// w w w.j a v a 2s. c o m } Job job = Job.getInstance(conf, "Taxi Time"); job.setJarByClass(Taxi.class); job.setMapperClass(TaxiMapper.class); job.setCombinerClass(TaxiCombiner.class); job.setReducerClass(TaxiReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:flink.applications.model.fraud.prepare.MarkovStateTransitionModel.java
License:Apache License
/**
 * Configures and runs the Markov state transition model job.
 *
 * <p>The number of reduce tasks is read from the {@code num.reducer}
 * configuration property and defaults to 1.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    // Job name spelling corrected from "Markov tate transition model".
    String jobName = "Markov state transition model";
    job.setJobName(jobName);

    job.setJarByClass(MarkovStateTransitionModel.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "avenir");

    job.setMapperClass(StateTransitionMapper.class);
    job.setReducerClass(StateTransitionReducer.class);
    job.setCombinerClass(StateTransitionCombiner.class);

    // Intermediate records are Tuple/IntWritable; final records are NullWritable/Text.
    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:fm.last.darling.ZohmgProgram.java
License:Apache License
public int start(String input) throws Exception { Path path = new Path(input); // TODO: read table/dataset from environment. String table = "zohmg"; Job job = new Job(); job.setJobName("zohmg!"); FileInputFormat.addInputPath(job, path); Path output = new Path("yeah"); FileOutputFormat.setOutputPath(job, output); // input//from ww w . j a v a2 s . co m job.setInputFormatClass(TextInputFormat.class); // wrapper job.setMapperClass(MapperWrapper.class); job.setMapOutputKeyClass(NSpacePoint.class); job.setMapOutputValueClass(IntWritable.class); // output job.setCombinerClass(ZohmgCombiner.class); job.setReducerClass(ZohmgReducer.class); job.setOutputFormatClass(TableOutputFormat.class); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(Put.class); //job.set(TableOutputFormat.OUTPUT_TABLE, table); return job.waitForCompletion(true) ? 0 : 1; }
From source file:fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.java
License:LGPL
/** * Create JobConf object for HTSeq-count. * @param context the task context/*ww w . j a va2s . co m*/ * @param alignmentsData alignment data * @param featureAnnotationData feature annotations data * @param gtfFormat true if the annotation file is in GTF format * @param genomeDescriptionData genome description data * @param genomicType genomic type * @param attributeId attributeId * @param splitAttributeValues split attribute values * @param stranded stranded mode * @param overlapMode overlap mode * @param removeAmbiguousCases true to remove ambiguous cases * @throws IOException if an error occurs while creating job * @throws BadBioEntryException if an entry of the annotation file is invalid * @throws EoulsanException if the job creating fails */ private static Job createJobHTSeqCounter(final Configuration parentConf, final TaskContext context, final Data alignmentsData, final Data featureAnnotationData, final boolean gtfFormat, final Data genomeDescriptionData, final Data outData, final String genomicType, final String attributeId, final boolean splitAttributeValues, final StrandUsage stranded, final OverlapMode overlapMode, final boolean removeAmbiguousCases, final boolean tsamFormat) throws IOException, BadBioEntryException, EoulsanException { final Configuration jobConf = new Configuration(parentConf); // Get input DataFile DataFile inputDataFile = alignmentsData.getDataFile(); if (inputDataFile == null) { throw new IOException("No input file found."); } final String dataFileSource; if (tsamFormat) { dataFileSource = StringUtils.filenameWithoutExtension(inputDataFile.getSource()) + TSAM_EXTENSION; } else { dataFileSource = inputDataFile.getSource(); } // Set input path final Path inputPath = new Path(dataFileSource); // Get annotation DataFile final DataFile annotationDataFile = featureAnnotationData.getDataFile(); // Get output file final DataFile outFile = outData.getDataFile(); // Get temporary file final DataFile tmpFile = new 
DataFile(outFile.getParent(), outFile.getBasename() + ".tmp"); getLogger().fine("sample: " + alignmentsData.getName()); getLogger().fine("inputPath.getName(): " + inputPath.getName()); getLogger().fine("annotationDataFile: " + annotationDataFile.getSource()); getLogger().fine("outFile: " + outFile.getSource()); getLogger().fine("tmpFile: " + tmpFile.getSource()); jobConf.set("mapred.child.java.opts", "-Xmx1024m"); // Set counter group jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP); // Set Genome description path final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile(); jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource()); // Set the "stranded" parameter jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName()); // Set the "overlap mode" parameter jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName()); // Set the "remove ambiguous cases" parameter jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES, removeAmbiguousCases); final Path featuresIndexPath = getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile()); getLogger().info("featuresIndexPath: " + featuresIndexPath); // Create serialized feature index if (!PathUtils.isFile(featuresIndexPath, jobConf)) { final Locker lock = createZookeeperLock(parentConf, context); lock.lock(); createFeaturesIndex(context, annotationDataFile, gtfFormat, genomicType, attributeId, splitAttributeValues, stranded, genomeDescDataFile, featuresIndexPath, jobConf); lock.unlock(); } // Create the job and its name final Job job = Job.getInstance(jobConf, "Expression computation with htseq-count (" + alignmentsData.getName() + ", " + inputPath.getName() + ", " + annotationDataFile.getSource() + ", " + genomicType + ", " + attributeId + ", stranded: " + stranded + ", removeAmbiguousCases: " + removeAmbiguousCases + ")"); // Set the path to the features index job.addCacheFile(featuresIndexPath.toUri()); // Set the jar 
job.setJarByClass(ExpressionHadoopModule.class); // Set input path FileInputFormat.setInputPaths(job, inputPath); // Set input format job.setInputFormatClass(SAMInputFormat.class); // Set the mapper class job.setMapperClass(HTSeqCountMapper.class); // Set the combiner class job.setCombinerClass(HTSeqCountReducer.class); // Set the reducer class job.setReducerClass(HTSeqCountReducer.class); // Set the output format job.setOutputFormatClass(ExpressionOutputFormat.class); // Set the output key class job.setOutputKeyClass(Text.class); // Set the output value class job.setOutputValueClass(LongWritable.class); // Set output path FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource())); return job; }