Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
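
A combiner must itself be a Reducer whose input and output key/value types match the map output types, because the framework may apply it zero or more times to map output before the shuffle; the method also throws IllegalStateException if it is called after the job has been submitted. As a minimal sketch (the SumReducer name is illustrative, not taken from the examples below), a sum-style reducer of the following shape can safely be reused as a combiner:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Illustrative combiner: input and output types are identical (Text, IntWritable),
// and summing is associative and commutative, so running this zero, one, or many
// times between map and reduce does not change the final result.
public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

A job would then register it with job.setCombinerClass(SumReducer.class), exactly as the examples below do with their own reducer classes.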

Usage

From source file:eu.scape_project.tb.wc.archd.mapreduce.TikaCharacterisation.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    System.out.println(conf.get("mapreduce.job.user.classpath.first"));

    for (int i = 0; i < args.length; i++) {
        System.out.println("Arg" + i + ": " + args[i]);
    }

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(TikaCharacterisation.class);
    job.setJobName(name);

    //*** Set interface data types
    // We use LongWritable because the count can become very large on huge archives.
    // To reuse the reducer as a combiner, the map output value type must also be LongWritable.
    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    //*** Set up the mapper, combiner and reducer
    job.setMapperClass(TikaMap.class);
    job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);

    //*** Set the MAP output compression
    //job.getConfiguration().set("mapred.compress.map.output", "true");

    //*** Set input / output format
    job.setInputFormatClass(ArcInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    //*** Start the job and wait for it
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}

From source file:Example.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length != 2) {
        System.err.println("Usage: wordcount <HDFS input file> <HDFS output file>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setNumReduceTasks(2);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.FixMate.java

License:Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("fixmate :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("fixmate :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue(
            stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "fixmate");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort)
        Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        if (globalSort)
            Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(FixMate.class);
        job.setMapperClass(FixMateMapper.class);
        job.setReducerClass(FixMateReducer.class);

        if (!parser.getBoolean(noCombinerOpt))
            job.setCombinerClass(FixMateReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(AnySAMInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        if (globalSort) {
            job.setPartitionerClass(TotalOrderPartitioner.class);

            System.out.println("fixmate :: Sampling...");
            t.start();

            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                            Math.max(100, reduceTasks)));

            System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
        }

        job.submit();

        System.out.println("fixmate :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("fixmate :: Job failed.");
            return 4;
        }

        System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("fixmate :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}

From source file:fire.util.fileformats.combinetextfileinputformat.MultiFileWordCount.java

License:Apache License

public int run(String[] args) throws Exception {

    if (args.length < 2) {
        printUsage();
        return 2;
    }

    Job job = new Job(getConf());
    job.setJobName("MultiFileWordCount");
    job.setJarByClass(MultiFileWordCount.class);

    // Set the InputFormat of the job to our combine-file InputFormat
    job.setInputFormatClass(CombineFileTextInputFormat.class);

    // The keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // The values are counts (ints)
    job.setOutputValueClass(IntWritable.class);

    // Use the defined mapper
    job.setMapperClass(MapClass.class);
    // Use the word-count reducer as both combiner and reducer
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:FlightData.Airline.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: Airline <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf, "Airline On Schedule");
    job.setJarByClass(Airline.class);
    job.setMapperClass(OnScheduleMapper.class);
    job.setCombinerClass(OnScheduleCombiner.class);
    job.setReducerClass(OnScheduleReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:FlightData.FlightCancelled.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: Flight Cancalled <in> [<in>...] <out>");
        System.exit(2);/*from w w  w. j  av  a 2 s  .c  o m*/
    }
    Job job = Job.getInstance(conf, "Flight Cancalled");
    job.setJarByClass(FlightCancelled.class);
    job.setMapperClass(FlightCancelledMapper.class);
    job.setCombinerClass(FlightCancelledReducer.class);
    job.setReducerClass(FlightCancelledReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:FlightData.Taxi.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: Taxi <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf, "Taxi Time");
    job.setJarByClass(Taxi.class);
    job.setMapperClass(TaxiMapper.class);
    job.setCombinerClass(TaxiCombiner.class);
    job.setReducerClass(TaxiReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:flink.applications.model.fraud.prepare.MarkovStateTransitionModel.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Markov state transition model";
    job.setJobName(jobName);

    job.setJarByClass(MarkovStateTransitionModel.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "avenir");
    job.setMapperClass(StateTransitionMapper.class);
    job.setReducerClass(StateTransitionReducer.class);
    job.setCombinerClass(StateTransitionCombiner.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:fm.last.darling.ZohmgProgram.java

License:Apache License

public int start(String input) throws Exception {
    Path path = new Path(input);

    // TODO: read table/dataset from environment.
    String table = "zohmg";

    Job job = new Job();

    job.setJobName("zohmg!");
    FileInputFormat.addInputPath(job, path);

    Path output = new Path("yeah");
    FileOutputFormat.setOutputPath(job, output);

    // input
    job.setInputFormatClass(TextInputFormat.class);
    // wrapper
    job.setMapperClass(MapperWrapper.class);
    job.setMapOutputKeyClass(NSpacePoint.class);
    job.setMapOutputValueClass(IntWritable.class);
    // output
    job.setCombinerClass(ZohmgCombiner.class);
    job.setReducerClass(ZohmgReducer.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Put.class);
    //job.set(TableOutputFormat.OUTPUT_TABLE, table);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.java

License:LGPL

/**
   * Create the Hadoop Job object for HTSeq-count.
   * @param parentConf the parent Hadoop configuration
   * @param context the task context
   * @param alignmentsData alignment data
   * @param featureAnnotationData feature annotations data
   * @param gtfFormat true if the annotation file is in GTF format
   * @param genomeDescriptionData genome description data
   * @param outData output data
   * @param genomicType genomic type
   * @param attributeId attributeId
   * @param splitAttributeValues split attribute values
   * @param stranded stranded mode
   * @param overlapMode overlap mode
   * @param removeAmbiguousCases true to remove ambiguous cases
   * @param tsamFormat true if the input alignments are in TSAM format
   * @return the configured Hadoop Job
   * @throws IOException if an error occurs while creating the job
   * @throws BadBioEntryException if an entry of the annotation file is invalid
   * @throws EoulsanException if the job creation fails
   */
  private static Job createJobHTSeqCounter(final Configuration parentConf, final TaskContext context,
          final Data alignmentsData, final Data featureAnnotationData, final boolean gtfFormat,
          final Data genomeDescriptionData, final Data outData, final String genomicType,
          final String attributeId, final boolean splitAttributeValues, final StrandUsage stranded,
          final OverlapMode overlapMode, final boolean removeAmbiguousCases, final boolean tsamFormat)
          throws IOException, BadBioEntryException, EoulsanException {

      final Configuration jobConf = new Configuration(parentConf);

      // Get input DataFile
      DataFile inputDataFile = alignmentsData.getDataFile();

      if (inputDataFile == null) {
          throw new IOException("No input file found.");
      }

      final String dataFileSource;

      if (tsamFormat) {
          dataFileSource = StringUtils.filenameWithoutExtension(inputDataFile.getSource()) + TSAM_EXTENSION;
      } else {
          dataFileSource = inputDataFile.getSource();
      }

      // Set input path
      final Path inputPath = new Path(dataFileSource);

      // Get annotation DataFile
      final DataFile annotationDataFile = featureAnnotationData.getDataFile();

      // Get output file
      final DataFile outFile = outData.getDataFile();

      // Get temporary file
      final DataFile tmpFile = new DataFile(outFile.getParent(), outFile.getBasename() + ".tmp");

      getLogger().fine("sample: " + alignmentsData.getName());
      getLogger().fine("inputPath.getName(): " + inputPath.getName());
      getLogger().fine("annotationDataFile: " + annotationDataFile.getSource());
      getLogger().fine("outFile: " + outFile.getSource());
      getLogger().fine("tmpFile: " + tmpFile.getSource());

      jobConf.set("mapred.child.java.opts", "-Xmx1024m");

      // Set counter group
      jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

      // Set Genome description path
      final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile();
      jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource());

      // Set the "stranded" parameter
      jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName());

      // Set the "overlap mode" parameter
      jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName());

      // Set the "remove ambiguous cases" parameter
      jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES, removeAmbiguousCases);

      final Path featuresIndexPath = getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

      getLogger().info("featuresIndexPath: " + featuresIndexPath);

      // Create serialized feature index
      if (!PathUtils.isFile(featuresIndexPath, jobConf)) {

          final Locker lock = createZookeeperLock(parentConf, context);

          lock.lock();

          createFeaturesIndex(context, annotationDataFile, gtfFormat, genomicType, attributeId,
                  splitAttributeValues, stranded, genomeDescDataFile, featuresIndexPath, jobConf);

          lock.unlock();
      }

      // Create the job and its name
      final Job job = Job.getInstance(jobConf,
              "Expression computation with htseq-count (" + alignmentsData.getName() + ", " + inputPath.getName()
                      + ", " + annotationDataFile.getSource() + ", " + genomicType + ", " + attributeId
                      + ", stranded: " + stranded + ", removeAmbiguousCases: " + removeAmbiguousCases + ")");

      // Set the path to the features index
      job.addCacheFile(featuresIndexPath.toUri());

      // Set the jar
      job.setJarByClass(ExpressionHadoopModule.class);

      // Set input path
      FileInputFormat.setInputPaths(job, inputPath);

      // Set input format
      job.setInputFormatClass(SAMInputFormat.class);

      // Set the mapper class
      job.setMapperClass(HTSeqCountMapper.class);

      // Set the combiner class
      job.setCombinerClass(HTSeqCountReducer.class);

      // Set the reducer class
      job.setReducerClass(HTSeqCountReducer.class);

      // Set the output format
      job.setOutputFormatClass(ExpressionOutputFormat.class);

      // Set the output key class
      job.setOutputKeyClass(Text.class);

      // Set the output value class
      job.setOutputValueClass(LongWritable.class);

      // Set output path
      FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource()));

      return job;
  }