Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Source Link

Document

Set the combiner class for the job.

Usage

From source file:com.datasalt.utils.mapred.counter.MapRedCounter.java

License:Apache License

/**
 * Builds a MapRedCounterJob that counts the number items occurrences per each item, the number of distinct items per
 * group and the total Occurrences of each item per group. Then you can add more mappers to that class by calling
 * #addInput()/*from   w w w  . j a v a 2s. c om*/
 */
public static Job buildMapRedCounterJob(String name,
        @SuppressWarnings("rawtypes") Class<? extends OutputFormat> outputFormat, String outPath,
        Configuration conf) throws IOException {

    Job job = buildMapRedCounterJobWithoutCombiner(name, outputFormat, outPath, conf);
    job.setCombinerClass(MapRedCountCombiner.class);

    return job;
}

From source file:com.dipwater.accountAnalyze.WordCount.java

License:Apache License

/**
 * Runs the "word count" job against a hard-coded cluster (job tracker {@code 192.168.1.51:9001}, file system
 * {@code hdfs://192.168.1.51:9000}).
 * <p>
 * Command-line arguments are honoured when supplied; when none are given the built-in paths {@code input} and
 * {@code newout} are used. After generic Hadoop options are stripped, exactly two arguments must remain:
 * the input path and the output path.
 * <p>
 * The JVM is terminated with status 0 when the job succeeds, 1 when it fails and 2 on a usage error.
 *
 * @param args optional generic Hadoop options followed by {@code <in> <out>}; may be empty
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "192.168.1.51:9001");
    conf.set("fs.default.name", "hdfs://192.168.1.51:9000");

    // Use the caller's arguments when there are any; otherwise keep the historical built-in paths.
    String[] effectiveArgs = (args != null && args.length > 0) ? args : new String[] { "input", "newout" };
    String[] otherArgs = new GenericOptionsParser(conf, effectiveArgs).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");

    // The job jar is produced at launch time through EJob instead of job.setJarByClass(WordCount.class).
    // NOTE(review): presumably "bin" is the compiled-classes directory that gets packed into the jar - confirm.
    File jarFile = EJob.createTempJar("bin");
    EJob.addClasspath("/home/hadoop/hadoop-1.2.1/conf");
    ClassLoader classLoader = EJob.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);
    ((JobConf) job.getConfiguration()).setJar(jarFile.toString());

    job.setMapperClass(TokenizerMapper.class);
    // The same class is used as both combiner and reducer.
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.elephantscale.hbase.book.chapter1.SimpleMR.java

License:Apache License

/**
 * Configures and runs the "SimpleMR" job: {@code TokenizerMapper} as mapper and {@code IntSumReducer} as both
 * combiner and reducer, writing {@code Text} keys and {@code IntWritable} values.
 * <p>
 * After generic Hadoop options are stripped, exactly two arguments must remain: the input path and the output
 * path. A usage error terminates the JVM with status 2 and a failed job with status 1, so that calling scripts
 * can detect the failure; on success the method returns normally.
 *
 * @param args optional generic Hadoop options followed by {@code <in> <out>}
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: SimpleMR <in> <out>");
        // A plain return would report success (status 0) to the caller.
        System.exit(2);
    }
    Job job = new Job(conf, "SimpleMR");
    job.setJarByClass(SimpleMR.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    // Do not discard the job outcome: surface a failed job as a non-zero exit status.
    if (!job.waitForCompletion(true)) {
        System.exit(1);
    }
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

/**
 * Runs a single-reducer job over the corpus and returns the value that {@code readPerplexity} reads back for the
 * given iteration.
 * <p>
 * The job output goes to {@code perplexityPath(modelPath.getParent(), iteration)}; that path is deleted before
 * the job is started.
 *
 * @param conf       configuration the job is created from and that is used for deleting and reading the output
 * @param corpusPath input path of the job (read as sequence files)
 * @param modelPath  passed to {@code setModelPaths}; its parent directory hosts the perplexity output
 * @param iteration  iteration number used to derive the output path
 * @return the result of {@code readPerplexity} for the model's parent directory and iteration
 * @throws IOException            propagated from job setup, output deletion or reading the result
 * @throws ClassNotFoundException propagated from running the job
 * @throws InterruptedException   propagated from running the job, and also thrown when the job does not
 *                                complete successfully
 */
private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    final String title = "Calculating perplexity for " + modelPath;
    log.info("About to run: " + title);

    final Job perplexityJob = new Job(conf, title);
    perplexityJob.setJarByClass(CachingCVB0PerplexityMapper.class);

    // Processing classes: the reducer class also serves as the combiner, and one reducer collects everything.
    perplexityJob.setMapperClass(CachingCVB0PerplexityMapper.class);
    perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
    perplexityJob.setReducerClass(DualDoubleSumReducer.class);
    perplexityJob.setNumReduceTasks(1);

    // Sequence files in and out, with double keys and double values on output.
    perplexityJob.setInputFormatClass(SequenceFileInputFormat.class);
    perplexityJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    perplexityJob.setOutputKeyClass(DoubleWritable.class);
    perplexityJob.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(perplexityJob, corpusPath);
    final Path modelParent = modelPath.getParent();
    final Path perplexityOutput = perplexityPath(modelParent, iteration);
    FileOutputFormat.setOutputPath(perplexityJob, perplexityOutput);
    setModelPaths(perplexityJob, modelPath);

    // Remove the output location before the run.
    HadoopUtil.delete(conf, perplexityOutput);

    final boolean completed = perplexityJob.waitForCompletion(true);
    if (!completed) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelParent, iteration);
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

/**
 * Runs one iteration as a MapReduce job: {@code CachingCVB0Mapper} as mapper and {@code VectorSumReducer} as
 * both combiner and reducer, reading sequence files from {@code corpusInput} and writing sequence files to
 * {@code modelOutput}.
 * <p>
 * {@code modelOutput} is deleted before the job is started.
 *
 * @param conf            configuration the job is created from and that is used for deleting the output
 * @param corpusInput     input path of the job
 * @param modelInput      passed to {@code setModelPaths}; also shown in the job name
 * @param modelOutput     output path of the job
 * @param iterationNumber number of this iteration, used in the job name and the failure message
 * @param maxIterations   total number of iterations, used in the job name only
 * @param numReduceTasks  number of reduce tasks configured on the job
 * @throws IOException            propagated from job setup or output deletion
 * @throws ClassNotFoundException propagated from running the job
 * @throws InterruptedException   propagated from running the job, and also thrown when the job does not
 *                                complete successfully
 */
public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
        int iterationNumber, int maxIterations, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {
    final String title = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations,
            modelInput);
    log.info("About to run: " + title);

    final Job iterationJob = new Job(conf, title);
    iterationJob.setJarByClass(CVB0Driver.class);

    // Processing classes: the reducer class also serves as the combiner.
    iterationJob.setMapperClass(CachingCVB0Mapper.class);
    iterationJob.setCombinerClass(VectorSumReducer.class);
    iterationJob.setReducerClass(VectorSumReducer.class);
    iterationJob.setNumReduceTasks(numReduceTasks);

    // Sequence files in and out.
    iterationJob.setInputFormatClass(SequenceFileInputFormat.class);
    iterationJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    // NOTE(review): the original comment here read "0.7IntWritable" - presumably the key class was
    // IntWritable in a 0.7 release of the upstream code; confirm before relying on it.
    iterationJob.setOutputKeyClass(Text.class);
    iterationJob.setOutputValueClass(VectorWritable.class);

    FileInputFormat.addInputPath(iterationJob, corpusInput);
    FileOutputFormat.setOutputPath(iterationJob, modelOutput);
    setModelPaths(iterationJob, modelInput);

    // Remove the output location before the run.
    HadoopUtil.delete(conf, modelOutput);

    final boolean completed = iterationJob.waitForCompletion(true);
    if (!completed) {
        throw new InterruptedException(
                String.format("Failed to complete iteration %d stage 1", iterationNumber));
    }
}

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Counts the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format.
 * <p>
 * The job runs on a copy of {@code baseConf}, so the settings added here do not leak into the caller's
 * configuration. {@code output} is deleted before the job is started.
 *
 * @param input      input path of the job
 * @param output     output path of the job
 * @param baseConf   configuration that is copied for this job
 * @param minSupport value stored in the job configuration under {@code MIN_SUPPORT}
 * @throws IOException            propagated from job setup or output deletion
 * @throws InterruptedException   propagated from running the job
 * @throws ClassNotFoundException propagated from running the job
 * @throws IllegalStateException  if the job does not complete successfully
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    final Configuration jobConf = new Configuration(baseConf);
    // Required so that conf values can be serialised.
    final String serializations = "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization";
    jobConf.set("io.serializations", serializations);
    jobConf.setInt(MIN_SUPPORT, minSupport);

    final Job countJob = new Job(jobConf);
    countJob.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    countJob.setJarByClass(DictionaryVectorizer.class);

    // Output record types.
    countJob.setOutputKeyClass(Text.class);
    countJob.setOutputValueClass(LongWritable.class);

    // Where to read from and write to.
    FileInputFormat.setInputPaths(countJob, input);
    FileOutputFormat.setOutputPath(countJob, output);

    // Processing classes and file formats.
    countJob.setMapperClass(TermCountMapper.class);
    countJob.setInputFormatClass(SequenceFileInputFormat.class);
    countJob.setCombinerClass(TermCountCombiner.class);
    countJob.setReducerClass(TermCountReducer.class);
    countJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Remove the output location before the run.
    HadoopUtil.delete(jobConf, output);

    if (!countJob.waitForCompletion(true)) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format/*  ww w .ja  v  a 2 s .c om*/
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);

    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elixir.hadoop.Chromo.FragmentCoverage.java

License:Apache License

/**
 * Configures and runs the "position" job: {@code CoverageMapper} as mapper, {@code IntSumReducer} as both
 * combiner and reducer, five reduce tasks, {@code IntPair} map output keys partitioned by
 * {@code ChromoPartitioner} and grouped by {@code FirstGroupingComparator}.
 * <p>
 * After generic Hadoop options are stripped, at least two arguments must remain: one or more input paths
 * followed by the output path. The JVM is terminated with status 0 when the job succeeds, 1 when it fails and
 * 2 on a usage error.
 *
 * @param args optional generic Hadoop options followed by {@code <in> [<in>...] <out>}
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        // Name this program in the usage text, not the "wordcount" example it was copied from.
        System.err.println("Usage: FragmentCoverage <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf, "position");
    job.setJarByClass(FragmentCoverage.class);

    job.setMapperClass(CoverageMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setNumReduceTasks(5);
    job.setMapOutputKeyClass(com.elixir.hadoop.Chromo.SecondrySort.IntPair.class);
    job.setPartitionerClass(ChromoPartitioner.class);
    job.setGroupingComparatorClass(com.elixir.hadoop.Chromo.SecondrySort.FirstGroupingComparator.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Every argument except the last is an input path; the last one is the output path.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.elixir.hadoop.FragmentCoverage.java

License:Apache License

/**
 * Configures and runs the "position" job: {@code CoverageMapper} as mapper and {@code IntSumReducer} as both
 * combiner and reducer, writing {@code Text} keys and {@code IntWritable} values.
 * <p>
 * After generic Hadoop options are stripped, at least two arguments must remain: one or more input paths
 * followed by the output path. The JVM is terminated with status 0 when the job succeeds, 1 when it fails and
 * 2 on a usage error.
 *
 * @param args optional generic Hadoop options followed by {@code <in> [<in>...] <out>}
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        // Name this program in the usage text, not the "wordcount" example it was copied from.
        System.err.println("Usage: FragmentCoverage <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf, "position");
    job.setJarByClass(FragmentCoverage.class);
    job.setMapperClass(CoverageMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Every argument except the last is an input path; the last one is the output path.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.elixir.hadoop.OddEven.java

License:Apache License

/**
 * Configures and runs the "oddeven" job: {@code TokenizerMapper} as mapper and {@code IntSumReducer} as both
 * combiner and reducer, writing {@code Text} keys and {@code IntWritable} values.
 * <p>
 * After generic Hadoop options are stripped, at least two arguments must remain: one or more input paths
 * followed by the output path. The JVM is terminated with status 0 when the job succeeds, 1 when it fails and
 * 2 on a usage error.
 *
 * @param args optional generic Hadoop options followed by {@code <in> [<in>...] <out>}
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        // Name this program in the usage text, not the "wordcount" example it was copied from.
        System.err.println("Usage: OddEven <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf, "oddeven");
    job.setJarByClass(OddEven.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Every argument except the last is an input path; the last one is the output path.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}