List of usage examples for org.apache.hadoop.mapreduce.Job.setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
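setCombinerClass registers a Reducer that Hadoop may run on the map-side output before the shuffle, so its input and output key/value types must match the map output types, and its logic must tolerate being applied zero or more times (e.g. summing partial counts). It throws IllegalStateException if the job has already been submitted. A minimal sketch of the typical word-count-style setup (MyDriver, MyMapper and IntSumReducer are illustrative names, echoing the examples below, not a specific project's classes):

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "combiner example");
job.setJarByClass(MyDriver.class);            // hypothetical driver class
job.setMapperClass(MyMapper.class);           // emits <Text, IntWritable> pairs
job.setCombinerClass(IntSumReducer.class);    // local map-side aggregation; may run zero or more times
job.setReducerClass(IntSumReducer.class);     // final aggregation
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

Reusing the reducer as the combiner, as above, only works when the reducer's input and output types are identical and its operation is commutative and associative.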
From source file:io.fluo.stress.trie.Init.java
License:Apache License
private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Init.class);
    job.setJobName(Init.class.getName() + "_load");

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
    job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));

    job.setMapperClass(InitMapper.class);
    job.setCombinerClass(InitCombiner.class);
    job.setReducerClass(InitReducer.class);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    job.setPartitionerClass(RangePartitioner.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Connector conn = AccumuloUtil.getConnector(props);

    Path splitsPath = new Path(tmp, "splits.txt");
    Collection<Text> splits1 = writeSplits(props, fs, conn, splitsPath);
    RangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits1.size() + 1);

    Path outPath = new Path(tmp, "out");
    AccumuloFileOutputFormat.setOutputPath(job, outPath);

    boolean success = job.waitForCompletion(true);

    if (success) {
        Path failPath = new Path(tmp, "failures");
        fs.mkdirs(failPath);
        conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(),
                failPath.toString(), false);
    }
    return success ? 0 : 1;
}
From source file:io.gzinga.hadoop.TestSplittableGZipCodec.java
License:Apache License
@Test
public void testSplittableGZipCodec() {
    try {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(new Path("target/test"));
        GZipOutputStreamRandomAccess gzip = new GZipOutputStreamRandomAccess(
                fs.create(new Path("target/test/testfile1.gz")));
        String str = "This is line\n";
        for (int i = 1; i <= 10000; i++) {
            gzip.write(str.getBytes());
            if (i % 100 == 0) {
                gzip.addOffset(i / 100l);
            }
        }
        Assert.assertEquals(gzip.getOffsetMap().size(), 100);
        gzip.close();

        conf.set("mapreduce.framework.name", "local");
        conf.set("io.compression.codecs", "io.gzinga.hadoop.SplittableGZipCodec");
        conf.set("mapreduce.input.fileinputformat.split.maxsize", "20000");

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("target/test/testfile1.gz"));
        FileOutputFormat.setOutputPath(job, new Path("target/test/testfile2"));
        job.waitForCompletion(true);

        BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path("target/test/testfile2/part-r-00000"))));
        Assert.assertEquals("This\t10000", br.readLine());
        Assert.assertEquals("is\t10000", br.readLine());
        Assert.assertEquals("line\t10000", br.readLine());
        br.close();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail();
    } finally {
        FileUtil.fullyDelete(new File("target/test/testfile2"));
        FileUtil.fullyDelete(new File("target/test/testfile1.gz"));
    }
}
From source file:io.ssc.trackthetrackers.extraction.hadoop.HadoopJob.java
License:Open Source License
protected Job mapReduce(Path input, Path output, Class inputFormatClass, Class outputFormatClass,
        Class mapperClass, Class mapperKeyClass, Class mapperValueClass, Class reducerClass,
        Class reducerKeyClass, Class reducerValueClass, boolean combinable) throws IOException {

    Job job = map(input, output, inputFormatClass, outputFormatClass, mapperClass, mapperKeyClass,
            mapperValueClass);

    job.setReducerClass(reducerClass);
    job.setOutputKeyClass(reducerKeyClass);
    job.setOutputValueClass(reducerValueClass);

    if (combinable) {
        job.setCombinerClass(reducerClass);
    }

    return job;
}
From source file:it.crs4.seal.recab.RecabTable.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    RecabTableOptionParser parser = new RecabTableOptionParser();
    parser.parse(getConf(), args);
    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeVariantsFile(parser);

    // Create a Job using the processed conf
    Job job = new Job(getConf(), "RecabTable " + parser.getInputPaths().get(0));

    job.setJarByClass(RecabTable.class);

    job.setInputFormatClass(FormatNameMap
            .getInputFormat(job.getConfiguration().get(RecabTableOptionParser.INPUT_FORMAT_CONF, "sam")));
    LOG.info("Using input format " + job.getInputFormatClass().getName());

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObservationCount.class);

    job.setCombinerClass(Combiner.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Count the frequencies of items
 *
 * @param params
 * @param conf
 */
public static void startParallelCounting(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input;
    Integer enableDiscretization = new Integer(params.get(ENABLE_DISCRETIZATION));
    if (enableDiscretization.compareTo(new Integer(1)) == 0) {
        input = new Path(params.get(OUTPUT), DISC);
    } else {
        input = new Path(params.get(INPUT));
    }

    Job job = new Job(conf, "Parallel Counting driver running over input: " + input);
    job.setJarByClass(ARM.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), ITEM_FREQ);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Run the Parallel FPGrowth Map/Reduce job to calculate the Top K features
 * of group dependent shards
 */
public static void startParallelFPGrowth(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input;
    Integer enableDiscretization = new Integer(params.get(ENABLE_DISCRETIZATION));
    if (enableDiscretization.compareTo(new Integer(1)) == 0) {
        input = new Path(params.get(OUTPUT), DISC);
    } else {
        input = new Path(params.get(INPUT));
    }

    Job job = new Job(conf, "PFP Growth driver running over input" + input);
    job.setJarByClass(ARM.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(TransactionTree.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelFPGrowthMapper.class);
    job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPGrowthReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:ivory.core.preprocess.ComputeGlobalTermStatistics.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = 10;

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't existing: skipping!");
        return 0;
    }

    if (!fs.exists(new Path(termDocVectorsPath))) {
        LOG.info("term doc vectors path doesn't existing: skipping!");
        return 0;
    }

    LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exist: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), ComputeGlobalTermStatistics.class.getSimpleName() + ":" + collectionName);

    job.setJarByClass(ComputeGlobalTermStatistics.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out number of postings. NOTE: this value is not the same as
    // number of postings, because postings for non-English terms are
    // discarded, or as result of df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());

    return 0;
}
From source file:ivory.preprocess.GetTermCount2.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't existing: skipping!");
        return 0;
    }

    LOG.info("PowerTool: GetTermCount2");
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exist: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), "GetTermCount2:" + collectionName);

    job.setJarByClass(GetTermCount2.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out number of postings. NOTE: this value is not the same as
    // number of postings, because postings for non-English terms are
    // discarded, or as result of df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());

    return 0;
}
From source file:javaapplication1.Object1.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:jdamasceno.hadoop.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost/tweets.tweets");
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/tweets.count");
    MongoConfigUtil.setSplitSize(conf, 4);
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count");

    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}