Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
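
The combiner must itself be a Reducer whose input and output key/value types match the map output types, and the call throws IllegalStateException if the job has already been submitted. As a quick orientation, here is a minimal word-count driver sketch that mirrors the examples on this page; the input and output paths are placeholders.

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);

    job.setMapperClass(TokenizerMapper.class);  // emits <Text, IntWritable> pairs
    job.setCombinerClass(IntSumReducer.class);  // locally pre-aggregates map output
    job.setReducerClass(IntSumReducer.class);   // same class reused for the reduce phase

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path("input"));    // placeholder input path
    FileOutputFormat.setOutputPath(job, new Path("output")); // placeholder output path
    System.exit(job.waitForCompletion(true) ? 0 : 1);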

Usage

From source file:io.fluo.stress.trie.Init.java

License:Apache License

private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
    Job job = Job.getInstance(getConf());

    job.setJarByClass(Init.class);

    job.setJobName(Init.class.getName() + "_load");

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
    job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));

    job.setMapperClass(InitMapper.class);
    job.setCombinerClass(InitCombiner.class);
    job.setReducerClass(InitReducer.class);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);

    job.setPartitionerClass(RangePartitioner.class);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    Connector conn = AccumuloUtil.getConnector(props);

    Path splitsPath = new Path(tmp, "splits.txt");

    Collection<Text> splits1 = writeSplits(props, fs, conn, splitsPath);

    RangePartitioner.setSplitFile(job, splitsPath.toString());
    job.setNumReduceTasks(splits1.size() + 1);

    Path outPath = new Path(tmp, "out");
    AccumuloFileOutputFormat.setOutputPath(job, outPath);

    boolean success = job.waitForCompletion(true);

    if (success) {
        Path failPath = new Path(tmp, "failures");
        fs.mkdirs(failPath);
        conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(),
                failPath.toString(), false);
    }
    return success ? 0 : 1;
}

From source file:io.gzinga.hadoop.TestSplittableGZipCodec.java

License:Apache License

@Test
public void testSplittableGZipCodec() {
    try {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        fs.mkdirs(new Path("target/test"));
        GZipOutputStreamRandomAccess gzip = new GZipOutputStreamRandomAccess(
                fs.create(new Path("target/test/testfile1.gz")));
        String str = "This is line\n";
        for (int i = 1; i <= 10000; i++) {
            gzip.write(str.getBytes());
            if (i % 100 == 0) {
                gzip.addOffset(i / 100l);
            }
        }
        Assert.assertEquals(gzip.getOffsetMap().size(), 100);
        gzip.close();

        conf.set("mapreduce.framework.name", "local");
        conf.set("io.compression.codecs", "io.gzinga.hadoop.SplittableGZipCodec");
        conf.set("mapreduce.input.fileinputformat.split.maxsize", "20000");
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCount.TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("target/test/testfile1.gz"));
        FileOutputFormat.setOutputPath(job, new Path("target/test/testfile2"));
        job.waitForCompletion(true);

        BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(new Path("target/test/testfile2/part-r-00000"))));
        Assert.assertEquals("This\t10000", br.readLine());
        Assert.assertEquals("is\t10000", br.readLine());
        Assert.assertEquals("line\t10000", br.readLine());
        br.close();
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail();
    } finally {
        FileUtil.fullyDelete(new File("target/test/testfile2"));
        FileUtil.fullyDelete(new File("target/test/testfile1.gz"));
    }
}

From source file:io.ssc.trackthetrackers.extraction.hadoop.HadoopJob.java

License:Open Source License

protected Job mapReduce(Path input, Path output, Class inputFormatClass, Class outputFormatClass,
        Class mapperClass, Class mapperKeyClass, Class mapperValueClass, Class reducerClass,
        Class reducerKeyClass, Class reducerValueClass, boolean combinable) throws IOException {

    Job job = map(input, output, inputFormatClass, outputFormatClass, mapperClass, mapperKeyClass,
            mapperValueClass);

    job.setReducerClass(reducerClass);
    job.setOutputKeyClass(reducerKeyClass);
    job.setOutputValueClass(reducerValueClass);

    if (combinable) {
        job.setCombinerClass(reducerClass);
    }

    return job;
}
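
Reusing the reducer as the combiner, as this helper does when combinable is true, is only correct when the reduce function is commutative and associative, because Hadoop may apply the combiner zero, one, or several times to the map output.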

From source file:it.crs4.seal.recab.RecabTable.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    RecabTableOptionParser parser = new RecabTableOptionParser();
    parser.parse(getConf(), args);

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeVariantsFile(parser);

    // Create a Job using the processed conf
    Job job = new Job(getConf(), "RecabTable " + parser.getInputPaths().get(0));

    job.setJarByClass(RecabTable.class);
    job.setInputFormatClass(FormatNameMap
            .getInputFormat(job.getConfiguration().get(RecabTableOptionParser.INPUT_FORMAT_CONF, "sam")));
    LOG.info("Using input format " + job.getInputFormatClass().getName());

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObservationCount.class);

    job.setCombinerClass(Combiner.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}
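
Note that in this example the combiner (Combiner.class) is a distinct class from the reducer: a combiner must consume and emit the map output types (here Text keys and ObservationCount values), whereas the reducer emits Text values, so the reducer cannot simply be reused as the combiner.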

From source file:it.polito.dbdmg.searum.ARM.java

License:Apache License

/**
 *
 * Count the frequencies of items
 * 
 * @param params
 * @param conf
 */
public static void startParallelCounting(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());

    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input;
    Integer enableDiscretization = new Integer(params.get(ENABLE_DISCRETIZATION));
    if (enableDiscretization.compareTo(new Integer(1)) == 0) {
        input = new Path(params.get(OUTPUT), DISC);
    } else {
        input = new Path(params.get(INPUT));
    }

    Job job = new Job(conf, "Parallel Counting driver running over input: " + input);
    job.setJarByClass(ARM.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), ITEM_FREQ);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

}

From source file:it.polito.dbdmg.searum.ARM.java

License:Apache License

/**
 * Run the Parallel FPGrowth Map/Reduce job to calculate the Top K features
 * of group dependent shards
 */
public static void startParallelFPGrowth(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    Path input;
    Integer enableDiscretization = new Integer(params.get(ENABLE_DISCRETIZATION));
    if (enableDiscretization.compareTo(new Integer(1)) == 0) {
        input = new Path(params.get(OUTPUT), DISC);
    } else {
        input = new Path(params.get(INPUT));
    }
    Job job = new Job(conf, "PFP Growth driver running over input" + input);
    job.setJarByClass(ARM.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(TransactionTree.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelFPGrowthMapper.class);
    job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPGrowthReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:ivory.core.preprocess.ComputeGlobalTermStatistics.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();

    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = 10;

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't existing: skipping!");
        return 0;
    }

    if (!fs.exists(new Path(termDocVectorsPath))) {
        LOG.info("term doc vectors path doesn't existing: skipping!");
        return 0;
    }

    LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exist: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), ComputeGlobalTermStatistics.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(ComputeGlobalTermStatistics.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out number of postings. NOTE: this value is not the same as
    // number of postings, because postings for non-English terms are
    // discarded, or as result of df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());

    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());
    return 0;
}

From source file:ivory.preprocess.GetTermCount2.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();

    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't existing: skipping!");
        return 0;
    }

    LOG.info("PowerTool: GetTermCount2");
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exist: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), "GetTermCount2:" + collectionName);
    job.setJarByClass(GetTermCount2.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out number of postings. NOTE: this value is not the same as
    // number of postings, because postings for non-English terms are
    // discarded, or as result of df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());

    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());
    return 0;
}

From source file:javaapplication1.Object1.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:jdamasceno.hadoop.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {

    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost/tweets.tweets");
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/tweets.count");
    MongoConfigUtil.setSplitSize(conf, 4);

    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count");

    job.setJarByClass(WordCount.class);

    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}