Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job#setCombinerClass from open-source projects.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job. The call throws IllegalStateException if the job has already been submitted; the combiner must be configured while the job is still being defined.

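A combiner runs over map-side intermediate output, so the class passed here must be a Reducer whose input and output key/value types both match the map output types; that is why a sum-style reducer such as IntSumReducer can double as a combiner in the examples below. A minimal sketch, assuming TokenizerMapper and IntSumReducer are the stock word-count classes from the Hadoop examples:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombinerExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "combiner example");
        job.setJarByClass(CombinerExample.class);
        job.setMapperClass(TokenizerMapper.class); // assumed: emits (Text, IntWritable)
        // The combiner consumes and produces the map output types, so the
        // sum reducer can be reused unchanged.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the framework may apply the combiner zero, one, or several times per key, the operation must be commutative and associative over the values: sums and counts qualify, averages do not.
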
Usage

From source file: com.mongodb.hadoop.examples.wordcount.split.WordCountSplitTest.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    boolean useQuery = false;
    for (int i = 0; i < args.length; i++) {
        final String argi = args[i];
        if (argi.equals("--use-query"))
            useQuery = true;
        else {
            throw new IllegalArgumentException(argi);
        }
    }

    if (useQuery) {
        //NOTE: must do this BEFORE Job is created
        final MongoConfig mongo_conf = new MongoConfig(conf);
        com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
        query.put("num", new com.mongodb.BasicDBObject(Collections.singletonMap("$mod", new int[] { 2, 0 })));
        System.out.println(" --- setting query on num");
        mongo_conf.setQuery(query);
        System.out.println(" --- query is: " + mongo_conf.getQuery());
    }

    final com.mongodb.MongoURI outputUri = MongoConfigUtil.getOutputURI(conf);
    if (outputUri == null)
        throw new IllegalStateException("output uri is not set");
    if (MongoConfigUtil.getInputURI(conf) == null)
        throw new IllegalStateException("input uri is not set");
    final String outputCollectionName = outputUri.getCollection();

    final Job job = new Job(conf, "word count " + outputCollectionName);

    job.setJarByClass(WordCountSplitTest.class);

    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    final long start = System.currentTimeMillis();
    System.out
            .println(" ----------------------- running test " + outputCollectionName + " --------------------");
    boolean result;
    try {
        result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.err.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
        return 1;
    }
    final long end = System.currentTimeMillis();
    final float seconds = ((float) (end - start)) / 1000;
    java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    System.out.println("finished run in " + nf.format(seconds) + " seconds");

    com.mongodb.Mongo m = new com.mongodb.Mongo(outputUri);
    com.mongodb.DB db = m.getDB(outputUri.getDatabase());
    com.mongodb.DBCollection coll = db.getCollection(outputCollectionName);
    com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
    query.put("_id", "the");
    com.mongodb.DBCursor cur = coll.find(query);
    if (!cur.hasNext())
        System.out.println("FAILURE: could not find count of \'the\'");
    else
        System.out.println("'the' count: " + cur.next());

    // ToolRunner passes run()'s return value to System.exit(), so surface job failure here.
    return result ? 0 : 1;
}

From source file: com.mongodb.hadoop.examples.wordcount.split.WordCountSplitTest.java

License: Apache License

private static void test(boolean useShards, boolean useChunks, Boolean slaveok, boolean useQuery)
        throws Exception {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost:30000/test.lines");
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_SHARDS, useShards);
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_CHUNKS, useChunks);

    if (useQuery) {
        //NOTE: must do this BEFORE Job is created
        final MongoConfig mongo_conf = new MongoConfig(conf);
        com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
        query.put("num", new com.mongodb.BasicDBObject(Collections.singletonMap("$mod", new int[] { 2, 0 })));
        System.out.println(" --- setting query on num");
        mongo_conf.setQuery(query);
        System.out.println(" --- query is: " + mongo_conf.getQuery());
    }

    String output_table = null;
    if (useChunks) {
        if (useShards)
            output_table = "with_shards_and_chunks";
        else
            output_table = "with_chunks";
    } else {
        if (useShards)
            output_table = "with_shards";
        else
            output_table = "no_splits";
    }
    if (slaveok != null) {
        output_table += "_" + slaveok;
    }
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:30000/test." + output_table);
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count " + output_table);

    job.setJarByClass(WordCountSplitTest.class);

    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    final long start = System.currentTimeMillis();
    System.out.println(" ----------------------- running test " + output_table + " --------------------");
    try {
        boolean result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.err.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
    final long end = System.currentTimeMillis();
    final float seconds = ((float) (end - start)) / 1000;
    java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    System.out.println("finished run in " + nf.format(seconds) + " seconds");

    com.mongodb.Mongo m = new com.mongodb.Mongo(
            new com.mongodb.MongoURI("mongodb://localhost:30000/?slaveok=true"));
    com.mongodb.DB db = m.getDB("test");
    com.mongodb.DBCollection coll = db.getCollection(output_table);
    com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
    query.put("_id", "the");
    com.mongodb.DBCursor cur = coll.find(query);
    if (!cur.hasNext())
        System.out.println("FAILURE: could not find count of \'the\'");
    else
        System.out.println("'the' count: " + cur.next());

    //        if (! result)
    //           System.exit(  1 );
}

From source file: com.mongodb.hadoop.examples.wordcount.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {

    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost/test.in");
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/test.out");
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count");

    job.setJarByClass(WordCount.class);

    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.mongodb.hadoop.util.MongoTool.java

License: Apache License

private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that the space after -D is important - without
     * the space the property gets picked up by Java itself
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
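
As a usage sketch, a Tool like this is normally launched through ToolRunner so that the -D pairs are parsed into the Configuration before runMapReduceJob() reads it. The jar name below is a placeholder and the property keys are assumed from MongoConfigUtil's constants:

hadoop jar mongo-hadoop-examples.jar com.mongodb.hadoop.util.MongoTool \
    -D mongo.input.uri=mongodb://localhost/test.in \
    -D mongo.output.uri=mongodb://localhost/test.out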

From source file: com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java

License: Apache License

/**
 * Configures the MapReduce combiner for the job.
 *
 * @param job The Hadoop MR job.
 */
protected void configureCombiner(Job job) {
    final FijiReducer<?, ?, ?, ?> combiner = getCombiner();
    if (null == combiner) {
        LOG.debug("No combiner provided.");
        return;
    }
    if (combiner instanceof Configurable) {
        ((Configurable) combiner).setConf(job.getConfiguration());
    }
    job.setCombinerClass(((Reducer<?, ?, ?, ?>) combiner).getClass());
}
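
The Configurable check above lets a combiner read settings out of the job configuration before its class is registered with setCombinerClass. Below is a minimal sketch of a combiner written for that pattern; the class name and the example.combiner.verbose key are illustrative, not part of the Fiji API:

import java.io.IOException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ConfigurableSumCombiner extends Reducer<Text, IntWritable, Text, IntWritable>
        implements Configurable {

    private Configuration conf;
    private boolean verbose;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        // Illustrative flag pulled from the job configuration.
        verbose = conf.getBoolean("example.combiner.verbose", false);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        if (verbose) {
            System.err.println("combining " + key + " -> " + sum);
        }
        context.write(key, new IntWritable(sum));
    }
}

At runtime Hadoop instantiates the registered combiner class through ReflectionUtils.newInstance, which also calls setConf on Configurable objects, so the same hook fires inside the tasks as well.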

From source file: com.mozilla.socorro.hadoop.CrashReportJob.java

License: LGPL

/**
 * Builds the crash-report scan job for the date range found in the configuration.
 *
 * @return the configured Job
 * @throws IOException
 * @throws ParseException
 */
public static Job initJob(String jobName, Configuration conf, Class<?> mainClass,
        Class<? extends TableMapper> mapperClass, Class<? extends Reducer> combinerClass,
        Class<? extends Reducer> reducerClass, Map<byte[], byte[]> columns,
        Class<? extends WritableComparable> keyOut, Class<? extends Writable> valueOut, Path outputPath)
        throws IOException, ParseException {
    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");

    String startDateStr = conf.get(START_DATE);
    String endDateStr = conf.get(END_DATE);
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    conf.setLong(START_TIME, startCal.getTimeInMillis());
    conf.setLong(END_TIME, DateUtil.getEndTimeAtResolution(endCal.getTimeInMillis(), Calendar.DATE));

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(mainClass);

    // input table configuration
    Scan[] scans = MultiScanTableMapReduceUtil.generateScans(startCal, endCal, columns, 100, false);
    MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(TABLE_NAME_CRASH_REPORTS, scans, mapperClass,
            keyOut, valueOut, job);

    if (combinerClass != null) {
        job.setCombinerClass(combinerClass);
    }

    if (reducerClass != null) {
        job.setReducerClass(reducerClass);
    } else {
        job.setNumReduceTasks(0);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}

From source file: com.mvdb.platform.scratch.action.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    logger.error("error1");
    logger.warn("warning1");
    logger.info("info1");
    logger.debug("debug1");
    logger.trace("trace1");

    ILoggerFactory lc = LoggerFactory.getILoggerFactory();
    System.err.println("lc:" + lc);
    // print logback's internal status
    //StatusPrinter.print(lc);

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // GenericOptionsParser.getRemainingArgs() never returns null, so echo the args directly.
    for (String arg : otherArgs) {
        System.out.println(arg);
    }
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.mycompany.keywordsearch.KeywordSearch.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, String.valueOf(true));
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Keyword:\t");
    conf.set(KEYWORD, in.readLine());
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(KeywordSearch.class);
    job.setInputFormatClass(TextInputFormatV2.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    clearOutput(conf, output);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.mycompany.searcher.Searcher.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Please input a keyword:\t");
    conf.set(KEYWORD, in.readLine());
    conf.set(MINIMUM, args[2]);

    Job job = Job.getInstance(conf, "keyword search");

    job.setJarByClass(Searcher.class);

    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    clearOutput(conf, new Path(args[1]));
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
    TimeUnit.SECONDS.sleep(1);

    in = new BufferedReader(
            new InputStreamReader(FileSystem.get(conf).open(new Path(args[1] + "/part-r-00000")), "UTF-8"));
    String line;
    HashMap<String, Integer> map = new HashMap<>();
    while ((line = in.readLine()) != null) {
        StringTokenizer tok = new StringTokenizer(line);
        map.put(tok.nextToken(), Integer.valueOf(tok.nextToken()));
    }

    List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
    Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
        @Override
        public int compare(Map.Entry<String, Integer> entry1, Map.Entry<String, Integer> entry2) {
            // Integer.compare avoids the overflow risk of subtracting raw counts.
            return Integer.compare(entry2.getValue(), entry1.getValue());
        }
    });
    for (Map.Entry<String, Integer> entry : list) {
        in = new BufferedReader(
                new InputStreamReader(FileSystem.get(conf).open(new Path(entry.getKey())), "UTF-8"));
        System.out.println("\n" + in.readLine());
        System.out.println("\n" + in.readLine() + ":" + entry.getValue() + "\n");
    }
}

From source file: com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(LABELS, "l", "comma-separated list of labels to include in training", false);

    addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
            buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
    }
    Path labPath;
    String labPathStr = getOption(LABEL_INDEX);
    if (labPathStr != null) {
        labPath = new Path(labPathStr);
    } else {
        labPath = getTempPath(LABEL_INDEX);
    }
    long labelSize = createLabelIndex(labPath);
    float alphaI = Float.parseFloat(getOption(ALPHA_I));
    boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY));

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());

    //add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
            SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
            VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    //sum up all the weights from the previous step, per label and per feature
    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

    //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors --
    // TODO: add reference here to the part of the Rennie paper that discusses this
    Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
            SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    thetaSummer.setCombinerClass(VectorSumReducer.class);
    thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
    thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
    /* TODO(robinanil): Enable this when thetanormalization works.
    succeeded = thetaSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }*/

    //validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(getOutputPath(), getConf());

    return 0;
}