Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job#setCombinerClass from open-source projects.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job. The call throws IllegalStateException if the job has already been submitted; the combiner must be configured while the job is still being defined.

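A combiner runs over map-side intermediate output, so the class passed here must be a Reducer whose input and output key/value types both match the map output types; that is why a sum-style reducer such as IntSumReducer can double as a combiner in the examples below. A minimal sketch, assuming TokenizerMapper and IntSumReducer are the stock word-count classes from the Hadoop examples:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombinerExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "combiner example");
        job.setJarByClass(CombinerExample.class);
        job.setMapperClass(TokenizerMapper.class); // assumed: emits (Text, IntWritable)
        // The combiner consumes and produces the map output types, so the
        // sum reducer can be reused unchanged.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the framework may apply the combiner zero, one, or several times per key, the operation must be commutative and associative over the values: sums and counts qualify, averages do not.
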
Usage

From source file: com.mongodb.hadoop.examples.wordcount.split.WordCountSplitTest.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    boolean useQuery = false;
    for (int i = 0; i < args.length; i++) {
        final String argi = args[i];
        if (argi.equals("--use-query"))
            useQuery = true;
        else {
            throw new IllegalArgumentException(argi);
        }
    }

    if (useQuery) {
        //NOTE: must do this BEFORE Job is created
        final MongoConfig mongo_conf = new MongoConfig(conf);
        com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
        query.put("num", new com.mongodb.BasicDBObject(Collections.singletonMap("$mod", new int[] { 2, 0 })));
        System.out.println(" --- setting query on num");
        mongo_conf.setQuery(query);
        System.out.println(" --- query is: " + mongo_conf.getQuery());
    }

    final com.mongodb.MongoURI outputUri = MongoConfigUtil.getOutputURI(conf);
    if (outputUri == null)
        throw new IllegalStateException("output uri is not set");
    if (MongoConfigUtil.getInputURI(conf) == null)
        throw new IllegalStateException("input uri is not set");
    final String outputCollectionName = outputUri.getCollection();

    final Job job = new Job(conf, "word count " + outputCollectionName);

    job.setJarByClass(WordCountSplitTest.class);

    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    final long start = System.currentTimeMillis();
    System.out
            .println(" ----------------------- running test " + outputCollectionName + " --------------------");
    boolean result;
    try {
        result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.err.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
        return 1;
    }
    final long end = System.currentTimeMillis();
    final float seconds = ((float) (end - start)) / 1000;
    java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    System.out.println("finished run in " + nf.format(seconds) + " seconds");

    com.mongodb.Mongo m = new com.mongodb.Mongo(outputUri);
    com.mongodb.DB db = m.getDB(outputUri.getDatabase());
    com.mongodb.DBCollection coll = db.getCollection(outputCollectionName);
    com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
    query.put("_id", "the");
    com.mongodb.DBCursor cur = coll.find(query);
    if (!cur.hasNext())
        System.out.println("FAILURE: could not find count of \'the\'");
    else
        System.out.println("'the' count: " + cur.next());

    // ToolRunner passes run()'s return value to System.exit(), so surface job failure here.
    return result ? 0 : 1;
}

From source file: com.mongodb.hadoop.examples.wordcount.split.WordCountSplitTest.java

License: Apache License

private static void test(boolean useShards, boolean useChunks, Boolean slaveok, boolean useQuery)
        throws Exception {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost:30000/test.lines");
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_SHARDS, useShards);
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_CHUNKS, useChunks);

    if (useQuery) {
        //NOTE: must do this BEFORE Job is created
        final MongoConfig mongo_conf = new MongoConfig(conf);
        com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
        query.put("num", new com.mongodb.BasicDBObject(Collections.singletonMap("$mod", new int[] { 2, 0 })));
        System.out.println(" --- setting query on num");
        mongo_conf.setQuery(query);
        System.out.println(" --- query is: " + mongo_conf.getQuery());
    }

    String output_table = null;
    if (useChunks) {
        if (useShards)
            output_table = "with_shards_and_chunks";
        else
            output_table = "with_chunks";
    } else {
        if (useShards)
            output_table = "with_shards";
        else
            output_table = "no_splits";
    }
    if (slaveok != null) {
        output_table += "_" + slaveok;
    }
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:30000/test." + output_table);
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count " + output_table);

    job.setJarByClass(WordCountSplitTest.class);

    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    final long start = System.currentTimeMillis();
    System.out.println(" ----------------------- running test " + output_table + " --------------------");
    try {
        boolean result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.err.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
    final long end = System.currentTimeMillis();
    final float seconds = ((float) (end - start)) / 1000;
    java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    System.out.println("finished run in " + nf.format(seconds) + " seconds");

    com.mongodb.Mongo m = new com.mongodb.Mongo(
            new com.mongodb.MongoURI("mongodb://localhost:30000/?slaveok=true"));
    com.mongodb.DB db = m.getDB("test");
    com.mongodb.DBCollection coll = db.getCollection(output_table);
    com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
    query.put("_id", "the");
    com.mongodb.DBCursor cur = coll.find(query);
    if (!cur.hasNext())
        System.out.println("FAILURE: could not find count of \'the\'");
    else
        System.out.println("'the' count: " + cur.next());

    //        if (! result)
    //           System.exit(  1 );
}

From source file: com.mongodb.hadoop.examples.wordcount.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {

    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost/test.in");
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/test.out");
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count");

    job.setJarByClass(WordCount.class);

    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.mongodb.hadoop.util.MongoTool.java

License: Apache License

private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that the space after -D is important - without
     * the space the property gets picked up by Java itself
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
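
As a usage sketch, a Tool like this is normally launched through ToolRunner so that the -D pairs are parsed into the Configuration before runMapReduceJob() reads it. The jar name below is a placeholder and the property keys are assumed from MongoConfigUtil's constants:

hadoop jar mongo-hadoop-examples.jar com.mongodb.hadoop.util.MongoTool \
    -D mongo.input.uri=mongodb://localhost/test.in \
    -D mongo.output.uri=mongodb://localhost/test.out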

From source file: com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java

License: Apache License

/**
 * Configures the MapReduce combiner for the job.
 *
 * @param job The Hadoop MR job.
 */
protected void configureCombiner(Job job) {
    final FijiReducer<?, ?, ?, ?> combiner = getCombiner();
    if (null == combiner) {
        LOG.debug("No combiner provided.");
        return;
    }
    if (combiner instanceof Configurable) {
        ((Configurable) combiner).setConf(job.getConfiguration());
    }
    job.setCombinerClass(((Reducer<?, ?, ?, ?>) combiner).getClass());
}
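
The Configurable check above lets a combiner read settings out of the job configuration before its class is registered with setCombinerClass. Below is a minimal sketch of a combiner written for that pattern; the class name and the example.combiner.verbose key are illustrative, not part of the Fiji API:

import java.io.IOException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ConfigurableSumCombiner extends Reducer<Text, IntWritable, Text, IntWritable>
        implements Configurable {

    private Configuration conf;
    private boolean verbose;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        // Illustrative flag pulled from the job configuration.
        verbose = conf.getBoolean("example.combiner.verbose", false);
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        if (verbose) {
            System.err.println("combining " + key + " -> " + sum);
        }
        context.write(key, new IntWritable(sum));
    }
}

At runtime Hadoop instantiates the registered combiner class through ReflectionUtils.newInstance, which also calls setConf on Configurable objects, so the same hook fires inside the tasks as well.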

From source file: com.mozilla.socorro.hadoop.CrashReportJob.java

License: LGPL

/**
 * Builds the crash-report scan job for the date range found in the configuration.
 *
 * @return the configured Job
 * @throws IOException
 * @throws ParseException
 */
public static Job initJob(String jobName, Configuration conf, Class<?> mainClass,
        Class<? extends TableMapper> mapperClass, Class<? extends Reducer> combinerClass,
        Class<? extends Reducer> reducerClass, Map<byte[], byte[]> columns,
        Class<? extends WritableComparable> keyOut, Class<? extends Writable> valueOut, Path outputPath)
        throws IOException, ParseException {
    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");

    String startDateStr = conf.get(START_DATE);
    String endDateStr = conf.get(END_DATE);
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    conf.setLong(START_TIME, startCal.getTimeInMillis());
    conf.setLong(END_TIME, DateUtil.getEndTimeAtResolution(endCal.getTimeInMillis(), Calendar.DATE));

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(mainClass);

    // input table configuration
    Scan[] scans = MultiScanTableMapReduceUtil.generateScans(startCal, endCal, columns, 100, false);
    MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(TABLE_NAME_CRASH_REPORTS, scans, mapperClass,
            keyOut, valueOut, job);

    if (combinerClass != null) {
        job.setCombinerClass(combinerClass);
    }

    if (reducerClass != null) {
        job.setReducerClass(reducerClass);
    } else {
        job.setNumReduceTasks(0);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}

From source file: com.mvdb.platform.scratch.action.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    logger.error("error1");
    logger.warn("warning1");
    logger.info("info1");
    logger.debug("debug1");
    logger.trace("trace1");

    ILoggerFactory lc = LoggerFactory.getILoggerFactory();
    System.err.println("lc:" + lc);
    // print logback's internal status
    //StatusPrinter.print(lc);

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    // GenericOptionsParser.getRemainingArgs() never returns null, so echo the args directly.
    for (String arg : otherArgs) {
        System.out.println(arg);
    }
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.mycompany.keywordsearch.KeywordSearch.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, String.valueOf(true));
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Keyword:\t");
    conf.set(KEYWORD, in.readLine());
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(KeywordSearch.class);
    job.setInputFormatClass(TextInputFormatV2.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    clearOutput(conf, output);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.mycompany.searcher.Searcher.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Please input a keyword:\t");
    conf.set(KEYWORD, in.readLine());
    conf.set(MINIMUM, args[2]);

    Job job = Job.getInstance(conf, "keyword search");

    job.setJarByClass(Searcher.class);

    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    clearOutput(conf, new Path(args[1]));
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
    TimeUnit.SECONDS.sleep(1);

    in = new BufferedReader(
            new InputStreamReader(FileSystem.get(conf).open(new Path(args[1] + "/part-r-00000")), "UTF-8"));
    String line;
    HashMap<String, Integer> map = new HashMap<>();
    while ((line = in.readLine()) != null) {
        StringTokenizer tok = new StringTokenizer(line);
        map.put(tok.nextToken(), Integer.valueOf(tok.nextToken()));
    }

    List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
    Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
        @Override
        public int compare(Map.Entry<String, Integer> entry1, Map.Entry<String, Integer> entry2) {
            // Integer.compare avoids the overflow risk of subtracting raw counts.
            return Integer.compare(entry2.getValue(), entry1.getValue());
        }
    });
    for (Map.Entry<String, Integer> entry : list) {
        in = new BufferedReader(
                new InputStreamReader(FileSystem.get(conf).open(new Path(entry.getKey())), "UTF-8"));
        System.out.println("\n" + in.readLine());
        System.out.println("\n" + in.readLine() + ":" + entry.getValue() + "\n");
    }
}

From source file: com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(LABELS, "l", "comma-separated list of labels to include in training", false);

    addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
            buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
    }
    Path labPath;
    String labPathStr = getOption(LABEL_INDEX);
    if (labPathStr != null) {
        labPath = new Path(labPathStr);
    } else {
        labPath = getTempPath(LABEL_INDEX);
    }
    long labelSize = createLabelIndex(labPath);
    float alphaI = Float.parseFloat(getOption(ALPHA_I));
    boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY));

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());

    //add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
            SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
            VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    //sum up all the weights from the previous step, per label and per feature
    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

    //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors --
    // TODO: add reference here to the part of the Rennie paper that discusses this
    Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
            SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    thetaSummer.setCombinerClass(VectorSumReducer.class);
    thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
    thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
    /* TODO(robinanil): Enable this when thetanormalization works.
    succeeded = thetaSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }*/

    //validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(getOutputPath(), getConf());

    return 0;
}