Example usage for org.apache.hadoop.mapreduce Job setReducerClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job.setReducerClass taken from open-source projects.

Prototype

public void setReducerClass(Class<? extends Reducer> cls) throws IllegalStateException

Source Link

Document

Set the Reducer for the job.

Usage

From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java

/**
 * Builds the MapReduce job that merges the crawl database.
 *
 * <p>The {@code current} sub-directory of {@code crawldb} is added as input when it
 * exists, and the result is written to the {@code new} sub-directory. Any existing
 * {@code new} directory is removed first.
 *
 * @param conf    base configuration used to create the job
 * @param crawldb root directory of the crawl database
 * @return the configured job; it is not submitted here
 * @throws IOException if the job cannot be created or the file system cannot be accessed
 */
public static Job createJob(Configuration conf, Path crawldb) throws IOException {

    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    // NOTE(review): the key is "mapred" and the value is a developer-local jar path;
    // confirm whether a jar-related key was intended and whether the path should be configurable.
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");

    // The file system is resolved with the crawler's own configuration, not with conf.
    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }

    if (fs.exists(newdb)) {
        // Explicit recursive delete; the single-argument delete(Path) overload is deprecated.
        fs.delete(newdb, true);
    }

    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job;
}

From source file:cn.itcast.hadoop.mr.wordcount.DBCountPageView.java

License:Apache License

/**
 * Runs the page-view counting job against a database.
 *
 * <p>Usage: {@code DBCountPageView [driverClass dburl]}. When fewer than two arguments
 * are given, {@code DRIVER_CLASS} and {@code DB_URL} are used.
 *
 * @param args optional JDBC driver class name and database URL
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if setup or the job fails, or if {@code verify()} reports a mismatch
 */
@Override
public int run(String[] args) throws Exception {
    // Command-line values override the defaults only when both are supplied.
    final boolean overridden = args.length > 1;
    final String jdbcDriver = overridden ? args[0] : DRIVER_CLASS;
    final String jdbcUrl = overridden ? args[1] : DB_URL;

    initialize(jdbcDriver, jdbcUrl);

    // Store the database connection settings in the job configuration.
    final Configuration configuration = getConf();
    DBConfiguration.configureDB(configuration, jdbcDriver, jdbcUrl);

    final Job pageviewJob = Job.getInstance(configuration);
    pageviewJob.setJobName("Count Pageviews of URLs");
    pageviewJob.setJarByClass(DBCountPageView.class);

    // Processing classes: mapper, combiner and reducer.
    pageviewJob.setMapperClass(PageviewMapper.class);
    pageviewJob.setCombinerClass(LongSumReducer.class);
    pageviewJob.setReducerClass(PageviewReducer.class);

    // Input: table "HAccess", no conditions, ordered by "url".
    // setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
    //          String conditions, String orderBy, String... fieldNames)
    DBInputFormat.setInput(pageviewJob, AccessRecord.class, "HAccess", null, "url", AccessFieldNames);

    // Output: table "Pageview".
    DBOutputFormat.setOutput(pageviewJob, "Pageview", PageviewFieldNames);

    // Intermediate (map output) key/value types.
    pageviewJob.setMapOutputKeyClass(Text.class);
    pageviewJob.setMapOutputValueClass(LongWritable.class);

    // Final (reduce output) key/value types.
    pageviewJob.setOutputKeyClass(PageviewRecord.class);
    pageviewJob.setOutputValueClass(NullWritable.class);

    try {
        final int exitCode = pageviewJob.waitForCompletion(true) ? 0 : 1;

        // The result is verified regardless of the job's own exit status.
        if (!verify()) {
            throw new RuntimeException("Evaluation was not correct!");
        }
        return exitCode;
    } finally {
        shutdown();
    }
}

From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java

License:Apache License

/**
 * Creates a job named {@code NAME + "_" + name} and configures its mapper, reducer,
 * input format and task settings.
 *
 * @param name suffix appended to {@code NAME} to form the job name
 * @param conf configuration the job is created from
 * @return the configured job; it is not submitted here
 * @throws IOException if the job cannot be created
 */
@SuppressWarnings("deprecation")
private static Job createJob(String name, Configuration conf) throws IOException {
    final String jobName = NAME + "_" + name;
    final Job bbpJob = new Job(conf, jobName);
    bbpJob.setJarByClass(BaileyBorweinPlouffe.class);

    // Input side.
    bbpJob.setInputFormatClass(BbpInputFormat.class);

    // Map side.
    bbpJob.setMapperClass(BbpMapper.class);
    bbpJob.setMapOutputKeyClass(LongWritable.class);
    bbpJob.setMapOutputValueClass(BytesWritable.class);

    // Reduce side: a single reduce task.
    bbpJob.setReducerClass(BbpReducer.class);
    bbpJob.setOutputKeyClass(LongWritable.class);
    bbpJob.setOutputValueClass(BytesWritable.class);
    bbpJob.setNumReduceTasks(1);

    final Configuration settings = bbpJob.getConfiguration();
    // A timeout of 0 disables the task timeout.
    settings.setLong(MRJobConfig.TASK_TIMEOUT, 0);
    // No speculative execution for either phase.
    settings.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    settings.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);

    return bbpJob;
}

From source file:cn.jpush.hdfs.mr.example.WordMedian.java

License:Apache License

/**
 * Runs the "word median" job and stores the computed median in {@code median}.
 *
 * <p>The incoming {@code args} are ignored: the input and output paths are derived from
 * {@code ConfigUtils.HDFS.WORDCOUNT_IN} and {@code ConfigUtils.HDFS.WORDCOUNT_OUT},
 * the latter formatted with a random number.
 *
 * @param args unused
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if the job or the median computation fails
 */
public int run(String[] args) throws Exception {
    final long runId = new Random().nextLong();
    log.info("random -> " + runId);

    final String inputLocation = String.format(ConfigUtils.HDFS.WORDCOUNT_IN, "word.txt");
    final String outputLocation = String.format(ConfigUtils.HDFS.WORDCOUNT_OUT, runId);

    setConf(new Configuration());
    final Configuration configuration = getConf();

    @SuppressWarnings("deprecation")
    final Job medianJob = new Job(configuration, "word median");
    medianJob.setJarByClass(WordMedian.class);

    // The reducer class is also used as the combiner.
    medianJob.setMapperClass(WordMedianMapper.class);
    medianJob.setCombinerClass(WordMedianReducer.class);
    medianJob.setReducerClass(WordMedianReducer.class);

    medianJob.setOutputKeyClass(IntWritable.class);
    medianJob.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(medianJob, new Path(inputLocation));
    FileOutputFormat.setOutputPath(medianJob, new Path(outputLocation));

    final boolean succeeded = medianJob.waitForCompletion(true);

    // The map-output record counter gives the total word count, from which the
    // middle position(s) are derived.
    // NOTE(review): the counters and output are read even when the job failed.
    final long wordCount = medianJob.getCounters().getGroup(TaskCounter.class.getCanonicalName())
            .findCounter("MAP_OUTPUT_RECORDS", "Map output records").getValue();
    final double midpoint = wordCount / 2.0;
    final int upperIndex = (int) Math.ceil(midpoint);
    final int lowerIndex = (int) Math.floor(midpoint);

    median = readAndFindMedian(outputLocation, upperIndex, lowerIndex, configuration);

    if (succeeded) {
        return 0;
    }
    return 1;
}

From source file:cn.lhfei.hadoop.ch02.MaxTemperature.java

License:Apache License

/**
 * Command-line entry point for the max-temperature job.
 *
 * <p>Expects exactly two arguments: the input path and the output path. The process
 * exits with -1 on a usage error, 0 when the job succeeds, and 1 when the job fails
 * or cannot be set up or run.
 *
 * @param args input path followed by output path
 */
public static void main(String[] args) {

    log.debug("Logging ... ");

    if (args.length != 2) {
        System.err.println("Usage: MaxTemperature <input path> <output path>");
        System.exit(-1);
    }

    // Assume failure until the job reports success, so that any exception
    // below results in a non-zero exit status instead of a silent 0.
    int exitCode = 1;
    try {
        Job job = new Job();
        job.setJarByClass(MaxTemperature.class);
        job.setJobName("Max temperature");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        exitCode = job.waitForCompletion(true) ? 0 : 1;
    } catch (IllegalStateException e) {
        log.error(e.getMessage(), e);
    } catch (IllegalArgumentException e) {
        log.error(e.getMessage(), e);
    } catch (ClassNotFoundException e) {
        log.error(e.getMessage(), e);
    } catch (IOException e) {
        log.error(e.getMessage(), e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the interruption is not lost.
        Thread.currentThread().interrupt();
        log.error(e.getMessage(), e);
    }
    System.exit(exitCode);
}

From source file:cn.lhfei.hadoop.ch04.MaxTemperatureWithCompression.java

License:Apache License

/**
 * Command-line entry point for the max-temperature job with gzip-compressed output.
 *
 * <p>Expects exactly two arguments: the input path and the output path. The process
 * exits with -1 on a usage error, 0 when the job succeeds, and 1 when the job fails
 * or one of the handled exceptions is raised.
 *
 * @param args input path followed by output path
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: MaxTemperatureWithCompression <input path> " + "<output path>");
        System.exit(-1);
    }

    // Assume failure until the job reports success, so that a handled exception
    // results in a non-zero exit status instead of a silent 0.
    int exitCode = 1;
    try {
        Job job = new Job();
        job.setJarByClass(MaxTemperatureWithCompression.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Compress the job output with gzip.
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setCombinerClass(MaxTemperatureReducer.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        exitCode = job.waitForCompletion(true) ? 0 : 1;

    } catch (IOException e) {
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the interruption is not lost.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }

    System.exit(exitCode);
}

From source file:cn.lhfei.hadoop.ch05.v2.MaxTemperatureDriver.java

License:Apache License

/**
 * Configures and runs the max-temperature job.
 *
 * @param args input path followed by output path
 * @return -1 on a usage error, 0 if the job succeeded, 1 otherwise
 * @throws Exception if the job cannot be created or run
 */
@Override
public int run(String[] args) throws Exception {
    final boolean hasBothPaths = args.length == 2;
    if (!hasBothPaths) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    final Path inputPath = new Path(args[0]);
    final Path outputPath = new Path(args[1]);

    final Job temperatureJob = new Job(getConf(), "Max temperature");
    temperatureJob.setJarByClass(getClass());

    FileInputFormat.addInputPath(temperatureJob, inputPath);
    FileOutputFormat.setOutputPath(temperatureJob, outputPath);

    // The reducer class is also used as the combiner.
    temperatureJob.setMapperClass(MaxTemperatureMapper.class);
    temperatureJob.setCombinerClass(MaxTemperatureReducer.class);
    temperatureJob.setReducerClass(MaxTemperatureReducer.class);

    temperatureJob.setOutputKeyClass(Text.class);
    temperatureJob.setOutputValueClass(IntWritable.class);

    final boolean succeeded = temperatureJob.waitForCompletion(true);
    if (succeeded) {
        return 0;
    }
    return 1;
}

From source file:co.cask.cdap.data.stream.StreamInputFormatTest.java

License:Apache License

/**
 * Runs a tokenize/aggregate MapReduce job over a stream directory and waits for it to finish.
 *
 * @param inputDir  directory passed to the stream input format as the stream path
 * @param outputDir directory the text output is written to
 * @param startTime start of the time range passed to the input format
 * @param endTime   end of the time range passed to the input format
 * @param splitSize maximum split size passed to the input format
 * @param ttl       TTL passed to the input format
 * @throws Exception if the job cannot be configured or run
 */
private void runMR(File inputDir, File outputDir, long startTime, long endTime, long splitSize, long ttl)
        throws Exception {

    final Job streamJob = Job.getInstance();
    final Configuration jobConf = streamJob.getConfiguration();
    streamJob.setJarByClass(StreamInputFormatTest.class);

    // Input: the stream directory, limited by TTL, time range and split size.
    StreamInputFormat.setTTL(jobConf, ttl);
    StreamInputFormat.setStreamPath(jobConf, inputDir.toURI());
    StreamInputFormat.setTimeRange(jobConf, startTime, endTime);
    StreamInputFormat.setMaxSplitSize(jobConf, splitSize);
    streamJob.setInputFormatClass(TestStreamInputFormat.class);

    // Output: plain text under outputDir.
    final Path outputPath = new Path(outputDir.toURI());
    TextOutputFormat.setOutputPath(streamJob, outputPath);
    streamJob.setOutputFormatClass(TextOutputFormat.class);

    // Processing classes and key/value types.
    streamJob.setMapperClass(TokenizeMapper.class);
    streamJob.setReducerClass(AggregateReducer.class);
    streamJob.setMapOutputValueClass(IntWritable.class);
    streamJob.setOutputKeyClass(Text.class);
    streamJob.setOutputValueClass(LongWritable.class);

    // The completion status is not inspected here.
    streamJob.waitForCompletion(true);
}

From source file:co.cask.cdap.etl.batch.mapreduce.ETLMapReduce.java

License:Apache License

/**
 * Prepares the Hadoop job for one pipeline phase before it is submitted.
 *
 * <p>In order, this method: optionally starts stage logging; instantiates and prepares
 * the single batch source and every sink of the phase; stores the sink outputs and the
 * per-plugin runtime arguments as JSON in the job configuration; registers an output for
 * each error dataset; and sets the mapper. If the phase contains an aggregator, it also
 * sets the reducer and the map output key/value classes; otherwise the job is map-only.
 *
 * @param context the MapReduce context providing the specification, runtime arguments
 *                and the underlying Hadoop job
 * @throws IllegalArgumentException if the aggregator's group key class is not a
 *         {@code WritableComparable} or its group value class is not a {@code Writable}
 *         (after applying any known writable conversion)
 * @throws Exception if any plugin fails to instantiate or prepare
 */
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();

    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();

    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();

    // The phase specification is stored as JSON in the program's properties.
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

    // we checked at configure time that there is exactly one source
    String sourceName = phaseSpec.getPhase().getSources().iterator().next();

    BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
    batchSource = new LoggedBatchConfigurable<>(sourceName, batchSource);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics,
            new DatasetContextLookupProvider(context), sourceName, context.getRuntimeArguments());
    batchSource.prepareRun(sourceContext);
    runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
    finishers.add(batchSource, sourceContext);

    // sink name -> output names and error dataset of that sink
    Map<String, SinkOutput> sinkOutputs = new HashMap<>();

    // Candidate sinks are connector and batch-sink stages; only those listed in
    // phase.getSinks() are actually prepared.
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }

        BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);
        batchSink = new LoggedBatchConfigurable<>(sinkName, batchSink);
        MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), sinkName, context.getRuntimeArguments());
        batchSink.prepareRun(sinkContext);
        runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
        finishers.add(batchSink, sinkContext);

        sinkOutputs.put(sinkName,
                new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    }
    // NOTE(review): the finisher is built here, but the aggregator is added to
    // `finishers` further below; confirm the built finisher still covers it.
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
                    Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }

    job.setMapperClass(ETLMapper.class);
    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    if (!aggregators.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        // Only the first aggregator stage is used.
        String aggregatorName = aggregators.iterator().next().getName();
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
        MapReduceAggregatorContext aggregatorContext = new MapReduceAggregatorContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), aggregatorName, context.getRuntimeArguments());
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);

        // The aggregator may override the number of reduce tasks.
        if (aggregatorContext.getNumPartitions() != null) {
            job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }
        // if the plugin sets the output key and value class directly, trust them
        Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
        Class<?> outputValClass = aggregatorContext.getGroupValueClass();
        // otherwise, derive it from the plugin's parameters
        if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
            outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        // The configuration records the group classes before any writable conversion is applied.
        hConf.set(GROUP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(GROUP_VAL_CLASS, outputValClass.getName());
        // in case the classes are not a WritableComparable, but is some common type we support
        // for example, a String or a StructuredRecord
        WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
        // if the conversion is null, it means the user is using their own object.
        if (writableConversion != null) {
            outputKeyClass = writableConversion.getWritableClass();
        }
        writableConversion = WritableConversions.getConversion(outputValClass.getName());
        if (writableConversion != null) {
            outputValClass = writableConversion.getWritableClass();
        }
        // check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
        if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group key class %s must implement Hadoop's WritableComparable.",
                    aggregatorName, outputKeyClass));
        }
        if (!Writable.class.isAssignableFrom(outputValClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group value class %s must implement Hadoop's Writable.",
                    aggregatorName, outputValClass));
        }

        job.setMapOutputKeyClass(outputKeyClass);
        job.setMapOutputValueClass(outputValClass);
    } else {
        // No aggregator: run as a map-only job.
        job.setNumReduceTasks(0);
    }

    // Only the source and sink runtime arguments are recorded in this map.
    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}

From source file:co.cask.cdap.examples.clicksandviews.ClicksAndViewsMapReduce.java

License:Apache License

/**
 * Wires the clicks and views streams as inputs, the joined partitioned file set as
 * output, and sets the mapper and reducer on the Hadoop job.
 *
 * <p>The output partition key is taken from the runtime arguments; when none is given,
 * a key with a single long field {@code "runtime"} holding the logical start time is used.
 *
 * @param context the MapReduce context to configure
 * @throws Exception if the dataset or job cannot be accessed
 */
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    // Inputs: both streams.
    context.addInput(Input.ofStream(ClicksAndViews.CLICKS));
    context.addInput(Input.ofStream(ClicksAndViews.VIEWS));

    // Output partition: explicit key from the runtime arguments, else the logical start time.
    PartitionedFileSet joinedDataset = context.getDataset(ClicksAndViews.JOINED);
    PartitionKey requestedKey = PartitionedFileSetArguments
            .getOutputPartitionKey(context.getRuntimeArguments(), joinedDataset.getPartitioning());
    PartitionKey partitionKey = requestedKey != null
            ? requestedKey
            : PartitionKey.builder().addLongField("runtime", context.getLogicalStartTime()).build();

    Map<String, String> datasetArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(datasetArgs, partitionKey);
    context.addOutput(Output.ofDataset(ClicksAndViews.JOINED, datasetArgs));

    // Processing classes.
    Job hadoopJob = context.getHadoopJob();
    hadoopJob.setMapperClass(ImpressionKeyingMapper.class);
    hadoopJob.setReducerClass(JoiningReducer.class);
}