Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
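The map output key class only needs to be set when the mapper's intermediate key type differs from the final output key class configured with setOutputKeyClass; otherwise the framework assumes they are the same. As the prototype indicates, the call throws IllegalStateException if the job is no longer in a state that allows configuration changes. Below is a minimal, self-contained sketch (the class names WordCountDriver, TokenMapper and SumReducer are illustrative and do not come from the examples that follow) in which the map output value type (IntWritable) differs from the final output value type (LongWritable), so both the intermediate and the final key/value classes are declared explicitly.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountDriver {

    // Mapper emits Text/IntWritable pairs.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    // Reducer emits Text/LongWritable pairs, so the final output value type
    // differs from the map output value type.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountDriver.class);

        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // Intermediate (map output) types: set explicitly because they differ
        // from the final output types declared below.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reduce output) types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}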

Usage

From source file: cn.edu.hfut.dmic.webcollector.crawldb.Merge.java

public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName)
        throws Exception {

    Job job = new Job(conf);
    job.setJobName(jobName + "  " + crawlPath.toString());
    job.setJarByClass(Merge.class);
    // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path crawldbPath = new Path(crawlPath, "crawldb");
    Path newdb = new Path(crawldbPath, "new");
    Path currentdb = new Path(crawldbPath, "current");

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }

    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }
    for (Path mergePath : mergePaths) {
        FileInputFormat.addInputPath(job, mergePath);
    }
    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.waitForCompletion(true);

}

From source file: cn.edu.hfut.dmic.webcollector.fetcher.Fetcher.java

public static void fetch(Path crawlPath, String segmentName, Configuration conf) throws Exception {
    Path segmentPath = new Path(crawlPath, "segments/" + segmentName);
    Path generatePath = new Path(segmentPath, "generate");

    Job job = new Job(conf);
    job.setJobName("fetch " + crawlPath.toString());
    job.setJarByClass(Fetcher.class);

    job.setReducerClass(FetcherReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(FetcherOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    FileInputFormat.addInputPath(job, generatePath);
    FileOutputFormat.setOutputPath(job, segmentPath);

    job.waitForCompletion(true);
}

From source file: cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java

public static Job createJob(Configuration conf, Path crawldb) throws IOException {

    Job job = new Job(conf);
    //job.setJarByClass(Merge.class);
    job.getConfiguration().set("mapred",
            "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar");
    Path newdb = new Path(crawldb, "new");
    Path currentdb = new Path(crawldb, "current");

    FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create());
    if (fs.exists(currentdb)) {
        FileInputFormat.addInputPath(job, currentdb);
    }

    if (fs.exists(newdb)) {
        fs.delete(newdb);
    }

    FileOutputFormat.setOutputPath(job, newdb);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.setMapperClass(MergeMap.class);
    job.setReducerClass(MergeReduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job;
}

From source file: cn.itcast.hadoop.mr.wordcount.DBCountPageView.java

License: Apache License

@Override
//Usage DBCountPageView [driverClass dburl]
public int run(String[] args) throws Exception {

    // default MySQL JDBC driver class and connection URL
    String driverClassName = DRIVER_CLASS;
    String url = DB_URL;

    // allow the driver class and URL to be overridden from the command line
    if (args.length > 1) {
        driverClassName = args[0];
        url = args[1];
    }

    // initialize the database with the given driver class name and URL
    initialize(driverClassName, url);

    // get the Hadoop configuration
    Configuration conf = getConf();

    // store the JDBC connection settings in the job configuration
    DBConfiguration.configureDB(conf, driverClassName, url);

    // create the job
    Job job = Job.getInstance(conf);

    // set the job name
    job.setJobName("Count Pageviews of URLs");

    // set the jar containing the job classes
    job.setJarByClass(DBCountPageView.class);

    // mapper
    job.setMapperClass(PageviewMapper.class);

    // combiner
    job.setCombinerClass(LongSumReducer.class);

    // reducer
    job.setReducerClass(PageviewReducer.class);

    // configure the DB input:
    //   setInput(Job job, Class<? extends DBWritable> inputClass, String tableName, String conditions, String orderBy, String... fieldNames)
    DBInputFormat.setInput(job, AccessRecord.class, "HAccess", null, "url", AccessFieldNames);

    // configure the DB output table and field names
    DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

    // map output key class
    job.setMapOutputKeyClass(Text.class);

    // map output value class
    job.setMapOutputValueClass(LongWritable.class);

    // reduce output key class
    job.setOutputKeyClass(PageviewRecord.class);

    // reduce output value class
    job.setOutputValueClass(NullWritable.class);

    int ret; // run the job and verify the result
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;

        boolean correct = verify();
        if (!correct) {
            throw new RuntimeException("Evaluation was not correct!");
        }
    } finally {
        shutdown();
    }
    return ret;
}

From source file: cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java

License: Apache License

/** Create and setup a job */
@SuppressWarnings("deprecation")
private static Job createJob(String name, Configuration conf) throws IOException {
    final Job job = new Job(conf, NAME + "_" + name);
    final Configuration jobconf = job.getConfiguration();
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // setup mapper
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // setup reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // setup input
    job.setInputFormatClass(BbpInputFormat.class);

    // disable task timeout
    jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // do not use speculative execution
    jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    return job;
}

From source file: co.cask.cdap.etl.batch.mapreduce.ETLMapReduce.java

License: Apache License

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();

    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();

    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();

    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

    // we checked at configure time that there is exactly one source
    String sourceName = phaseSpec.getPhase().getSources().iterator().next();

    BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
    batchSource = new LoggedBatchConfigurable<>(sourceName, batchSource);
    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics,
            new DatasetContextLookupProvider(context), sourceName, context.getRuntimeArguments());
    batchSource.prepareRun(sourceContext);
    runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
    finishers.add(batchSource, sourceContext);

    Map<String, SinkOutput> sinkOutputs = new HashMap<>();

    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }

        BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);
        batchSink = new LoggedBatchConfigurable<>(sinkName, batchSink);
        MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), sinkName, context.getRuntimeArguments());
        batchSink.prepareRun(sinkContext);
        runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
        finishers.add(batchSink, sinkContext);

        sinkOutputs.put(sinkName,
                new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    }
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
            phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
                    Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }

    job.setMapperClass(ETLMapper.class);
    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    if (!aggregators.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        String aggregatorName = aggregators.iterator().next().getName();
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
        MapReduceAggregatorContext aggregatorContext = new MapReduceAggregatorContext(context, mrMetrics,
                new DatasetContextLookupProvider(context), aggregatorName, context.getRuntimeArguments());
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);

        if (aggregatorContext.getNumPartitions() != null) {
            job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }
        // if the plugin sets the output key and value class directly, trust them
        Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
        Class<?> outputValClass = aggregatorContext.getGroupValueClass();
        // otherwise, derive it from the plugin's parameters
        if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
            outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        hConf.set(GROUP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(GROUP_VAL_CLASS, outputValClass.getName());
        // in case the classes are not WritableComparable/Writable but are some common type we support,
        // for example a String or a StructuredRecord
        WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
        // if the conversion is null, it means the user is using their own object.
        if (writableConversion != null) {
            outputKeyClass = writableConversion.getWritableClass();
        }
        writableConversion = WritableConversions.getConversion(outputValClass.getName());
        if (writableConversion != null) {
            outputValClass = writableConversion.getWritableClass();
        }
        // check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
        if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group key class %s must implement Hadoop's WritableComparable.",
                    aggregatorName, outputKeyClass));
        }
        if (!Writable.class.isAssignableFrom(outputValClass)) {
            throw new IllegalArgumentException(String.format(
                    "Invalid aggregator %s. The group value class %s must implement Hadoop's Writable.",
                    aggregatorName, outputValClass));
        }

        job.setMapOutputKeyClass(outputKeyClass);
        job.setMapOutputValueClass(outputValClass);
    } else {
        job.setNumReduceTasks(0);
    }

    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}

From source file: co.cask.cdap.examples.streamconversion.StreamConversionMapReduce.java

License: Apache License

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(StreamConversionMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, SCHEMA);

    // read 5 minutes of events from the stream, ending at the logical start time of this run
    long logicalTime = context.getLogicalStartTime();
    context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));

    // each run writes its output to a partition with the logical start time.
    TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
    context.addOutput(Output.ofDataset("converted", dsArguments));

    TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
    LOG.info("Output location for new partition is: {}",
            partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
}

From source file: co.cask.cdap.internal.app.runtime.batch.AggregateMetricsByTag.java

License: Apache License

static void configureJob(Job job) throws IOException {
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setReducerClass(Reduce.class);
}

From source file: co.cask.cdap.internal.app.runtime.batch.MapReduceRuntimeService.java

License: Apache License

/**
 * Sets the map output key and value classes in the job configuration by inspecting the {@link Mapper}
 * if it is not set by the user.
 *
 * @param job the MapReduce job
 * @param mapperTypeToken TypeToken of a configured mapper (may not be configured on the job). Has already been
 *                        resolved from the job's mapper class.
 */
private void setMapOutputClassesIfNeeded(Job job, @Nullable TypeToken<?> mapperTypeToken) {
    Configuration conf = job.getConfiguration();

    TypeToken<?> type = mapperTypeToken;
    int keyIdx = 2;
    int valueIdx = 3;

    if (type == null) {
        // Reducer only job. Use the Reducer input types as the key/value classes.
        type = resolveClass(conf, MRJobConfig.REDUCE_CLASS_ATTR, Reducer.class);
        keyIdx = 0;
        valueIdx = 1;
    }

    // If not able to detect type, nothing to set.
    if (type == null || !(type.getType() instanceof ParameterizedType)) {
        return;
    }

    Type[] typeArgs = ((ParameterizedType) type.getType()).getActualTypeArguments();

    // Set it only if the user didn't set it in beforeSubmit
    // The key and value type are in the 3rd and 4th type parameters
    if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS)) {
        Class<?> cls = TypeToken.of(typeArgs[keyIdx]).getRawType();
        LOG.debug("Set map output key class to {}", cls);
        job.setMapOutputKeyClass(cls);
    }
    if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS)) {
        Class<?> cls = TypeToken.of(typeArgs[valueIdx]).getRawType();
        LOG.debug("Set map output value class to {}", cls);
        job.setMapOutputValueClass(cls);
    }
}

From source file: co.nubetech.hiho.dedup.DedupJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }
    Job job = new Job(conf);
    job.setJobName("Dedup job");
    job.setJarByClass(DedupJob.class);

    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);

    if (dedupBy.equals("key")) {
        job.setMapperClass(DedupKeyMapper.class);
        job.setReducerClass(DedupKeyReducer.class);
        job.setMapOutputValueClass(inputValueClass);
    } else if (dedupBy.equals("value")) {
        job.setMapperClass(DedupValueMapper.class);
        job.setReducerClass(DedupValueReducer.class);
        job.setMapOutputValueClass(inputKeyClass);
    }

    job.setInputFormatClass(inputFormatClass);
    if (inputFormat.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
        DelimitedTextInputFormat.setProperties(job, delimiter, column);
    }

    job.setMapOutputKeyClass(HihoTuple.class);

    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);
    job.setPartitionerClass(HihoHashPartitioner.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsRead = counters.findCounter(DedupRecordCounter.TOTAL_RECORDS_READ).getValue();
            badRecords = counters.findCounter(DedupRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(DedupRecordCounter.OUTPUT).getValue();
            duplicateRecords = totalRecordsRead - output;
            logger.info("Total records read are: " + totalRecordsRead);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
            logger.info("Duplicate records are: " + duplicateRecords);
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}