Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

This page collects example usages of the org.apache.hadoop.mapreduce Job method setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
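
Before the project-specific examples below, here is a minimal driver sketch. It is not taken from any of the sources listed under Usage; WordCountMapper and WordCountReducer are hypothetical classes assumed to exist. Declaring the map output key/value classes is needed whenever the mapper's intermediate types differ from the job's final output types, and the calls must happen before the job is submitted, otherwise an IllegalStateException is thrown.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputKeyClassSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setMapOutputKeyClass sketch");
        job.setJarByClass(SetMapOutputKeyClassSketch.class);

        job.setMapperClass(WordCountMapper.class);   // hypothetical mapper emitting <Text, IntWritable>
        job.setReducerClass(WordCountReducer.class); // hypothetical reducer writing <Text, LongWritable>

        // Intermediate (map output) types. The value type differs from the final
        // output value type below, so it must be declared explicitly; the key type
        // happens to match and would default to the final key class, but declaring
        // it keeps the intent clear.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Final (reducer output) types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}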

Usage

From source file:com.marklogic.mapreduce.examples.LinkCountValue.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 2) {
        System.err.println("Usage: LinkCountValue configFile outputDir");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf, "link count value");
    job.setJarByClass(LinkCountValue.class);
    job.setInputFormatClass(ValueInputFormat.class);
    job.setMapperClass(RefMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.marklogic.mapreduce.examples.WikiLoader.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 2) {
        System.err.println("Usage: WikiLoader configFile inputDir");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf, "wiki loader");
    job.setJarByClass(WikiLoader.class);
    job.setInputFormatClass(WikiInputFormat.class);
    job.setMapperClass(ArticleMapper.class);
    job.setMapOutputKeyClass(DocumentURI.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(ContentOutputFormat.class);

    ContentInputFormat.setInputPaths(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.marklogic.mapreduce.examples.ZipContentLoader.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ZipContentLoader configFile inputDir");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "zip content loader");
    job.setJarByClass(ZipContentLoader.class);
    job.setInputFormatClass(ZipContentInputFormat.class);
    job.setMapperClass(ZipContentMapper.class);
    job.setMapOutputKeyClass(DocumentURI.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(ContentOutputFormat.class);

    ZipContentInputFormat.setInputPaths(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.marklogic.mapreduce.test.CustomQuery.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 1) {
        System.err.println("Usage: CustomQuery configFile");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf, "custom query");
    job.setJarByClass(CustomQuery.class);

    job.setInputFormatClass(NodeInputFormat.class);
    job.setMapperClass(QueryMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(QueryReducer.class);
    job.setOutputFormatClass(KeyValueOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.metamx.druid.indexer.DeterminePartitionsJob.java

License:Open Source License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}

From source file:com.metamx.druid.indexer.SortableBytes.java

License:Open Source License

public static void useSortableBytesAsMapOutputKey(Job job) {
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setGroupingComparatorClass(SortableBytesGroupingComparator.class);
    job.setSortComparatorClass(SortableBytesSortingComparator.class);
    job.setPartitionerClass(SortableBytesPartitioner.class);
}

From source file:com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features.java

/**
 * @param args
 * @throws IOException 
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    int number_of_classes = 1;
    int number_of_features = 1;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, "NaiveBayesClassifierMapReduce_Continuous_Features");
    job.setJarByClass(NaiveBayesClassifierMapReduce_Continuous_Features.class);

    conf = job.getConfiguration(); // Mandatory: the Job keeps its own copy of the Configuration, so the settings below must go through it.

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(MapArrayWritable.class);

    job.setMapperClass(NaiveBayesClassifierMap_Continuous_Features.class);
    job.setReducerClass(NaiveBayesClassifierReduce_Continuous_Features.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    Path out = new Path(args[1]);
    if (fs.exists(out))
        fs.delete(out, true);
    FileOutputFormat.setOutputPath(job, out);
    number_of_classes = Integer.parseInt(args[2]);
    number_of_features = Integer.parseInt(args[3]);
    conf.setInt("number_of_classes", number_of_classes);
    conf.setInt("number_of_features", number_of_features);

    try {
        job.waitForCompletion(true);

    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:com.mongodb.hadoop.util.MongoTool.java

License:Apache License

private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here
     * They override any XML level values
     * Note that the space after -D is important; without it the property is
     * picked up by Java itself and will not work.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}

From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java

License:Apache License

/**
 * Configures the MapReduce mapper for the job.
 *
 * @param job The Hadoop MR job.
 * @throws IOException If there is an error.
 */
protected void configureMapper(Job job) throws IOException {
    // Set the map class in the job configuration.
    final FijiMapper<?, ?, ?, ?> mapper = getMapper();
    if (null == mapper) {
        throw new JobConfigurationException("Must specify a mapper");
    }
    if (mapper instanceof Configurable) {
        ((Configurable) mapper).setConf(job.getConfiguration());
    }
    job.setMapperClass(((Mapper<?, ?, ?, ?>) mapper).getClass());

    // Set the map output key and map output value types in the job configuration.
    job.setMapOutputKeyClass(mapper.getOutputKeyClass());
    job.setMapOutputValueClass(mapper.getOutputValueClass());

    configureAvro(job, mapper);
    configureHTableInput(job, mapper);
}

From source file:com.mozilla.hadoop.hbase.mapreduce.MultiScanTableMapReduceUtil.java

License:Apache License

/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 * 
 * @param table
 *            The table name to read from.
 * @param scans
 *            The scan instances with the columns, time range etc.
 * @param mapper
 *            The mapper class to use.
 * @param outputKeyClass
 *            The class of the output key.
 * @param outputValueClass
 *            The class of the output value.
 * @param job
 *            The current job to adjust.
 * @throws IOException
 *             When setting up the details fails.
 */
@SuppressWarnings("rawtypes")
public static void initMultiScanTableMapperJob(final String table, final Scan[] scans,
        final Class<? extends TableMapper> mapper, final Class<? extends WritableComparable> outputKeyClass,
        final Class<? extends Writable> outputValueClass, final Job job) throws IOException {

    job.setInputFormatClass(MultiScanTableInputFormat.class);
    if (outputValueClass != null) {
        job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
        job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);
    job.getConfiguration().set(MultiScanTableInputFormat.INPUT_TABLE, table);
    job.getConfiguration().set(MultiScanTableInputFormat.SCANS, convertScanArrayToString(scans));
}