Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find usage examples for org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
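
The combiner is an optional, reducer-like class that pre-aggregates map output locally before it is shuffled to the reducers, reducing the data transferred between map and reduce. Because it both consumes and produces map output, its input and output key/value types must match the map output key/value types, and it must be set before the job is submitted (afterwards the call throws IllegalStateException). Below is a minimal sketch of wiring a combiner into a word-count style job; it assumes a TokenizerMapper and IntSumReducer equivalent to the ones in Hadoop's stock WordCount example are on the classpath (they are not defined on this page).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombinerSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count with combiner");
        job.setJarByClass(CombinerSketch.class);
        job.setMapperClass(TokenizerMapper.class); // emits (Text, IntWritable)
        // The combiner runs on map output; its input and output types must
        // both match the map output types (Text/IntWritable here).
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}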

Usage

From source file: com.marklogic.mapreduce.examples.LinkCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 2) {
        System.err.println("Usage: LinkCount configFile outputDir");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf, "link count");
    job.setJarByClass(LinkCount.class);
    job.setInputFormatClass(ValueInputFormat.class);
    job.setMapperClass(RefMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
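
LinkCount above, and several of the jobs below, pass the same class to setCombinerClass and setReducerClass. That is safe because integer summation is associative and commutative, so pre-summing partial counts on the map side and summing again in the reduce phase gives the same totals. The examples do not show the class body; the following sketch assumes it matches Hadoop's stock IntSumReducer.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch of the IntSumReducer these examples plug into setCombinerClass.
// Summation can be applied to partial map output (as a combiner) and again
// in the reduce phase without changing the final counts.
public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}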

From source file: com.marklogic.mapreduce.examples.LinkCountCooccurrences.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 2) {
        System.err.println("Usage: LinkCountCooccurrences configFile outputDir");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf, "link count cooccurrences");
    job.setJarByClass(LinkCountCooccurrences.class);
    job.setInputFormatClass(KeyValueInputFormat.class);
    job.setMapperClass(RefMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_KEY_CLASS, Text.class, Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);
    conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS, HrefTitleMap.class,
            ElemAttrValueCooccurrences.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.marklogic.mapreduce.examples.LinkCountValue.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    if (args.length < 2) {
        System.err.println("Usage: LinkCountValue configFile outputDir");
        System.exit(2);
    }
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf, "link count value");
    job.setJarByClass(LinkCountValue.class);
    job.setInputFormatClass(ValueInputFormat.class);
    job.setMapperClass(RefMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: com.mb.saas.bi.job.WordCountJob.java

License: Apache License

public static boolean runHadoopMapReduceJob() throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");

    File jarFile = UploadResource.createTempJar("bin");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();

    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    boolean isMapReduceJarSetted = false;
    String hadoopMapReduceJar = "F:/henry_projects/mbHiveAnalyzer/t.jar";
    File file = new File(hadoopMapReduceJar);
    if (file.exists()) {
        ((JobConf) job.getConfiguration()).setJar(hadoopMapReduceJar);
        isMapReduceJarSetted = true;
    }

    if (!isMapReduceJarSetted && jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path("/input/wordcount.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
    return true;
}

From source file: com.mb.saas.bi.job.WordCountJob.java

License: Apache License

public static void main(String[] args) throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");

    File jarFile = UploadResource.createTempJar("bin");
    System.setProperty("hadoop.home.dir", "F:/hadoop");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();
    //   conf.set("fs.defaultFS", "hdfs://slave1:4001");
    //   conf.set("mapreduce.framework.name", "yarn");
    //   conf.set("yarn.resourcemanager.address", "master:8032");
    //   conf.set("yarn.resourcemanager.scheduler.address", "master:8030");

    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

    conf.set("mapred.remote.os", "Linux");
    System.out.println(conf.get("mapred.remote.os"));

    //   conf.set("mapreduce.job.reduces", "2");
    //   conf.set("mapreduce.tasktracker.map.tasks.maximum", "8");
    //   conf.set("mapreduce.input.fileinputformat.split.maxsize","123");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    //    job.setMaxMapAttempts(2);
    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt"));
    //    FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001002"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
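
Both WordCountJob variants register a TokenizerMapper whose body is not shown on this page. The sketch below assumes it behaves like the mapper in Hadoop's standard WordCount example, emitting (word, 1) pairs so that IntSumReducer can act as both combiner and reducer.

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch of a TokenizerMapper compatible with the word-count jobs above:
// for each token it emits (word, 1), which the combiner pre-sums on the map side.
public class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, ONE);
        }
    }
}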

From source file: com.metamx.druid.indexer.DeterminePartitionsJob.java

License: Open Source License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */

        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
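
The group-by stage above reuses DeterminePartitionsGroupByReducer as its own combiner. That pattern works whenever the reducer's output types match its input types and the operation can be applied repeatedly without changing the result; here the job only needs the distinct (timestamp, dimensions) keys. The sketch below shows that generic shape; it is not Druid's actual implementation.

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

// Generic sketch of a reducer that can double as its own combiner: it collapses
// duplicate keys, so running it on partial map output (as a combiner) and again
// in the reduce phase yields the same set of distinct keys.
public class GroupByDedupReducer
        extends Reducer<BytesWritable, NullWritable, BytesWritable, NullWritable> {

    @Override
    public void reduce(BytesWritable key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(key, NullWritable.get());
    }
}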

From source file: com.michaeljones.hellohadoopworldmaven.HelloMapReduce.java

public static Job RunJobAsync(Path inputPath, Path outputPath, Configuration conf) throws Exception {
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(HelloMapReduce.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}

From source file: com.michaeljones.hellohadoopworldmaven.HelloMapReduce.java

public static Job RunJobAnalysisAsync(Path inputPath, Path outputPath, Configuration conf) throws Exception {
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(HelloMapReduce.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumCombinerAnalyser.class);
    job.setReducerClass(IntSumReducerAnalyser.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}
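
Unlike the other examples, these two helpers only configure the job, combiner included, and hand it back without submitting it. A caller might drive them as in the sketch below; HelloMapReduceDriver and the path arguments are illustrative and not part of the original source.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class HelloMapReduceDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // RunJobAsync configures mapper, combiner, reducer and paths but does not submit.
        Job job = HelloMapReduce.RunJobAsync(new Path(args[0]), new Path(args[1]), conf);
        job.submit(); // non-blocking; map tasks apply the combiner to their output
        // ... other work could happen here while the job runs ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}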

From source file: com.mongodb.hadoop.examples.snmp.SnmpStatisticWithCombiner.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    final com.mongodb.MongoURI outputUri = MongoConfigUtil.getOutputURI(conf);
    if (outputUri == null)
        throw new IllegalStateException("output uri is not set");
    if (MongoConfigUtil.getInputURI(conf) == null)
        throw new IllegalStateException("input uri is not set");
    final String outputCollectionName = outputUri.getCollection();
    final Job job = new Job(conf, "snmp analysis " + outputCollectionName);
    job.setJarByClass(SnmpStatisticWithCombiner.class);
    job.setMapperClass(MapHostUploadOnEachAPPerDay.class);
    job.setCombinerClass(CombineHostUploadOnEachAPPerDay.class);
    job.setReducerClass(ReduceHostUploadOnEachAPPerDay.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);
    try {
        boolean result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.out.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
    return 0;
}

From source file: com.mongodb.hadoop.examples.snmp.SnmpStatisticWithCombiner.java

License: Apache License

public static void main(String[] args) throws Exception {
    boolean use_shards = true;
    boolean use_chunks = false;
    final Configuration Conf = new Configuration();
    MongoConfigUtil.setInputURI(Conf, "mongodb://localhost:30000/test.snmp");
    Conf.setBoolean(MongoConfigUtil.SPLITS_USE_SHARDS, use_shards);
    Conf.setBoolean(MongoConfigUtil.SPLITS_USE_CHUNKS, use_chunks);
    String output_table = null;
    if (use_chunks) {
        if (use_shards)
            output_table = "snmp_with_shards_and_chunks";
        else
            output_table = "snmp_with_chunks";
    } else {
        if (use_shards)
            output_table = "snmpWithShards";
        else
            output_table = "snmp_no_splits";
    }
    MongoConfigUtil.setOutputURI(Conf, "mongodb://localhost:30000/test." + output_table);
    final Job snmpJob = new Job(Conf, "snmp analysis " + output_table);
    snmpJob.setJarByClass(SnmpStatisticWithCombiner.class);
    snmpJob.setMapperClass(MapHostUploadOnEachAPPerDay.class);
    snmpJob.setCombinerClass(CombineHostUploadOnEachAPPerDay.class);
    snmpJob.setReducerClass(ReduceHostUploadOnEachAPPerDay.class);
    snmpJob.setOutputKeyClass(Text.class);
    snmpJob.setOutputValueClass(LongWritable.class);
    snmpJob.setInputFormatClass(MongoInputFormat.class);
    snmpJob.setOutputFormatClass(MongoOutputFormat.class);
    try {
        boolean result = snmpJob.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.out.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
}
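
CombineHostUploadOnEachAPPerDay is not listed on this page. Given the Text/LongWritable output types registered above, a combiner of the following shape would fit; this is a hypothetical sketch that assumes it pre-sums per-key upload totals the same way the reducer does.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical sketch of a combiner shaped like CombineHostUploadOnEachAPPerDay:
// it pre-aggregates per-key long totals on the map side so less data is shuffled
// to ReduceHostUploadOnEachAPPerDay.
public class LongSumCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    private final LongWritable total = new LongWritable();

    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}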