Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job#setCombinerClass, drawn from open-source projects.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Documentation

Set the combiner class for the job.
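Because the combiner runs on each mapper's output before the shuffle, its input key/value types and its output key/value types must all match the map output types. A minimal driver sketch that reuses a summing reducer as the combiner (all class names here are hypothetical, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

// MyMapper emits <Text, LongWritable>; MySumReducer consumes and emits the
// same pair, which is what makes it usable as both combiner and reducer.
Job job = Job.getInstance(new Configuration(), "combiner-sketch");
job.setJarByClass(MyDriver.class);        // hypothetical driver class
job.setMapperClass(MyMapper.class);
job.setCombinerClass(MySumReducer.class); // local pre-aggregation per map task
job.setReducerClass(MySumReducer.class);  // final aggregation after the shuffle
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);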

Usage

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.SimpleTextSearch.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf); // pass the parsed conf so generic options take effect
    job.setJarByClass(SimpleTextSearch.class);

    job.setJobName(SimpleTextSearch.class.getName());

    // mapper
    job.setMapperClass(TextSearchMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    // regex with a phrase to be searched for
    String regex = otherArgs[2];
    job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}
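Note that TextLongCountingReducer is registered as both combiner and reducer. This is the common pattern for counting jobs: summing LongWritable values is commutative and associative, and the reducer consumes and emits the same (Text, LongWritable) pair, so partially combined map output is still valid reducer input.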

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.WordCounterExample.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf); // pass the parsed conf so generic options take effect
    job.setJarByClass(WordCounterExample.class);

    job.setJobName(WordCounterExample.class.getName());

    // mapper
    job.setMapperClass(WordCounterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.TopDomainCounter.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance(conf); // pass the parsed conf so generic options take effect
    job.setJarByClass(TopDomainCounter.class);

    job.setJobName(TopDomainCounter.class.getName());

    // mapper
    job.setMapperClass(DomainMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.WARCRecordCounter.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    System.out.println("Other args: " + Arrays.toString(otherArgs));

    Job job = Job.getInstance(conf);
    job.setJarByClass(WARCRecordCounter.class);

    job.setJobName(WARCRecordCounter.class.getName());

    // mapper
    job.setMapperClass(ResponseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // combiner + reducer
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:distributed.hadoop.MapReduceJobConfig.java

License: Open Source License

/**
 * Apply the settings encapsulated in this config and return a Job object
 * ready for execution.
 * 
 * @param jobName the name of the job
 * @param conf the Configuration object that will be wrapped in the Job
 * @param env environment variables
 * @return a configured Job object
 * @throws IOException if a problem occurs
 * @throws ClassNotFoundException if various classes are not found
 */
public Job configureForHadoop(String jobName, Configuration conf, Environment env)
        throws IOException, ClassNotFoundException {

    String jobTrackerPort = getJobTrackerPort();
    if (DistributedJobConfig.isEmpty(jobTrackerPort)) {
        jobTrackerPort = AbstractHadoopJobConfig.isHadoop2() ? AbstractHadoopJobConfig.DEFAULT_PORT_YARN
                : AbstractHadoopJobConfig.DEFAULT_PORT;
    }
    String jobTracker = getJobTrackerHost() + ":" + jobTrackerPort;
    // check the host itself - the concatenated host:port string is never empty
    if (DistributedJobConfig.isEmpty(getJobTrackerHost())) {
        System.err.println("No " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager " : "JobTracker ")
                + "set - running locally...");
    } else {
        jobTracker = environmentSubstitute(jobTracker, env);
        if (AbstractHadoopJobConfig.isHadoop2()) {
            conf.set(YARN_RESOURCE_MANAGER_ADDRESS, jobTracker);
            conf.set(YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS,
                    environmentSubstitute(getJobTrackerHost(), env) + ":8030");
        } else {
            conf.set(HADOOP_JOB_TRACKER_HOST, jobTracker);
        }
    }
    System.err.println("Using " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager: " : "jobtracker: ")
            + jobTracker);

    if (AbstractHadoopJobConfig.isHadoop2()) {
        // a few other properties needed to run against Yarn
        conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");
        conf.set("mapreduce.framework.name", "yarn");
    }

    if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) {
        conf.set(AbstractHadoopJobConfig.isHadoop2() ? HADOOP2_MAPRED_MAX_SPLIT_SIZE
                : HADOOP_MAPRED_MAX_SPLIT_SIZE, getMapredMaxSplitSize());
    }

    // Do any user supplied properties here before creating the Job
    for (Map.Entry<String, String> e : m_additionalUserSuppliedProperties.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    m_hdfsConfig.configureForHadoop(conf, env);
    Job job = new Job(conf, jobName);

    String numMappers = getNumberOfMaps();
    if (!DistributedJobConfig.isEmpty(numMappers)) {
        numMappers = environmentSubstitute(numMappers, env);
        ((JobConf) job.getConfiguration()).setNumMapTasks(Integer.parseInt(numMappers));
    }

    // The number of map tasks that will be run simultaneously by a task tracker
    String maxConcurrentMapTasks = getTaskTrackerMapTasksMaximum();
    if (!DistributedJobConfig.isEmpty(maxConcurrentMapTasks)) {
        ((JobConf) job.getConfiguration()).set("mapred.tasktracker.map.tasks.maximum", maxConcurrentMapTasks);
    }

    String numReducers = getNumberOfReducers();
    if (!DistributedJobConfig.isEmpty(numReducers)) {
        numReducers = environmentSubstitute(numReducers, env);
        job.setNumReduceTasks(Integer.parseInt(numReducers));

        if (Integer.parseInt(numReducers) == 0) {
            System.err.println("Warning - 0 reducers requested. Configuring for a map-only job");
        }
    } else {
        job.setNumReduceTasks(1);
    }
    String mapperClass = getMapperClass();
    if (DistributedJobConfig.isEmpty(mapperClass)) {
        throw new IOException("No mapper class specified!");
    }
    mapperClass = environmentSubstitute(mapperClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends Mapper> mc = (Class<? extends Mapper>) Class.forName(mapperClass);

    job.setMapperClass(mc);

    String reducerClass = getReducerClass();
    // use the count already set on the job; numReducers may be an empty string here
    if (DistributedJobConfig.isEmpty(reducerClass) && job.getNumReduceTasks() > 0) {
        throw new IOException("No reducer class specified!");
    } else if (job.getNumReduceTasks() > 0) {
        reducerClass = environmentSubstitute(reducerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> rc = (Class<? extends Reducer>) Class.forName(reducerClass);

        job.setReducerClass(rc);
    }

    String combinerClass = getCombinerClass();
    if (!DistributedJobConfig.isEmpty(combinerClass)) {
        combinerClass = environmentSubstitute(combinerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> cc = (Class<? extends Reducer>) Class.forName(combinerClass);

        job.setCombinerClass(cc);
    }

    String inputFormatClass = getInputFormatClass();
    if (DistributedJobConfig.isEmpty(inputFormatClass)) {
        throw new IOException("No input format class specified");
    }
    inputFormatClass = environmentSubstitute(inputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends InputFormat> ifc = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

    job.setInputFormatClass(ifc);

    String outputFormatClass = getOutputFormatClass();
    if (DistributedJobConfig.isEmpty(outputFormatClass)) {
        throw new IOException("No output format class specified");
    }
    outputFormatClass = environmentSubstitute(outputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends OutputFormat> ofc = (Class<? extends OutputFormat>) Class.forName(outputFormatClass);
    job.setOutputFormatClass(ofc);

    String mapOutputKeyClass = getMapOutputKeyClass();
    if (DistributedJobConfig.isEmpty(mapOutputKeyClass)) {
        throw new IOException("No map output key class defined");
    }
    mapOutputKeyClass = environmentSubstitute(mapOutputKeyClass, env);
    Class mokc = Class.forName(mapOutputKeyClass);
    job.setMapOutputKeyClass(mokc);

    String mapOutputValueClass = getMapOutputValueClass();
    if (DistributedJobConfig.isEmpty(mapOutputValueClass)) {
        throw new IOException("No map output value class defined");
    }
    mapOutputValueClass = environmentSubstitute(mapOutputValueClass, env);
    Class movc = Class.forName(mapOutputValueClass);
    job.setMapOutputValueClass(movc);

    String outputKeyClass = getOutputKeyClass();
    if (DistributedJobConfig.isEmpty(outputKeyClass)) {
        throw new IOException("No output key class defined");
    }
    outputKeyClass = environmentSubstitute(outputKeyClass, env);
    Class okc = Class.forName(outputKeyClass);
    job.setOutputKeyClass(okc);

    String outputValueClass = getOutputValueClass();
    if (DistributedJobConfig.isEmpty(outputValueClass)) {
        throw new IOException("No output value class defined");
    }
    outputValueClass = environmentSubstitute(outputValueClass, env);
    Class ovc = Class.forName(outputValueClass);
    job.setOutputValueClass(ovc);

    String inputPaths = getInputPaths();
    // don't complain if there aren't any as inputs such as HBASE
    // require other properties to be set
    if (!DistributedJobConfig.isEmpty(inputPaths)) {
        inputPaths = environmentSubstitute(inputPaths, env);
        FileInputFormat.setInputPaths(job, inputPaths);
    }

    String outputPath = getOutputPath();
    if (DistributedJobConfig.isEmpty(outputPath)) {
        throw new IOException("No output path specified");
    }
    outputPath = environmentSubstitute(outputPath, env);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}
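Worth noting: the combiner is the only processing class this configuration treats as optional. A missing mapper, input format, or output format raises an IOException, while setCombinerClass is simply skipped when no combiner name was supplied - consistent with the combiner being a shuffle-size optimization rather than a correctness requirement.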

From source file:dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java

License: Apache License

public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    Job job = new Job(configuration, "ARC Header Extractor");
    job.setJarByClass(ARCHeaderExtractorMR.class);

    job.setMapperClass(ARCHeaderExtractorMapper.class);
    job.setCombinerClass(ARCHeaderExtractorReducer.class);
    job.setReducerClass(ARCHeaderExtractorReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    int n = args.length;
    if (n != 2) { // catches both too few (including exactly one) and too many arguments
        System.err.println(
                "Wrong number of arguments: input dir and output dir are mandatory, but " + n + " were supplied.");
        System.exit(1); // non-zero exit code signals failure
    }

    // the path helpers are static members of FileInputFormat/FileOutputFormat;
    // using them directly avoids implying a sequence-file format for this text job
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:drdoobs.AggregateJob.java

public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getSimpleName());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(ProjectionMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
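This is the shortest pattern on this page: ProjectionMapper's (Text, LongWritable) output is summed by what appears to be Hadoop's stock LongSumReducer (org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer), so no custom combiner or reducer class is needed at all.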

From source file:dz.lab.mapred.counter.StartsWithCountJob_PrintCounters.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // the following property will enable mapreduce to use its packaged local job runner
    //conf.set("mapreduce.framework.name", "local");

    Job job = Job.getInstance(conf, "StartsWithCountJob");
    job.setJarByClass(getClass());

    // configure output and input source
    TextInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(TextInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(StartsWithCountMapper.class);
    job.setCombinerClass(StartsWithCountReducer.class);
    job.setReducerClass(StartsWithCountReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    int resultCode = job.waitForCompletion(true) ? 0 : 1;
    System.out.println("Job is complete! Printing Counters:");
    Counters counters = job.getCounters();

    for (String groupName : counters.getGroupNames()) {
        CounterGroup group = counters.getGroup(groupName);
        System.out.println(group.getDisplayName());

        for (Counter counter : group.getUnderlyingGroup()) {
            System.out.println(" " + counter.getDisplayName() + "=" + counter.getValue());
        }
    }
    return resultCode;
}
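The loop above walks every counter group, which includes the framework's built-in file-system, job, and task counters. User code can contribute its own groups by incrementing enum-based counters from a mapper or reducer; a minimal, hypothetical sketch (the enum and mapper names are assumed, not from this project):

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class RecordCountingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Each enum becomes its own counter group in the printout produced above.
    enum Custom { RECORDS_SEEN, EMPTY_LINES }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.getCounter(Custom.RECORDS_SEEN).increment(1);
        String line = value.toString();
        if (line.isEmpty()) {
            context.getCounter(Custom.EMPTY_LINES).increment(1);
            return;
        }
        // count lines by their first character, mirroring the StartsWithCount job
        context.write(new Text(line.substring(0, 1)), new IntWritable(1));
    }
}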

From source file:dz.lab.mapred.exclude.StartsWithCountJob_DistCacheAPI.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // the following property will enable mapreduce to use its packaged local job runner
    //conf.set("mapreduce.framework.name", "local");

    Job job = Job.getInstance(conf, "StartsWithCountJob");
    job.setJarByClass(getClass());

    // configure output and input source
    TextInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(TextInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(StartsWithCountMapper.class);
    job.setCombinerClass(StartsWithCountReducer.class);
    job.setReducerClass(StartsWithCountReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path toCache = new Path("/training/data/startWithExcludeFile.txt");
    // add file to cache
    job.addCacheFile(toCache.toUri());
    // create symbolic links for all files in DistributedCache; without the links you would have to use fully qualified path
    job.createSymlink();

    return job.waitForCompletion(true) ? 0 : 1;
}
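One caveat on the last two calls: Job.createSymlink() is deprecated in Hadoop 2, where symlinks for cached files are created automatically. On a current cluster the addCacheFile(toCache.toUri()) call alone should suffice, and tasks can then open the file by its base name (startWithExcludeFile.txt) from their working directory.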

From source file:dz.lab.mapred.hbase.custom_input.StartsWithCountJob_HBaseInput.java

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf(), "StartsWithCount-FromHBase");
    job.setJarByClass(getClass());

    // set HBase InputFormat
    job.setInputFormatClass(TableInputFormat.class);
    // new mapper to handle data from HBase
    job.setMapperClass(StartsWithCountMapper_HBase.class);

    // add hbase configuration
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    TableMapReduceUtil.addDependencyJars(job);

    // specify table and column to read from
    conf.set(TableInputFormat.INPUT_TABLE, TABLE_NAME);
    conf.set(TableInputFormat.SCAN_COLUMNS, "count:word");

    // configure combiner and reducer (the mapper was set above)
    job.setCombinerClass(StartsWithCountReducer.class);
    job.setReducerClass(StartsWithCountReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, new Path(args[0]));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
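Even with an HBase table as the input source, the combiner contract is unchanged: it runs on the mapper's (Text, IntWritable) output during the map-side merge, which is why the same StartsWithCountReducer used in the file-based variants above works here without modification.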