Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException

Source Link

Document

Set the combiner class for the job.

Usage

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.SimpleTextSearch.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(SimpleTextSearch.class);

    job.setJobName(SimpleTextSearch.class.getName());

    // mapper/*from   w  w  w . j  ava 2s.c  o m*/
    job.setMapperClass(TextSearchMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    // regex with a phrase to be searched for
    String regex = otherArgs[2];
    job.getConfiguration().set(MAPREDUCE_MAP_REGEX, regex);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.examples.WordCounterExample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(WordCounterExample.class);

    job.setJobName(WordCounterExample.class.getName());

    // mapper/*w  w  w  . jav a2 s. c o m*/
    job.setMapperClass(WordCounterMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.TopDomainCounter.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    Job job = Job.getInstance();
    job.setJarByClass(TopDomainCounter.class);

    job.setJobName(TopDomainCounter.class.getName());

    // mapper/*from   ww w  . j  a  v  a  2s  .c o m*/
    job.setMapperClass(DomainMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // combiner + reducer
    job.setCombinerClass(TextLongCountingReducer.class);
    job.setReducerClass(TextLongCountingReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.statistics.WARCRecordCounter.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    System.out.println("Other args: " + Arrays.toString(otherArgs));

    Job job = Job.getInstance(conf);
    job.setJarByClass(WARCRecordCounter.class);

    job.setJobName(WARCRecordCounter.class.getName());

    // mapper/*from  www.  j  av  a 2 s .  co m*/
    job.setMapperClass(ResponseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // combiner + reducer
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.setInputFormatClass(WARCInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // paths
    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:distributed.hadoop.MapReduceJobConfig.java

License:Open Source License

/**
 * Apply the settings encapsulated in this config and return a Job object
 * ready for execution.//from ww w.  j  a va2 s .c o  m
 * 
 * @param jobName the name of the job
 * @param conf the Configuration object that will be wrapped in the Job
 * @param env environment variables
 * @return a configured Job object
 * @throws IOException if a problem occurs
 * @throws ClassNotFoundException if various classes are not found
 */
public Job configureForHadoop(String jobName, Configuration conf, Environment env)
        throws IOException, ClassNotFoundException {

    String jobTrackerPort = getJobTrackerPort();
    if (DistributedJobConfig.isEmpty(jobTrackerPort)) {
        jobTrackerPort = AbstractHadoopJobConfig.isHadoop2() ? AbstractHadoopJobConfig.DEFAULT_PORT_YARN
                : AbstractHadoopJobConfig.DEFAULT_PORT;
    }
    String jobTracker = getJobTrackerHost() + ":" + jobTrackerPort;
    if (DistributedJobConfig.isEmpty(jobTracker)) {
        System.err.println("No " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager " : "JobTracker ")
                + "set - running locally...");
    } else {
        jobTracker = environmentSubstitute(jobTracker, env);
        if (AbstractHadoopJobConfig.isHadoop2()) {
            conf.set(YARN_RESOURCE_MANAGER_ADDRESS, jobTracker);
            conf.set(YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS,
                    environmentSubstitute(getJobTrackerHost(), env) + ":8030");
        } else {
            conf.set(HADOOP_JOB_TRACKER_HOST, jobTracker);
        }
    }
    System.err.println("Using " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager: " : "jobtracker: ")
            + jobTracker);

    if (AbstractHadoopJobConfig.isHadoop2()) {
        // a few other properties needed to run against Yarn
        conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");
        conf.set("mapreduce.framework.name", "yarn");
    }

    if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) {
        conf.set(AbstractHadoopJobConfig.isHadoop2() ? HADOOP2_MAPRED_MAX_SPLIT_SIZE
                : HADOOP_MAPRED_MAX_SPLIT_SIZE, getMapredMaxSplitSize());
    }

    // Do any user supplied properties here before creating the Job
    for (Map.Entry<String, String> e : m_additionalUserSuppliedProperties.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    m_hdfsConfig.configureForHadoop(conf, env);
    Job job = new Job(conf, jobName);

    String numMappers = getNumberOfMaps();
    if (!DistributedJobConfig.isEmpty(numMappers)) {
        numMappers = environmentSubstitute(numMappers, env);
        ((JobConf) job.getConfiguration()).setNumMapTasks(Integer.parseInt(numMappers));
    }

    // The number of map tasks that will be run simultaneously by a task tracker
    String maxConcurrentMapTasks = getTaskTrackerMapTasksMaximum();
    if (!DistributedJobConfig.isEmpty(maxConcurrentMapTasks)) {
        ((JobConf) job.getConfiguration()).set("mapred.tasktracker.map.tasks.maximum", maxConcurrentMapTasks);
    }

    String numReducers = getNumberOfReducers();
    if (!DistributedJobConfig.isEmpty(numReducers)) {
        numReducers = environmentSubstitute(numReducers, env);
        job.setNumReduceTasks(Integer.parseInt(numReducers));

        if (Integer.parseInt(numReducers) == 0) {
            System.err.println("Warning - no reducer class set. Configuring for a map only job");
        }
    } else {
        job.setNumReduceTasks(1);
    }
    String mapperClass = getMapperClass();
    if (DistributedJobConfig.isEmpty(mapperClass)) {
        throw new IOException("No mapper class specified!");
    }
    mapperClass = environmentSubstitute(mapperClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends Mapper> mc = (Class<? extends Mapper>) Class.forName(mapperClass);

    job.setMapperClass(mc);

    String reducerClass = getReducerClass();
    if (DistributedJobConfig.isEmpty(reducerClass) && Integer.parseInt(numReducers) > 0) {
        throw new IOException("No reducer class specified!");
    } else if (job.getNumReduceTasks() > 0) {
        reducerClass = environmentSubstitute(reducerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> rc = (Class<? extends Reducer>) Class.forName(reducerClass);

        job.setReducerClass(rc);
    }

    String combinerClass = getCombinerClass();
    if (!DistributedJobConfig.isEmpty(combinerClass)) {
        combinerClass = environmentSubstitute(combinerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> cc = (Class<? extends Reducer>) Class.forName(combinerClass);

        job.setCombinerClass(cc);
    }

    String inputFormatClass = getInputFormatClass();
    if (DistributedJobConfig.isEmpty(inputFormatClass)) {
        throw new IOException("No input format class specified");
    }
    inputFormatClass = environmentSubstitute(inputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends InputFormat> ifc = (Class<? extends InputFormat>) Class.forName(inputFormatClass);

    job.setInputFormatClass(ifc);

    String outputFormatClass = getOutputFormatClass();
    if (DistributedJobConfig.isEmpty(outputFormatClass)) {
        throw new IOException("No output format class specified");
    }
    outputFormatClass = environmentSubstitute(outputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends OutputFormat> ofc = (Class<? extends OutputFormat>) Class.forName(outputFormatClass);
    job.setOutputFormatClass(ofc);

    String mapOutputKeyClass = getMapOutputKeyClass();
    if (DistributedJobConfig.isEmpty(mapOutputKeyClass)) {
        throw new IOException("No map output key class defined");
    }
    mapOutputKeyClass = environmentSubstitute(mapOutputKeyClass, env);
    Class mokc = Class.forName(mapOutputKeyClass);
    job.setMapOutputKeyClass(mokc);

    String mapOutputValueClass = getMapOutputValueClass();
    if (DistributedJobConfig.isEmpty(mapOutputValueClass)) {
        throw new IOException("No map output value class defined");
    }
    mapOutputValueClass = environmentSubstitute(mapOutputValueClass, env);
    Class movc = Class.forName(mapOutputValueClass);
    job.setMapOutputValueClass(movc);

    String outputKeyClass = getOutputKeyClass();
    if (DistributedJobConfig.isEmpty(outputKeyClass)) {
        throw new IOException("No output key class defined");
    }
    outputKeyClass = environmentSubstitute(outputKeyClass, env);
    Class okc = Class.forName(outputKeyClass);
    job.setOutputKeyClass(okc);

    String outputValueClass = getOutputValueClass();
    if (DistributedJobConfig.isEmpty(outputValueClass)) {
        throw new IOException("No output value class defined");
    }
    outputValueClass = environmentSubstitute(outputValueClass, env);
    Class ovc = Class.forName(outputValueClass);
    job.setOutputValueClass(ovc);

    String inputPaths = getInputPaths();
    // don't complain if there aren't any as inputs such as HBASE
    // require other properties to be set
    if (!DistributedJobConfig.isEmpty(inputPaths)) {
        inputPaths = environmentSubstitute(inputPaths, env);
        FileInputFormat.setInputPaths(job, inputPaths);
    }

    String outputPath = getOutputPath();
    if (DistributedJobConfig.isEmpty(outputPath)) {
        throw new IOException("No output path specified");
    }
    outputPath = environmentSubstitute(outputPath, env);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}

From source file:dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    Job job = new Job(configuration, "ARC Header Extractor");
    job.setJarByClass(ARCHeaderExtractorMR.class);

    job.setMapperClass(ARCHeaderExtractorMapper.class);
    job.setCombinerClass(ARCHeaderExtractorReducer.class);
    job.setReducerClass(ARCHeaderExtractorReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    int n = args.length;
    if (n == 0 || n > 2) {
        System.err.println(//from   w w  w  .  j a v a  2s. co m
                "Not enough arguments. input dir and output dir mandatory. Only " + n + " were supplied.");
        System.exit(0);
    }

    SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:drdoobs.AggregateJob.java

public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(getClass());/*from  w  ww . ja va  2 s .  c  o m*/
    job.setJobName(getClass().getSimpleName());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(ProjectionMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:dz.lab.mapred.counter.StartsWithCountJob_PrintCounters.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // the following property will enable mapreduce to use its packaged local job runner
    //conf.set("mapreduce.framework.name", "local");

    Job job = Job.getInstance(conf, "StartsWithCountJob");
    job.setJarByClass(getClass());/* w  w w  .  j  av  a2 s  . c o m*/

    // configure output and input source
    TextInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(TextInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(StartsWithCountMapper.class);
    job.setCombinerClass(StartsWithCountReducer.class);
    job.setReducerClass(StartsWithCountReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    int resultCode = job.waitForCompletion(true) ? 0 : 1;
    System.out.println("Job is complete! Printing Counters:");
    Counters counters = job.getCounters();

    for (String groupName : counters.getGroupNames()) {
        CounterGroup group = counters.getGroup(groupName);
        System.out.println(group.getDisplayName());

        for (Counter counter : group.getUnderlyingGroup()) {
            System.out.println(" " + counter.getDisplayName() + "=" + counter.getValue());
        }
    }
    return resultCode;
}

From source file:dz.lab.mapred.exclude.StartsWithCountJob_DistCacheAPI.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // the following property will enable mapreduce to use its packaged local job runner
    //conf.set("mapreduce.framework.name", "local");

    Job job = Job.getInstance(conf, "StartsWithCountJob");
    job.setJarByClass(getClass());/*from w w w. j  a v a2s. c o  m*/

    // configure output and input source
    TextInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(TextInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(StartsWithCountMapper.class);
    job.setCombinerClass(StartsWithCountReducer.class);
    job.setReducerClass(StartsWithCountReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path toCache = new Path("/training/data/startWithExcludeFile.txt");
    // add file to cache
    job.addCacheFile(toCache.toUri());
    // create symbolic links for all files in DistributedCache; without the links you would have to use fully qualified path
    job.createSymlink();

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:dz.lab.mapred.hbase.custom_input.StartsWithCountJob_HBaseInput.java

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf(), "StartsWithCount-FromHBase");
    job.setJarByClass(getClass());/*from  w w  w.j  a v a  2 s  .  c o m*/

    // set HBase InputFormat
    job.setInputFormatClass(TableInputFormat.class);
    // new mapper to handle data from HBase
    job.setMapperClass(StartsWithCountMapper_HBase.class);

    // add hbase configuration
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    TableMapReduceUtil.addDependencyJars(job);

    // specify table and column to read from
    conf.set(TableInputFormat.INPUT_TABLE, TABLE_NAME);
    conf.set(TableInputFormat.SCAN_COLUMNS, "count:word");

    // configure mapper and reducer
    job.setCombinerClass(StartsWithCountReducer.class);
    job.setReducerClass(StartsWithCountReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, new Path(args[0]));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}