Example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass, collected from open-source projects.

Prototype

public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce(Object,Iterable,org.apache.hadoop.mapreduce.Reducer.Context).
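
In a typical secondary-sort setup, the grouping comparator compares only the "natural" part of a composite map output key, so that all records sharing that natural key arrive in a single reduce() call while the sort comparator still orders them by the secondary part. As a minimal, hypothetical sketch (not taken from any of the examples below), the comparator here groups Text keys of the form natural#secondary by the part before the '#':

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical grouping comparator: keys are Text values like "userId#timestamp",
// and everything before the '#' decides which reduce() call a record belongs to.
public class NaturalKeyGroupingComparator extends WritableComparator {

    protected NaturalKeyGroupingComparator() {
        super(Text.class, true); // true: deserialize keys so compare(WritableComparable, ...) is used
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String left = a.toString().split("#", 2)[0];
        String right = b.toString().split("#", 2)[0];
        return left.compareTo(right);
    }
}

It would be registered with job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class), together with a partitioner that partitions on the same natural key so that keys the comparator treats as equal always reach the same reducer.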

Usage

From source file:com.phantom.hadoop.examples.SecondarySort.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
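
The FirstPartitioner and FirstGroupingComparator classes referenced above are defined in the same SecondarySort example file (not shown on this page). As a rough sketch of the intent, a grouping comparator over an IntPair key could compare only the first int, assuming IntPair is serialized as two 4-byte ints and exposes a getFirst() accessor:

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

// Sketch: treat two IntPair keys as equal for grouping when their first ints match,
// so one reduce() call receives every value sharing that first int.
public static class FirstGroupingComparator implements RawComparator<IntPair> {

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Compare only the first 4 serialized bytes, i.e. the first int of each pair.
        return WritableComparator.compareBytes(b1, s1, Integer.SIZE / 8,
                                               b2, s2, Integer.SIZE / 8);
    }

    @Override
    public int compare(IntPair o1, IntPair o2) {
        return Integer.compare(o1.getFirst(), o2.getFirst());
    }
}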

From source file:com.placeiq.piqconnect.Runner.java

License:Apache License

private Job buildJob1(Path input1, Path input2, Path output) throws Exception {
    Configuration conf = getConf();
    conf.setInt(Constants.PROP_BLOCK_SIZE, blockSize);
    conf.set("mapred.output.compression.type", "BLOCK");

    Job job = new Job(conf, "data-piqid.piqconnect.IterationStage1");
    job.setJarByClass(Runner.class);

    job.setMapperClass(IterationStage1._Mapper.class);
    job.setReducerClass(IterationStage1._Reducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numberOfReducers);
    job.setMapOutputKeyClass(IterationStage1.JoinKey.class);
    job.setMapOutputValueClass(BlockWritable.class);
    job.setOutputKeyClass(VLongWritable.class);
    job.setOutputValueClass(BlockWritable.class);
    job.setGroupingComparatorClass(IterationStage1.IndexComparator.class);
    job.setPartitionerClass(IterationStage1.IndexPartitioner.class);
    job.setSortComparatorClass(IterationStage1.SortComparator.class);

    FileInputFormat.setInputPaths(job, input1, input2);
    SequenceFileOutputFormat.setOutputPath(job, output);
    SequenceFileOutputFormat.setCompressOutput(job, true);

    setCompression(job);

    return job;
}

From source file:com.sirius.hadoop.job.onlinetime.OnlineTimeJob.java

License:Apache License

public Job build() throws Exception {
    // init
    Job job = Job.getInstance(getConf(), "onlinetime");
    job.setJarByClass(OnlineTimeJob.class);

    // mapper
    job.setMapperClass(StatusMapper.class);
    job.setMapOutputKeyClass(StatusKey.class);
    job.setMapOutputValueClass(OnlineRecord.class);

    // custom partitioner
    job.setPartitionerClass(StatusKeyPartitioner.class);

    //reduce
    job.setGroupingComparatorClass(StatusKeyGroupComparator.class);
    job.setReducerClass(StatusReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    //input
    FileInputFormat.setInputPaths(job, new Path("/subscriber_status/subscriber_status.json"));

    //output
    FileOutputFormat.setOutputPath(job, out);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, Lz4Codec.class);

    return job;
}

From source file:com.twitter.elephanttwin.indexing.AbstractBlockIndexingJob.java

License:Open Source License

/**
 * Sets up various job properties required for the indexing job.
 * If your implementation needs to mess with the conf, you can do so by overriding
 * this method (remember to call super.setupJob()!) or in setMapper().
 * @param conf
 * @return the configured Job
 * @throws IOException
 */
protected Job setupJob(Configuration conf) throws IOException {
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(BlockIndexedFileInputFormat.class);
    job.setReducerClass(MapFileIndexingReducer.class);
    job.setMapOutputKeyClass(TextLongPairWritable.class);
    job.setMapOutputValueClass(LongPairWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ListLongPair.class);
    job.setPartitionerClass(TextLongPairWritable.Parititioner.class);
    job.setSortComparatorClass(TextLongPairWritable.PairComparator.class);
    job.setGroupingComparatorClass(TextLongPairWritable.KeyOnlyComparator.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setNumReduceTasks(getNumPartitions());
    BlockIndexedFileInputFormat.setIndexOptions(job, getInputFormat(), getValueClass(), getIndex(),
            getColumnName());
    return job;
}

From source file:com.veera.secondarysort.demo2.SsJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}
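
For the grouping comparator to take effect, NaturalKeyPartitioner must send every composite key with the same natural key to the same reducer. A hypothetical sketch (assuming StockKey exposes a getSymbol() accessor for its natural-key part) could look like this:

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Sketch: partition on the natural key only, ignoring the secondary field,
// so keys that the grouping comparator treats as equal meet on one reducer.
public static class NaturalKeyPartitioner extends Partitioner<StockKey, DoubleWritable> {

    @Override
    public int getPartition(StockKey key, DoubleWritable value, int numPartitions) {
        return (key.getSymbol().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}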

From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java

License:Open Source License

public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(),
            "Generates a keyword index from RDF data.",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts",
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm',
                            METHOD_ARG, "horizontal or vertical."),
                    new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            'p', PREDICATES_ARG, "Subset of the properties to be indexed."),
                    new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r',
                            RESOURCE_PREFIX_ARG,
                            "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"),

                    new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED,
                            "Number of documents to index"),
                    new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the output."),
                    new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location of the resources hash file."), });

    JSAPResult jsapResult = jsap.parse(args);

    // check whether the command line was valid, and if it wasn't,
    // display usage information and exit.
    if (!jsapResult.success()) {
        System.err.println();
        System.err.println("Usage: java " + TripleIndexGenerator.class.getName());
        System.err.println("                " + jsap.getUsage());
        System.err.println();
        System.exit(1);
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(TripleIndexGenerator.class);
    job.setJobName("TripleIndexGenerator" + System.currentTimeMillis());

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input")));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(DocumentMapper.class);
    job.setMapOutputKeyClass(TermKey.class);
    job.setMapOutputValueClass(TermValue.class);

    job.setPartitionerClass(TermKey.FirstPartitioner.class);
    job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class);

    job.setReducerClass(TermReduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IndexRecordWriterValue.class);
    job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output")));

    Configuration conf = job.getConfiguration();

    conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class);
    conf.set("mapreduce.user.classpath.first", "true");

    long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG);
    conf.setLong(NUMBER_OF_DOCUMENTS, numDocs);
    // Set this in an attempt to get around the 2GB RAM task limit on our cluster,
    // in the hope of fixing "Direct buffer memory" errors.
    conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024);

    conf.set(OUTPUT_DIR, jsapResult.getString("output"));

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) {
        HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG));
    } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) {
        if (!jsapResult.contains(PREDICATES_ARG)) {
            throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL
                    + "' you have to give a predicates file too.");
        }
        VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG));
    } else {
        throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '"
                + METHOD_ARG_VALUE_VERTICAL + "'");
    }

    conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        JobBuilder.printUsage(this, "<ncdc input> <station input> <output>");
        return -1;
    }

    Job job = new Job(getConf(), "Join weather records with station names");
    job.setJarByClass(getClass());

    Path ncdcInputPath = new Path(args[0]);
    Path stationInputPath = new Path(args[1]);
    Path outputPath = new Path(args[2]);

    MultipleInputs.addInputPath(job, ncdcInputPath, TextInputFormat.class, JoinRecordMapper.class);
    MultipleInputs.addInputPath(job, stationInputPath, TextInputFormat.class, JoinStationMapper.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setPartitionerClass(KeyPartitioner.class);
    job.setGroupingComparatorClass(TextPair.FirstComparator.class);

    job.setMapOutputKeyClass(TextPair.class);

    job.setReducerClass(JoinReducer.class);

    job.setOutputKeyClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
        return -1;
    }

    job.setMapperClass(MaxTemperatureMapper.class);
    job.setPartitionerClass(FirstPartitioner.class);
    job.setSortComparatorClass(KeyComparator.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setReducerClass(MaxTemperatureReducer.class);
    job.setOutputKeyClass(IntPair.class);
    job.setOutputValueClass(NullWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
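
This example wires up all three pieces of the secondary-sort pattern: FirstPartitioner routes records by the natural key, KeyComparator orders the full composite key, and GroupComparator groups by the natural key only. As a hedged sketch, a sort comparator for an IntPair key that sorts ascending on the first field and descending on the second (assuming getFirst()/getSecond() accessors) might look like this:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch: full ordering of the composite key; the grouping comparator later
// collapses keys that share the first field into a single reduce() call.
public static class KeyComparator extends WritableComparator {

    protected KeyComparator() {
        super(IntPair.class, true);
    }

    @Override
    public int compare(WritableComparable w1, WritableComparable w2) {
        IntPair p1 = (IntPair) w1;
        IntPair p2 = (IntPair) w2;
        int cmp = Integer.compare(p1.getFirst(), p2.getFirst());
        if (cmp != 0) {
            return cmp;
        }
        return -Integer.compare(p1.getSecond(), p2.getSecond()); // reverse order on second field
    }
}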

From source file:csc555.ebratt.depaul.edu.RCTop10Driver.java

License:Open Source License

/**
 * Runs the driver by creating a new Hadoop Job based on the configuration.
 * Defines the input/output paths based on the first two arguments.
 *
 * @param args
 *            [0] the input directory on HDFS; [1] the output directory on HDFS
 * @throws Exception
 *             if there is an issue with any of the arguments
 */
public int run(String[] args) throws Exception {

    Job job = new Job(getConf(), "Top 10 Reddit");

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // ensure 1 reduce tasks for ranking
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(RCTop10Mapper.class);
    job.setReducerClass(RCTop10Reducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(GroupByCountPair.class);
    job.setMapOutputValueClass(Text.class);

    // set custom partitioner
    job.setPartitionerClass(GroupByCountPairPartitioner.class);

    // set custom grouping comparator
    job.setGroupingComparatorClass(GroupByGroupingComparator.class);

    // input class
    job.setInputFormatClass(KeyValueTextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(GroupByCountPair.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // The Jar file to run
    job.setJarByClass(RCTop10Driver.class);

    boolean success = job.waitForCompletion(true);
    System.exit(success ? 0 : 1);

    return 0;
}

From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License:Apache License

/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);

        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");

        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);

        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");

        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);
    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}