Example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass


Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass.

Prototype

public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce(Object, Iterable, org.apache.hadoop.mapreduce.Reducer.Context).
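
A grouping comparator typically compares only the "natural" part of a composite key, while the sort comparator orders records by the full key; keys that compare as equal here are fed to a single reduce() call. A minimal sketch, assuming a hypothetical CompositeKey writable whose getNaturalKey() returns the natural-key part as a String (CompositeKey and NaturalKeyGroupingComparator are illustrative names, not part of Hadoop):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch: group CompositeKey map output by the natural key only.
// WritableComparator implements RawComparator, so it satisfies the
// Class<? extends RawComparator> bound in the prototype above.
public class NaturalKeyGroupingComparator extends WritableComparator {
    protected NaturalKeyGroupingComparator() {
        super(CompositeKey.class, true); // true: instantiate keys for compare()
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CompositeKey left = (CompositeKey) a;
        CompositeKey right = (CompositeKey) b;
        return left.getNaturalKey().compareTo(right.getNaturalKey());
    }
}

In the driver this would be registered with job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class).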

Usage

From source file:com.benchmark.mapred.SecondarySort.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
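
FirstPartitioner and FirstGroupingComparator are defined elsewhere in SecondarySort.java. In the standard Hadoop example, the grouping comparator compares only the first integer of the IntPair key, roughly as follows (reconstructed from that example, not from the snippet above):

// Groups IntPair keys by their first int only: both the raw-bytes path
// and the object path compare just the leading serialized integer.
public static class FirstGroupingComparator implements RawComparator<IntPair> {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, Integer.SIZE / 8,
                                               b2, s2, Integer.SIZE / 8);
    }

    @Override
    public int compare(IntPair o1, IntPair o2) {
        int l = o1.getFirst();
        int r = o2.getFirst();
        return l == r ? 0 : (l < r ? -1 : 1);
    }
}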

From source file:com.bigdog.hadoop.mapreduce.group.GroupApp.java

public void group() throws Exception {
    final Configuration configuration = new Configuration();

    final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), configuration);
    if (fileSystem.exists(new Path(OUT_PATH))) {
        fileSystem.delete(new Path(OUT_PATH), true);
    }

    final Job job = new Job(configuration, GroupApp.class.getSimpleName());

    // 1.1 Specify the input path
    FileInputFormat.setInputPaths(job, INPUT_PATH);
    // Specify the input format
    job.setInputFormatClass(TextInputFormat.class);

    // 1.2 Specify the Mapper class
    job.setMapperClass(MyMapper.class);
    // Map output types <k2, v2>
    job.setMapOutputKeyClass(NewK2.class);
    job.setMapOutputValueClass(LongWritable.class);

    // 1.3 Partitioning
    job.setPartitionerClass(HashPartitioner.class);
    job.setNumReduceTasks(1);

    // 1.4 Sorting and grouping
    job.setGroupingComparatorClass(MyGroupingComparator.class);
    // 1.5 Combining (not used here)

    // 2.2 Specify the Reducer class
    job.setReducerClass(MyReducer.class);
    // Reduce output types <k3, v3>
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);

    // 2.3 Specify the output path
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
    // Specify the output format
    job.setOutputFormatClass(TextOutputFormat.class);

    // Submit the job to the JobTracker and wait for completion
    job.waitForCompletion(true);
}
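
NewK2 and MyGroupingComparator are not shown in this snippet. A plausible sketch, under the assumption that NewK2 is a writable pair that serializes a long first field (8 bytes) ahead of a long second field:

// Sketch only: group NewK2 keys by their first field, so values whose
// keys differ only in `second` arrive in the same reduce() call.
static class MyGroupingComparator implements RawComparator<NewK2> {
    @Override
    public int compare(NewK2 o1, NewK2 o2) {
        return Long.compare(o1.first, o2.first);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Compare only the leading 8 bytes: the serialized `first` long.
        return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
    }
}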

From source file:com.ci.backports.avro.mapreduce.AvroJob.java

License:Apache License

/** Configure a job's map output key schema. */
public static void setMapOutputKeySchema(Job job, Schema schema) {
    job.setMapOutputKeyClass(AvroKey.class);
    job.getConfiguration().set(KEY_MAP_OUTPUT_SCHEMA_CONFIG_FIELD, schema.toString());
    job.setGroupingComparatorClass(AvroKeyComparator.class);
    job.setSortComparatorClass(AvroKeyComparator.class);
    addAvroSerialization(job.getConfiguration());
}
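
A hypothetical call site for this helper (the job name is arbitrary; Schema.create is the standard Avro schema factory):

// Sketch: with the map output key schema set, AvroKeyComparator drives
// both the sort order and the reduce-side grouping.
Job job = new Job(conf, "avro-shuffle-example");
AvroJob.setMapOutputKeySchema(job, Schema.create(Schema.Type.STRING));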

From source file:com.cloudera.crunch.GroupingOptions.java

License:Open Source License

public void configure(Job job) {
    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
    if (groupingComparatorClass != null) {
        job.setGroupingComparatorClass(groupingComparatorClass);
    }
    if (sortComparatorClass != null) {
        job.setSortComparatorClass(sortComparatorClass);
    }
    if (numReducers > 0) {
        job.setNumReduceTasks(numReducers);
        LOG.info(String.format("Using %d reduce tasks", numReducers));
    }
}
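
For context, a GroupingOptions instance is normally assembled through its builder before configure(Job) is called. A sketch, assuming the GroupingOptions.Builder API and hypothetical comparator classes:

// Sketch only: FirstPartitioner and FirstGroupingComparator stand in for
// whatever job-specific classes the pipeline needs.
GroupingOptions options = GroupingOptions.builder()
        .partitionerClass(FirstPartitioner.class)
        .groupingComparatorClass(FirstGroupingComparator.class)
        .numReducers(4)
        .build();
options.configure(job);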

From source file:com.conversantmedia.mapreduce.io.CompositeSortKeySerialization.java

License:Apache License

/**
 * Convenience method to configure the job for using the composite key.
 * @param job           the job using this serializer
 * @param groupKeyClass the key type used for grouping
 * @param sortKeyClass  the key type used for sorting
 */
@SuppressWarnings("rawtypes")
public static void configureMapOutputKey(Job job, Class<? extends WritableComparable> groupKeyClass,
        Class<? extends WritableComparable> sortKeyClass) {

    // First, setup our classes...
    job.getConfiguration().set(CONF_KEY_GROUPKEY_CLASS, groupKeyClass.getName());
    job.getConfiguration().set(CONF_KEY_SORTKEY_CLASS, sortKeyClass.getName());

    // Set this class as our map output key
    job.setMapOutputKeyClass(CompositeSortKey.class);

    // Setup the partitioner and comparators.
    job.setPartitionerClass(CompositeSortKey.KeyPartitioner.class);
    job.setGroupingComparatorClass(CompositeSortKey.GroupingComparator.class);
    job.setSortComparatorClass(CompositeSortKey.NaturalSortComparator.class);

    // Now setup the serialization by registering with the framework.
    Collection<String> serializations = new ArrayList<>();
    serializations.add(CompositeSortKeySerialization.class.getName());
    serializations.addAll(job.getConfiguration().getStringCollection("io.serializations"));
    job.getConfiguration().setStrings("io.serializations",
            serializations.toArray(new String[serializations.size()]));

}
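
A hypothetical driver call for this helper, grouping map output on a Text natural key while sorting within each group on a LongWritable sort key:

// Sketch: map output keys become CompositeSortKey instances grouped by
// the Text part and ordered within each group by the LongWritable part.
CompositeSortKeySerialization.configureMapOutputKey(job, Text.class, LongWritable.class);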

From source file:com.conversantmedia.mapreduce.tool.annotation.handler.GroupingAnnotationHandler.java

License:Apache License

@Override
public void process(Annotation annotation, Job job, Object target) {
    Grouping grouping = (Grouping) annotation;
    if (grouping != null && grouping.value() != null && grouping.value() != NULLCOMPARATOR.class) {
        job.setGroupingComparatorClass(grouping.value());
    }
}
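
The handler simply forwards the annotation's value() to the job, so a driver supplies its comparator through the annotation. A sketch (MyGroupComparator is hypothetical, and the exact annotation target depends on the framework):

// Sketch only: apply a custom grouping comparator via the @Grouping annotation.
@Grouping(MyGroupComparator.class)
Job job;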

From source file:com.daleway.training.hadoop.condprob.ConditionalProbabilityPairsSecondarySort.java

License:Apache License

public static Job createJob(Configuration conf, String inputPath, String outputPath) throws IOException {
    Job job = new Job(conf, "pair wise count");
    job.setJarByClass(ConditionalProbabilityPairsSecondarySort.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setSortComparatorClass(KeyComparator.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    //job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(ProbDistPartitioner.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    //Is the output value class for Map or Reduce ?
    job.setOutputValueClass(Text.class);
    //job.setNumReduceTasks(5);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    return job;
}
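
KeyComparator and GroupComparator are defined elsewhere in the source file. Since the map output keys are Text here, a plausible grouping comparator compares only the portion of the key before a delimiter; a sketch under the assumption that keys look like "word\tneighbor":

// Sketch: group Text keys by the part before the first tab, so every
// "word\t*" key reaches a single reduce() call.
public static class GroupComparator extends WritableComparator {
    protected GroupComparator() {
        super(Text.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        String left = a.toString().split("\t", 2)[0];
        String right = b.toString().split("\t", 2)[0];
        return left.compareTo(right);
    }
}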

From source file:com.datasalt.pangool.benchmark.secondarysort.HadoopSecondarySort.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Hadoop Secondary Sort");
    FileSystem fS = FileSystem.get(conf);
    fS.delete(new Path(otherArgs[1]), true);

    job.setJarByClass(HadoopSecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setPartitionerClass(KeyPartitioner.class);
    job.setGroupingComparatorClass(GroupingComparator.class);

    job.setMapOutputKeyClass(ComplexType.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}

From source file:com.datasalt.pangool.tuplemr.TupleMRBuilder.java

License:Apache License

public Job createJob() throws IOException, TupleMRException {

    failIfNull(tupleReducer, "Need to set a group handler");
    failIfEmpty(multipleInputs.getMultiInputs(), "Need to add at least one input");
    failIfNull(outputFormat, "Need to set output format");
    failIfNull(outputKeyClass, "Need to set outputKeyClass");
    failIfNull(outputValueClass, "Need to set outputValueClass");
    failIfNull(outputPath, "Need to set outputPath");

    // perform a deep copy of the Configuration
    this.conf = new Configuration(this.conf);

    TupleMRConfig tupleMRConf = buildConf();
    // Serialize PangoolConf in Hadoop Configuration
    instanceFilesCreated.addAll(TupleMRConfig.set(tupleMRConf, conf));
    Job job = (jobName == null) ? new Job(conf) : new Job(conf, jobName);
    if (tupleMRConf.getRollupFrom() != null) {
        job.setReducerClass(RollupReducer.class);
    } else {
        job.setReducerClass(SimpleReducer.class);
    }

    if (tupleCombiner != null) {
        job.setCombinerClass(SimpleCombiner.class); // not rollup by now
        // Set Combiner Handler
        String uniqueName = UUID.randomUUID().toString() + '.' + "combiner-handler.dat";
        try {
            InstancesDistributor.distribute(tupleCombiner, uniqueName, job.getConfiguration());
            instanceFilesCreated.add(uniqueName);
            job.getConfiguration().set(SimpleCombiner.CONF_COMBINER_HANDLER, uniqueName);
        } catch (URISyntaxException e1) {
            throw new TupleMRException(e1);
        }
    }

    // Set Tuple Reducer
    try {
        String uniqueName = UUID.randomUUID().toString() + '.' + "group-handler.dat";
        InstancesDistributor.distribute(tupleReducer, uniqueName, job.getConfiguration());
        instanceFilesCreated.add(uniqueName);
        job.getConfiguration().set(SimpleReducer.CONF_REDUCER_HANDLER, uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }

    // Enabling serialization
    TupleSerialization.enableSerialization(job.getConfiguration());

    job.setJarByClass((jarByClass != null) ? jarByClass : tupleReducer.getClass());
    job.setMapOutputKeyClass(DatumWrapper.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setPartitionerClass(TupleHashPartitioner.class);
    job.setGroupingComparatorClass(GroupComparator.class);
    job.setSortComparatorClass(SortComparator.class);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    FileOutputFormat.setOutputPath(job, outputPath);
    instanceFilesCreated.addAll(multipleInputs.configureJob(job));
    instanceFilesCreated.addAll(namedOutputs.configureJob(job));
    // Configure a {@link ProxyOutputFormat} for Pangool's Multiple Outputs to
    // work: {@link PangoolMultipleOutput}
    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        InstancesDistributor.distribute(outputFormat, uniqueName, conf);
        instanceFilesCreated.add(uniqueName);
    } catch (URISyntaxException e1) {
        throw new TupleMRException(e1);
    }
    job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
    job.setOutputFormatClass(ProxyOutputFormat.class);

    return job;
}

From source file:com.datasalt.utils.mapred.counter.MapRedCounter.java

License:Apache License

protected static Job buildMapRedCounterJobWithoutCombiner(String name,
        @SuppressWarnings("rawtypes") Class<? extends OutputFormat> outputFormat, String outPath,
        Configuration conf) throws IOException {

    Job job = new Job(conf, name);

    Path output = new Path(outPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), output);
    job.setJarByClass(MapRedCounter.class);

    job.setReducerClass(MapRedCountReducer.class);
    job.setMapOutputKeyClass(CounterKey.class);
    job.setMapOutputValueClass(CounterValue.class);
    job.setOutputFormatClass(outputFormat);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Secondary sorting configuration.
    job.setGroupingComparatorClass(CounterKey.IdGroupComparator.class);
    job.setPartitionerClass(CounterKey.IdGroupPartitioner.class);

    FileOutputFormat.setOutputPath(job, output);

    String uniqueName = UUID.randomUUID().toString() + '.' + "out-format.dat";
    try {
        DCUtils.serializeToDC(new HadoopOutputFormat(SequenceFileOutputFormat.class), uniqueName, conf);
        job.getConfiguration().set(ProxyOutputFormat.PROXIED_OUTPUT_FORMAT_CONF, uniqueName);
        job.setOutputFormatClass(ProxyOutputFormat.class);
        // Multioutput configuration
        PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTFILE.toString(),
                new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterKey.class, LongWritable.class);
        PangoolMultipleOutputs.addNamedOutput(job, Outputs.COUNTDISTINCTFILE.toString(),
                new HadoopOutputFormat(SequenceFileOutputFormat.class), CounterDistinctKey.class,
                LongPairWritable.class);
    } catch (URISyntaxException e) {
        e.printStackTrace();
        throw new IOException(e);
    }
    return job;
}