Example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setGroupingComparatorClass.

Prototype

public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException

Source Link

Document

Define the comparator that controls which keys are grouped together for a single call to Reducer#reduce(Object,Iterable,org.apache.hadoop.mapreduce.Reducer.Context)

Usage

From source file:org.apache.rya.accumulo.mr.merge.CopyTool.java

License:Apache License

private int runQueryCopy() throws Exception {
    log.info("Setting up Copy Tool with a query-based ruleset...");
    setup();//w  w w  . j  ava  2 s .  c o m
    if (!useCopyFileOutput) {
        createChildInstance(conf);
    }

    // Set up the configuration
    final AccumuloRdfConfiguration aconf = new AccumuloRdfConfiguration(conf);
    aconf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    aconf.setTablePrefix(tablePrefix);
    aconf.setFlush(false);
    ConfigUtils.setIndexers(aconf);

    // Since we're copying at the statement-level, ignore any given list of tables and determine
    // which tables we might need to create based on which indexers are desired.
    final TablePrefixLayoutStrategy prefixStrategy = new TablePrefixLayoutStrategy(tablePrefix);
    tables.clear();
    // Always include core tables
    tables.add(prefixStrategy.getSpo());
    tables.add(prefixStrategy.getOsp());
    tables.add(prefixStrategy.getPo());
    // Copy namespaces if they exist
    tables.add(prefixStrategy.getNs());
    // Add tables associated with any configured indexers
    /* TODO: SEE RYA-160
    if (aconf.getBoolean(ConfigUtils.USE_FREETEXT, false)) {
    tables.add(ConfigUtils.getFreeTextDocTablename(conf));
    tables.add(ConfigUtils.getFreeTextTermTablename(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_GEO, false)) {
    tables.add(ConfigUtils.getGeoTablename(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_TEMPORAL, false)) {
    tables.add(ConfigUtils.getTemporalTableName(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_ENTITY, false)) {
    tables.add(ConfigUtils.getEntityTableName(conf));
    }
    */
    // Ignore anything else, e.g. statistics -- must be recalculated for the child if desired

    // Extract the ruleset, and copy the namespace table directly
    final AccumuloQueryRuleset ruleset = new AccumuloQueryRuleset(aconf);
    ruleset.addTable(prefixStrategy.getNs());
    for (final String line : ruleset.toString().split("\n")) {
        log.info(line);
    }

    // Create a Job and configure its input and output
    final Job job = Job.getInstance(aconf);
    job.setJarByClass(this.getClass());
    setupMultiTableInputFormat(job, ruleset);
    setupAccumuloOutput(job, "");

    if (useCopyFileOutput) {
        // Configure job for file output
        job.setJobName("Ruleset-based export to file: " + tablePrefix + " -> " + localBaseOutputDir);
        // Map (row) to (table+key, key+value)
        job.setMapperClass(RowRuleMapper.class);
        job.setMapOutputKeyClass(GroupedRow.class);
        job.setMapOutputValueClass(GroupedRow.class);
        // Group according to table and and sort according to key
        job.setGroupingComparatorClass(GroupedRow.GroupComparator.class);
        job.setSortComparatorClass(GroupedRow.SortComparator.class);
        // Reduce ([table+row], rows): output each row to the file for that table, in sorted order
        job.setReducerClass(MultipleFileReducer.class);
        job.setOutputKeyClass(Key.class);
        job.setOutputValueClass(Value.class);
    } else {
        // Configure job for table output
        job.setJobName("Ruleset-based copy: " + tablePrefix + " -> " + childTablePrefix);
        // Map (row): convert to statement, insert to child (for namespace table, output row directly)
        job.setMapperClass(AccumuloRyaRuleMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Mutation.class);
        job.setNumReduceTasks(0);
        // Create the child tables, so mappers don't try to do this in parallel
        for (final String parentTable : tables) {
            final String childTable = parentTable.replaceFirst(tablePrefix, childTablePrefix);
            createTableIfNeeded(childTable);
        }
    }

    // Run the job and copy files to local filesystem if needed
    final Date beginTime = new Date();
    log.info("Job started: " + beginTime);
    final boolean success = job.waitForCompletion(true);
    if (success) {
        if (useCopyFileOutput) {
            log.info("Moving data from HDFS to the local file system");
            final Path baseOutputPath = new Path(baseOutputDir);
            for (final FileStatus status : FileSystem.get(conf).listStatus(baseOutputPath)) {
                if (status.isDirectory()) {
                    final String tableName = status.getPath().getName();
                    final Path hdfsPath = getPath(baseOutputDir, tableName);
                    final Path localPath = getPath(localBaseOutputDir, tableName);
                    log.info("HDFS directory: " + hdfsPath.toString());
                    log.info("Local directory: " + localPath.toString());
                    copyHdfsToLocal(hdfsPath, localPath);
                }
            }
        }
        final Date endTime = new Date();
        log.info("Job finished: " + endTime);
        log.info("The job took " + (endTime.getTime() - beginTime.getTime()) / 1000 + " seconds.");
        return 0;
    } else {
        log.error("Job failed!!!");
        return 1;
    }
}

From source file:org.apache.tez.mapreduce.examples.SecondarySort.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }/*from   w ww  . ja v  a 2  s  .com*/
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:org.avenir.association.AssociationRuleMiner.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Association rule mining from frequent item sets";
    job.setJobName(jobName);//from  ww w .j a va  2s. c  om

    job.setJarByClass(AssociationRuleMiner.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "avenir");
    job.setMapperClass(AssociationRuleMiner.RuleMinerMapper.class);
    job.setReducerClass(AssociationRuleMiner.RuleMinerReducer.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    int numReducer = job.getConfiguration().getInt("arm.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.avenir.knn.FeatureCondProbJoiner.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Training vector feature cond probability joiner  MR";
    job.setJobName(jobName);// w  w  w . j a  v a 2  s  . c o m

    job.setJarByClass(FeatureCondProbJoiner.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(FeatureCondProbJoiner.JoinerMapper.class);
    job.setReducerClass(FeatureCondProbJoiner.JoinerReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.avenir.knn.NearestNeighbor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "K nerest neighbor(KNN)  MR";
    job.setJobName(jobName);/*  www  .  j a  v  a2  s.  co m*/

    job.setJarByClass(NearestNeighbor.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(NearestNeighbor.TopMatchesMapper.class);
    job.setReducerClass(NearestNeighbor.TopMatchesReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.avenir.reinforce.RandomFirstGreedyBandit.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Random first greedy  bandit problem";
    job.setJobName(jobName);//w w  w  .  j  av a2  s .c o m

    job.setJarByClass(RandomFirstGreedyBandit.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "avenir");
    job.setMapperClass(RandomFirstGreedyBandit.BanditMapper.class);
    job.setReducerClass(RandomFirstGreedyBandit.BanditReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.avenir.sequence.CandidateGenerationWithSelfJoin.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Generates k candidate sequence";
    job.setJobName(jobName);/*from   w w w .  j  a v  a 2  s  .c  o m*/

    job.setJarByClass(CandidateGenerationWithSelfJoin.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration(), "avenir");
    job.setMapperClass(CandidateGenerationWithSelfJoin.CandidateGenerationMapper.class);
    job.setReducerClass(CandidateGenerationWithSelfJoin.CandidateGenerationReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    int numReducer = job.getConfiguration().getInt("cgs.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.beymani.proximity.AverageDistance.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Nearest neighbour stat calculation  MR";
    job.setJobName(jobName);/* w  w  w.  j av  a 2 s . c o  m*/

    job.setJarByClass(AverageDistance.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(AverageDistance.TopMatchesMapper.class);
    job.setReducerClass(AverageDistance.TopMatchesReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(IdRankGroupComprator.class);
    job.setPartitionerClass(IdRankPartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.beymani.proximity.NeighborDensity.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Nearest neighbour density";
    job.setJobName(jobName);//from   www.  j  av a  2  s.  c o  m

    job.setJarByClass(NeighborDensity.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(NeighborDensity.GroupingMapper.class);
    job.setReducerClass(NeighborDensity.GroupingReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TextIntIdPairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TextIntIdPairTuplePartitioner.class);

    Utility.setConfiguration(job.getConfiguration());

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}

From source file:org.chombo.mr.Joiner.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Joiner  MR";
    job.setJobName(jobName);/*from  www .ja  va 2 s. com*/

    job.setJarByClass(Joiner.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());

    job.setMapperClass(Joiner.JoinerMapper.class);
    job.setReducerClass(Joiner.JoinerReducer.class);

    job.setMapOutputKeyClass(TextInt.class);
    job.setMapOutputValueClass(Tuple.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setGroupingComparatorClass(SecondarySort.TextIntIdPairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TextIntIdPairTuplePartitioner.class);

    int numReducer = job.getConfiguration().getInt("joi.num.reducer", -1);
    numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer;
    job.setNumReduceTasks(numReducer);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}