Example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job.setSortComparatorClass.

Prototype

public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls how the keys are sorted before they are passed to the Reducer.
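
Before the usage examples below, here is a minimal, self-contained sketch of the call. DescendingLongComparator is a hypothetical class (not part of Hadoop); it shows the common pattern of extending WritableComparator, which implements RawComparator, to override the natural key ordering:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical comparator: orders LongWritable map output keys descending.
public class DescendingLongComparator extends WritableComparator {
    protected DescendingLongComparator() {
        super(LongWritable.class, true); // true: deserialize keys before comparing
    }

    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public int compare(WritableComparable a, WritableComparable b) {
        return b.compareTo(a); // invert the natural ordering
    }
}

The comparator must be registered before the job is submitted; afterwards setSortComparatorClass throws IllegalStateException:

    Job job = Job.getInstance(new Configuration(), "descending sort");
    job.setSortComparatorClass(DescendingLongComparator.class);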

Usage

From source file:nl.sanoma.hdt.report.generator.ReportGeneratorDriver.java

License:Open Source License

/**
 * Job to join the data with the metadata from the distributed cache and
 * calculate the revenue by quarter and the most popular product category per user
 *
 * @param dBPath the path of the import MapFile
 * @param inputPath the path of the logs directory
 * @param outputPath the path of the output directory
 * @return true if the job completed successfully
 * @throws IOException
 * @throws URISyntaxException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Boolean generateReport(String dBPath, String inputPath, String outputPath)
        throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
    Job job = new Job(getConf());
    Configuration conf = job.getConfiguration();

    job.setJobName("Repor Generator");
    DistributedCache.addCacheFile(new URI(dBPath), conf);
    job.setJarByClass(ReportGeneratorDriver.class);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setPartitionerClass(KeyDataPartitioner.class);
    job.setGroupingComparatorClass(KeyDataGroupingComparator.class);
    job.setSortComparatorClass(KeyDataComparator.class);
    job.setMapperClass(ReportGeneratorMapper.class);
    job.setMapOutputKeyClass(KeyData.class);
    job.setMapOutputValueClass(ValueData.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(ReportGeneratorReducer.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true);
}
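
This first example shows the classic secondary-sort wiring, in which the three shuffle hooks work together: setPartitionerClass routes records that share a natural key to the same reducer, setGroupingComparatorClass merges them into a single reduce() call, and setSortComparatorClass orders the composite KeyData keys within each partition so values arrive at the reducer already sorted.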

From source file:org.acacia.csr.java.CSRConverter.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
        printUsage();
        return;
    }
    //These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);

    if (fs1.exists(new Path(dir1))) {
        fs1.delete(new Path(dir1), true);
    }

    System.out.println("Done deleting the dir : " + dir1);
    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");

    if (!fs1.exists(notinPath)) {
        fs1.create(notinPath).close(); // close the stream so the empty file is materialized
    }

    System.out.println("Done deleting the dir : " + dir2);

    //Note on Aug 23 2014: Sometimes after this the MapReduce job hangs; need to see why.

    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    //First job creates the inverted index

    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    //conf.setMapperClass(InvertedMapper.class);
    conf.setReducerClass(InvertedReducer.class);
    //conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(conf, new Path(args[0]));
    MultipleInputs.addInputPath(conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/user/miyuru/notinverts/notinverts"), TextInputFormat.class,
            InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    //Also for the moment we turn-off the speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);
    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
}
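
Note that this example (like NotInFinder below) mixes the legacy org.apache.hadoop.mapred API (JobConf, setInputFormat, setNumMapTasks) with the newer org.apache.hadoop.mapreduce API: most of the configuration goes onto the JobConf, while the Job wrapper constructed from it is what exposes setSortComparatorClass.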

From source file:org.acacia.csr.java.NotInFinder.java

License:Apache License

public static void main(String[] args) throws Exception {
    String dir1 = "/user/miyuru/wcout";
    String dir2 = "/user/miyuru/notinverts";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(LongWritable.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));
    Job job = new Job(conf, "NotInFinder");
    job.setJarByClass(WordCount.class);
    //   job.setMapperClass(TokenizerMapper.class);
    //   job.setCombinerClass(IntSumReducer.class);
    //   job.setReducerClass(IntSumReducer.class);
    //   job.setOutputKeyClass(LongWritable.class);
    //   job.setOutputValueClass(LongWritable.class);

    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);

}

From source file:org.acacia.csr.java.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*
    String dir1 = "/user/miyuru/wcout";
     //We first delete the temporary directories if they exist on the HDFS
      FileSystem fs1 = FileSystem.get(new JobConf());
              
     if(fs1.exists(new Path(dir1))){
        fs1.delete(new Path(dir1), true);
     }
            
    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
            
    job.setSortComparatorClass(SortComparator.class);
    FileInputFormat.addInputPath(job, new Path("/user/miyuru/input"));
    FileOutputFormat.setOutputPath(job, new Path(dir1));
    job.waitForCompletion(true); 
    */

    String dir3 = "/user/miyuru/wcout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs3 = FileSystem.get(new JobConf());

    if (fs3.exists(new Path(dir3))) {
        fs3.delete(new Path(dir3), true);
    }

    JobConf conf3 = new JobConf();
    conf3.setNumMapTasks(96);
    FileInputFormat.addInputPath(conf3, new Path(dir5));
    FileOutputFormat.setOutputPath(conf3, new Path(dir3));
    Job job3 = new Job(conf3, "word count");
    job3.setJarByClass(WordCount.class);
    job3.setMapperClass(TokenizerMapper.class);
    job3.setCombinerClass(IntSumReducer.class);
    job3.setReducerClass(IntSumReducer.class);
    job3.setOutputKeyClass(LongWritable.class);
    job3.setOutputValueClass(LongWritable.class);

    job3.setSortComparatorClass(SortComparator.class);

    job3.waitForCompletion(true);

    PrintWriter writer;
    try {
        writer = new PrintWriter("/tmp/wfile", "UTF-8");
        writer.println("");
        writer.flush();
        writer.close();
    } catch (FileNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (UnsupportedEncodingException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    System.out.println("------Done Word Count---------------");

}

From source file:org.acacia.csr.java.ZeroVertexSearcher.java

License:Apache License

public static void main(String[] args) throws Exception {
    /*
    String dir1 = "/user/miyuru/wcout";
     //We first delete the temporary directories if they exist on the HDFS
      FileSystem fs1 = FileSystem.get(new JobConf());
              
     if(fs1.exists(new Path(dir1))){
        fs1.delete(new Path(dir1), true);
     }
            
    JobConf conf = new JobConf();
    conf.setNumMapTasks(96);
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
            
    job.setSortComparatorClass(SortComparator.class);
    FileInputFormat.addInputPath(job, new Path("/user/miyuru/input"));
    FileOutputFormat.setOutputPath(job, new Path(dir1));
    job.waitForCompletion(true); 
    */

    String dir3 = "/user/miyuru/zout";
    String dir5 = "/user/miyuru/input";
    //We first delete the temporary directories if they exist on the HDFS
    FileSystem fs3 = FileSystem.get(new JobConf());

    if (fs3.exists(new Path(dir3))) {
        fs3.delete(new Path(dir3), true);
    }

    JobConf conf3 = new JobConf();
    conf3.setNumMapTasks(96);
    FileInputFormat.addInputPath(conf3, new Path(dir5));
    FileOutputFormat.setOutputPath(conf3, new Path(dir3));
    conf3.set("mapred.map.max.attempts", "0");//If the job fails we assume that it happens because we found zero. Therfore we do not attempt again.
    Job job3 = new Job(conf3, "zero_vertex_search");
    job3.setJarByClass(ZeroVertexSearcher.class);
    job3.setMapperClass(TokenizerMapper.class);
    job3.setCombinerClass(IntSumReducer.class);
    job3.setReducerClass(IntSumReducer.class);
    job3.setOutputKeyClass(LongWritable.class);
    job3.setOutputValueClass(LongWritable.class);
    job3.setNumReduceTasks(0);

    job3.setSortComparatorClass(SortComparator.class);
    try {
        job3.waitForCompletion(true);
    } catch (org.acacia.csr.java.ZeroFoundException ex) {
        System.out.println("Found Zero vertex");
        job3.killJob();
    }
    System.out.println("------Done Zero Vertex search---------------");

}

From source file:org.apache.avro.mapreduce.AvroJob.java

License:Apache License

/**
 * Sets the map output key schema.
 *
 * @param job The job to configure.
 * @param schema The map output key schema.
 */
public static void setMapOutputKeySchema(Job job, Schema schema) {
    job.setMapOutputKeyClass(AvroKey.class);
    job.setGroupingComparatorClass(AvroKeyComparator.class);
    job.setSortComparatorClass(AvroKeyComparator.class);
    AvroSerialization.setKeyWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setKeyReaderSchema(job.getConfiguration(), schema);
    AvroSerialization.addToConfiguration(job.getConfiguration());
}
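
Because AvroKeyComparator is registered as both the sort comparator and the grouping comparator, shuffle ordering and reduce-side grouping are both driven by the Avro schema's ordering rules. Calling the helper is then a one-liner; the STRING key schema below is just an illustrative choice:

    Job job = Job.getInstance(new Configuration(), "avro shuffle");
    AvroJob.setMapOutputKeySchema(job, Schema.create(Schema.Type.STRING));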

From source file:org.apache.crunch.GroupingOptions.java

License:Apache License

public void configure(Job job) {
    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
    if (groupingComparatorClass != null) {
        job.setGroupingComparatorClass(groupingComparatorClass);
    }
    if (sortComparatorClass != null) {
        job.setSortComparatorClass(sortComparatorClass);
    }
    if (numReducers > 0) {
        job.setNumReduceTasks(numReducers);
    }
    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        job.getConfiguration().set(e.getKey(), e.getValue());
    }
}
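
A hedged usage sketch, assuming GroupingOptions is assembled through its Builder and that MyRawComparator is a hypothetical RawComparator implementation:

    GroupingOptions options = GroupingOptions.builder()
            .numReducers(4)
            .sortComparatorClass(MyRawComparator.class)
            .build();
    options.configure(job); // only the non-null options touch the Job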

From source file:org.apache.crunch.types.avro.AvroGroupedTableType.java

License:Apache License

@Override
public void configureShuffle(Job job, GroupingOptions options) {
    AvroTableType<K, V> att = (AvroTableType<K, V>) tableType;
    String schemaJson = att.getSchema().toString();
    Configuration conf = job.getConfiguration();

    if (att.hasReflect()) {
        if (att.hasSpecific()) {
            Avros.checkCombiningSpecificAndReflectionSchemas();
        }
        conf.setBoolean(AvroJob.MAP_OUTPUT_IS_REFLECT, true);
    }
    conf.set(AvroJob.MAP_OUTPUT_SCHEMA, schemaJson);
    job.setSortComparatorClass(AvroKeyComparator.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(AvroValue.class);
    if (options != null) {
        options.configure(job);
    }

    Avros.configureReflectDataFactory(conf);

    Collection<String> serializations = job.getConfiguration().getStringCollection("io.serializations");
    if (!serializations.contains(SafeAvroSerialization.class.getName())) {
        serializations.add(SafeAvroSerialization.class.getName());
        job.getConfiguration().setStrings("io.serializations", serializations.toArray(new String[0]));
    }
}

From source file:org.apache.druid.indexer.SortableBytes.java

License:Apache License

public static void useSortableBytesAsMapOutputKey(Job job, Class<? extends Partitioner> partitionerClass) {
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setGroupingComparatorClass(SortableBytesGroupingComparator.class);
    job.setSortComparatorClass(SortableBytesSortingComparator.class);
    job.setPartitionerClass(partitionerClass);
}
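
This helper keeps the SortableBytes wiring in one place: the grouping and sort comparators are fixed to their SortableBytes counterparts while the partitioner remains caller-supplied, so a job keyed on SortableBytes cannot end up with mismatched grouping and sorting.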

From source file:org.apache.gobblin.compaction.mapreduce.CompactionOrcJobConfigurator.java

License:Apache License

protected void configureMapper(Job job) {
    job.setInputFormatClass(OrcValueCombineFileInputFormat.class);
    job.setMapperClass(OrcValueMapper.class);
    job.setMapOutputKeyClass(OrcKey.class);
    job.setMapOutputValueClass(OrcValue.class);
    job.setGroupingComparatorClass(OrcKeyComparator.class);
    job.setSortComparatorClass(OrcKeyComparator.class);
}