Example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setSortComparatorClass.

Prototype

public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls how the keys are sorted before they are passed to the Reducer.
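
For orientation, here is a minimal sketch of the typical call site (the job and class names are illustrative, not taken from the examples below): the comparator is registered on the Job before submission and is applied to the map output keys during the shuffle sort.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class SortComparatorSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "descending key sort");

        // Map output keys are LongWritable; values are Text.
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Sort map output keys in descending order during the shuffle.
        // DecreasingComparator ships with Hadoop and inverts the natural order.
        job.setSortComparatorClass(LongWritable.DecreasingComparator.class);
    }
}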

Usage

From source file:csc555.ebratt.depaul.edu.GildedSorterDriver.java

License:Open Source License

/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument (args[2]).
 * 
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the GildedSorterReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 * 
 */
@Override
public int run(String[] args) throws Exception {

    Job job = new Job(getConf());
    job.setJobName("sorted gild counts");

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(GildedSorterMapper.class);
    job.setReducerClass(GildedSorterReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(GildedSorterReducer.class);
    }

    // sort in descending order
    job.setSortComparatorClass(LongWritable.DecreasingComparator.class);

    // The Jar file to run
    job.setJarByClass(GildedSorterDriver.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
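
LongWritable.DecreasingComparator, registered above, ships with Hadoop and simply applies the natural LongWritable comparison with its operands swapped. If you need the same behavior for another key type, a minimal sketch of the equivalent comparator (the class name here is illustrative):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;

// Equivalent of LongWritable.DecreasingComparator: reuse the natural
// comparator (including its byte-level fast path) with operands swapped.
public class DescendingLongComparator extends LongWritable.Comparator {
    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return super.compare(b, a);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return super.compare(b2, s2, l2, b1, s1, l1);
    }
}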

From source file:csc555.ebratt.depaul.edu.GildPercentDriverPass2.java

License:Open Source License

/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument (args[2]).
 * 
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the GildPercentReducerPass2.class as
 *            the combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 * 
 */
@Override
public int run(String[] args) throws Exception {

    Job job = new Job(getConf());
    job.setJobName("sorted gild percent");

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(GildPercentMapperPass2.class);
    job.setReducerClass(GildPercentReducerPass2.class);

    // Mapper output classes
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(GildPercentReducerPass2.class);
    }

    // sort in descending order
    job.setSortComparatorClass(DoubleWritableDescendingComparator.class);

    // The Jar file to run
    job.setJarByClass(GildPercentDriverPass2.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
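
Unlike the previous example, this driver sorts DoubleWritable keys, and DoubleWritableDescendingComparator is a project-specific class whose source is not shown on this page. A plausible reconstruction, assuming it follows the usual pattern of reversing the natural order (a sketch, not the project's actual code):

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical reconstruction: invert DoubleWritable's natural order so
// the largest percentages come first in the reducer input.
public class DoubleWritableDescendingComparator extends WritableComparator {
    public DoubleWritableDescendingComparator() {
        // true => allocate key instances so compare() can deserialize into them
        super(DoubleWritable.class, true);
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return b.compareTo(a); // swap operands to reverse the ascending order
    }
}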

From source file:csc555.ebratt.depaul.edu.VoteSorterDriver.java

License:Open Source License

/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument (args[2]).
 * 
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the VoteSorterReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 * 
 */
@Override
public int run(String[] args) throws Exception {

    Job job = new Job(getConf());
    job.setJobName("sorted vote counts");

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(VoteSorterMapper.class);
    job.setReducerClass(VoteSorterReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(VoteSorterReducer.class);
    }

    // sort in descending order
    job.setSortComparatorClass(LongWritable.DecreasingComparator.class);

    // The Jar file to run
    job.setJarByClass(VoteSorterDriver.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:DataCubeRefresh.Grep.java

License:Apache License

/**
 * Run function.
 * @param args arguments
 * @return error code
 * @throws Exception if an exception occurs
 */
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inUrl> <outUrl> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Job grepJob = new Job(getConf());
    Job sortJob = new Job(getConf());

    String tempStreamTag = UUID.randomUUID().toString();

    try {
        grepJob.setJobName("grep-search");

        TextHStreamingInputFormat.addInputStream(grepJob, 1000, 600, -1, "", false, args[0]);
        HStreamingJobConf.setIsStreamingJob(grepJob, true);
        grepJob.setMapperClass(RegexMapper.class);
        grepJob.getConfiguration().set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.getConfiguration().set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);
        grepJob.setInputFormatClass(TextHStreamingInputFormat.class);
        grepJob.setOutputFormatClass(TextHStreamingOutputFormat.class);
        HStreamingOutputFormat.setOutputStreamTag(grepJob, tempStreamTag);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);
        grepJob.setJarByClass(this.getClass());

        grepJob.submit();

        sortJob.setJobName("grep-sort");
        sortJob.setInputFormatClass(TextHStreamingInputFormat.class);
        HStreamingJobConf.setIsStreamingJob(sortJob, true);

        // add previous stream partition/reducer 0 as input. 
        HStreamingInputFormat.addInputStreamTag(sortJob, tempStreamTag, 0);

        sortJob.setMapperClass(InverseTextMapper.class);
        sortJob.setNumReduceTasks(1); // single output stream
        sortJob.setOutputFormatClass(TextHStreamingOutputFormat.class);
        TextHStreamingOutputFormat.setOutputPath(sortJob, args[1]);
        sortJob.setSortComparatorClass( // sort by decreasing frequency
                LongWritable.DecreasingComparator.class);
        sortJob.setJarByClass(this.getClass());
        sortJob.submit();

        return sortJob.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
        try {
            grepJob.killJob();
        } catch (Exception e1) {
            // ignore
        }
        try {
            sortJob.killJob();
        } catch (Exception e2) {
            // ignore
        }
    }
    return 1; // the try block returned already, so reaching here means failure
}

From source file:demo.SsJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}
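
This driver uses setSortComparatorClass as one third of the standard secondary-sort pattern: the partitioner and grouping comparator consider only the natural key, while the sort comparator orders the composite key by natural key first and secondary field second. CompositeKeyComparator and StockKey are not reproduced on this page; the following sketch assumes hypothetical StockKey accessors (getSymbol, getPrice):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch of a secondary-sort comparator. Assumes StockKey (defined in the
// same project) exposes the natural key (getSymbol) and the secondary
// sort field (getPrice); both accessors are hypothetical.
public class CompositeKeyComparator extends WritableComparator {
    protected CompositeKeyComparator() {
        super(StockKey.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        StockKey k1 = (StockKey) a;
        StockKey k2 = (StockKey) b;
        int cmp = k1.getSymbol().compareTo(k2.getSymbol()); // natural key first
        if (cmp != 0) {
            return cmp;
        }
        return Double.compare(k1.getPrice(), k2.getPrice()); // then secondary field
    }
}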

From source file:edu.buffalo.cse.dic.mapreduce.WordCount.java

License:Apache License

@Override
public Map<String, Number> start(String inputFile) {
    try {
        LinkedHashMap<String, Number> topTen = new LinkedHashMap<>();
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/core-site.xml"));
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/hdfs-site.xml"));

        FileSystem fs = FileSystem.get(new URI("wordcount"), conf);
        fs.delete(new Path("wordcount"), true);

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, new Path("wordcount"));
        job.waitForCompletion(true);
        System.out.println("word count done");

        FileSystem fsa = FileSystem.get(new URI("wordcount"), conf);
        fsa.delete(new Path("wordcountfinal"), true);

        Job sortJob = new Job(conf, "sort reducer");
        sortJob.setJarByClass(SortReducerOutput.class);
        sortJob.setMapperClass(OutputBreaker.class);
        sortJob.setSortComparatorClass(ReverseComparator.class);
        sortJob.setReducerClass(SortByCount.class);
        sortJob.setOutputKeyClass(IntWritable.class);
        sortJob.setOutputValueClass(Text.class);
        sortJob.setPartitionerClass(TotalOrderPartitioner.class);
        Path partitionFile = new Path("trendcount", "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(sortJob.getConfiguration(), partitionFile);
        FileInputFormat.addInputPath(sortJob, new Path("wordcount/part-r-00000"));
        FileOutputFormat.setOutputPath(sortJob, new Path("wordcountfinal"));
        sortJob.waitForCompletion(true);
        System.out.println("sort word count");

        Path output = new Path("wordcountfinal/part-r-00000");
        FileSystem fileSystem = FileSystem.get(output.toUri(), conf);
        FileStatus[] items = fileSystem.listStatus(output);
        for (FileStatus item : items) {
            InputStream stream = null;
            // ignoring files like _SUCCESS
            if (item.getPath().getName().startsWith("_")) {
                continue;
            } else {
                stream = fileSystem.open(item.getPath());
            }
            Scanner scan = new Scanner(stream).useDelimiter("\\n");
            for (int i = 0; i < 10; i++) {
                if (scan.hasNext()) {
                    String data = scan.next();
                    topTen.put(data.split("\\t")[1], Integer.parseInt(data.split("\\t")[0]));
                }
            }
        }
        return topTen;
    } catch (IOException | ClassNotFoundException | InterruptedException | URISyntaxException e) {
        e.printStackTrace();
    }
    return null;
}

From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass",
            conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs",
            conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.isi.mavuno.app.mine.HarvestParaphraseCandidates.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorClass",
            conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorArgs",
            conf);
    String numResults = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.NumResults", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: HarvestParaphraseCandidates");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestParaphraseCandidates");

    // harvest all (context, pattern) triples
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusPath", corpusPath);
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusClass", corpusClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorClass", extractorClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorArgs", extractorArgs);
    conf.set("Mavuno.HarvestContextPatternPairs.MinMatches", minMatches);
    conf.set("Mavuno.HarvestContextPatternPairs.OutputPath", outputPath + "/triples");
    new HarvestContextPatternPairs(conf).run();

    FileInputFormat.addInputPath(job, new Path(outputPath + "/triples"));
    FileOutputFormat.setOutputPath(job, new Path(outputPath + "/patterns-all"));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    job.setMapOutputValueClass(TextLongPairWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    // combine scores
    //      conf.set("Mavuno.CombineScores.InputPath", outputPath + "/patterns-all");
    //      conf.set("Mavuno.CombineScores.OutputPath", outputPath + "/patterns");
    //      new CombineScores(conf).run();
    //            
    // only retain the top paraphrases
    conf.set("Mavuno.GetTopResults.InputPath", outputPath + "/patterns-all");
    conf.set("Mavuno.GetTopResults.OutputPath", outputPath + "/top-k");
    conf.set("Mavuno.GetTopResults.NumResults", numResults);
    conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);
    new GetTopResults(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/patterns-all");

    return 0;
}

From source file:edu.isi.mavuno.app.util.ExamplesToSequenceFile.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String contextPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.OutputPath", conf);

    sLogger.info("Tool name: ExamplesToSequenceFile");
    sLogger.info(" - Context path: " + contextPath);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("ExamplesToSequenceFile");

    FileInputFormat.addInputPath(job, new Path(contextPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.extract.CombineGlobalStats.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.OutputPath", conf);
    int numSplits = conf.getInt("Mavuno.CombineGlobalStats.TotalSplits", 1);

    sLogger.info("Tool name: CombineGlobalStats");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Number of splits: " + numSplits);

    Job job = new Job(conf);
    job.setJobName("CombineGlobalStats");

    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(inputPath + "/" + split));
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}
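
The Mavuno examples above all register ContextPatternWritable.Comparator, a comparator for a custom key type. The usual motivation for supplying such a class is that a RawComparator can order keys by comparing their serialized bytes, so the shuffle never has to deserialize them. A generic sketch of that pattern, mirroring what Hadoop's built-in Text.Comparator does rather than Mavuno's actual code:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

// Byte-level comparator: orders serialized keys without deserializing them.
// Assumes the key serializes as a vint length followed by UTF-8 bytes, as
// Text does; a custom Writable would adjust the offsets to its own layout.
public class RawTextKeyComparator extends WritableComparator {
    protected RawTextKeyComparator() {
        super(Text.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int n1 = WritableUtils.decodeVIntSize(b1[s1]); // size of the length prefix
        int n2 = WritableUtils.decodeVIntSize(b2[s2]);
        // Lexicographic comparison of the raw UTF-8 payloads.
        return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
    }
}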