Example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setSortComparatorClass.

Prototype

public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls how the keys are sorted before they are passed to the Reducer.
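
For orientation, here is a minimal sketch of the typical call site (the job and class names are illustrative, not taken from the examples below): the comparator is registered on the Job before submission and is applied to the map output keys during the shuffle sort.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class SortComparatorSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "descending key sort");

        // Map output keys are LongWritable; values are Text.
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Sort map output keys in descending order during the shuffle.
        // DecreasingComparator ships with Hadoop and inverts the natural order.
        job.setSortComparatorClass(LongWritable.DecreasingComparator.class);
    }
}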

Usage

From source file:csc555.ebratt.depaul.edu.GildedSorterDriver.java

License:Open Source License

/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument (args[2]).
 * 
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the GildedSorterReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 * 
 */
@Override
public int run(String[] args) throws Exception {

    Job job = new Job(getConf());
    job.setJobName("sorted gild counts");

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(GildedSorterMapper.class);
    job.setReducerClass(GildedSorterReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(GildedSorterReducer.class);
    }

    // sort in descending order
    job.setSortComparatorClass(LongWritable.DecreasingComparator.class);

    // The Jar file to run
    job.setJarByClass(GildedSorterDriver.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
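
LongWritable.DecreasingComparator, registered above, ships with Hadoop and simply applies the natural LongWritable comparison with its operands swapped. If you need the same behavior for another key type, a minimal sketch of the equivalent comparator (the class name here is illustrative):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;

// Equivalent of LongWritable.DecreasingComparator: reuse the natural
// comparator (including its byte-level fast path) with operands swapped.
public class DescendingLongComparator extends LongWritable.Comparator {
    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return super.compare(b, a);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return super.compare(b2, s2, l2, b1, s1, l1);
    }
}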

From source file:csc555.ebratt.depaul.edu.GildPercentDriverPass2.java

License:Open Source License

/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument (args[2]).
 * 
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the GildPercentReducerPass2.class as
 *            the combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 * 
 */
@Override
public int run(String[] args) throws Exception {

    Job job = new Job(getConf());
    job.setJobName("sorted gild percent");

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(GildPercentMapperPass2.class);
    job.setReducerClass(GildPercentReducerPass2.class);

    // Mapper output classes
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(GildPercentReducerPass2.class);
    }

    // sort in descending order
    job.setSortComparatorClass(DoubleWritableDescendingComparator.class);

    // The Jar file to run
    job.setJarByClass(GildPercentDriverPass2.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
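
Unlike the previous example, this driver sorts DoubleWritable keys, and DoubleWritableDescendingComparator is a project-specific class whose source is not shown on this page. A plausible reconstruction, assuming it follows the usual pattern of reversing the natural order (a sketch, not the project's actual code):

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical reconstruction: invert DoubleWritable's natural order so
// the largest percentages come first in the reducer input.
public class DoubleWritableDescendingComparator extends WritableComparator {
    public DoubleWritableDescendingComparator() {
        // true => allocate key instances so compare() can deserialize into them
        super(DoubleWritable.class, true);
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return b.compareTo(a); // swap operands to reverse the ascending order
    }
}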

From source file:csc555.ebratt.depaul.edu.VoteSorterDriver.java

License:Open Source License

/**
 * Runs the driver by creating a new hadoop Job based on the configuration.
 * Defines the path in/out based on the first two arguments. Allows for an
 * optional combiner based on the third argument (args[2]).
 * 
 * @param args
 *            [0] the input directory on HDFS
 * @param args
 *            [1] the output directory on HDFS
 * @param args
 *            [2] tells the system whether or not to use a combiner ("yes")
 *            and, if so, it will use the VoteSorterReducer.class as the
 *            combiner.
 * @throws Exception
 *             if there is an issue with any of the arguments
 * 
 */
@Override
public int run(String[] args) throws Exception {

    Job job = new Job(getConf());
    job.setJobName("sorted vote counts");

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    // to ensure output is sorted
    job.setNumReduceTasks(1);

    // Mapper and Reducer Classes to use
    job.setMapperClass(VoteSorterMapper.class);
    job.setReducerClass(VoteSorterReducer.class);

    // Mapper output classes
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Input format class
    job.setInputFormatClass(TextInputFormat.class);

    // Reducer output classes
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    // Output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // Combiner
    if (args[2].equals("yes")) {
        job.setCombinerClass(VoteSorterReducer.class);
    }

    // sort in descending order
    job.setSortComparatorClass(LongWritable.DecreasingComparator.class);

    // The Jar file to run
    job.setJarByClass(VoteSorterDriver.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:DataCubeRefresh.Grep.java

License:Apache License

/**
 * Run function.
 * @param args arguments
 * @return error code
 * @throws Exception if an exception occurs
 */
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inUrl> <outUrl> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Job grepJob = new Job(getConf());
    Job sortJob = new Job(getConf());

    String tempStreamTag = UUID.randomUUID().toString();

    try {
        grepJob.setJobName("grep-search");

        TextHStreamingInputFormat.addInputStream(grepJob, 1000, 600, -1, "", false, args[0]);
        HStreamingJobConf.setIsStreamingJob(grepJob, true);
        grepJob.setMapperClass(RegexMapper.class);
        grepJob.getConfiguration().set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.getConfiguration().set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);
        grepJob.setInputFormatClass(TextHStreamingInputFormat.class);
        grepJob.setOutputFormatClass(TextHStreamingOutputFormat.class);
        HStreamingOutputFormat.setOutputStreamTag(grepJob, tempStreamTag);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);
        grepJob.setJarByClass(this.getClass());

        grepJob.submit();

        sortJob.setJobName("grep-sort");
        sortJob.setInputFormatClass(TextHStreamingInputFormat.class);
        HStreamingJobConf.setIsStreamingJob(sortJob, true);

        // add previous stream partition/reducer 0 as input. 
        HStreamingInputFormat.addInputStreamTag(sortJob, tempStreamTag, 0);

        sortJob.setMapperClass(InverseTextMapper.class);
        sortJob.setNumReduceTasks(1); // single output stream
        sortJob.setOutputFormatClass(TextHStreamingOutputFormat.class);
        TextHStreamingOutputFormat.setOutputPath(sortJob, args[1]);
        sortJob.setSortComparatorClass( // sort by decreasing frequency
                LongWritable.DecreasingComparator.class);
        sortJob.setJarByClass(this.getClass());
        sortJob.submit();

        return sortJob.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
        try {
            grepJob.killJob();
        } catch (Exception e1) {
            // ignore
        }
        try {
            sortJob.killJob();
        } catch (Exception e2) {
            // ignore
        }
    }
    return 1; // the try block returned already, so reaching here means failure
}

From source file:demo.SsJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}
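
This driver uses setSortComparatorClass as one third of the standard secondary-sort pattern: the partitioner and grouping comparator consider only the natural key, while the sort comparator orders the composite key by natural key first and secondary field second. CompositeKeyComparator and StockKey are not reproduced on this page; the following sketch assumes hypothetical StockKey accessors (getSymbol, getPrice):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch of a secondary-sort comparator. Assumes StockKey (defined in the
// same project) exposes the natural key (getSymbol) and the secondary
// sort field (getPrice); both accessors are hypothetical.
public class CompositeKeyComparator extends WritableComparator {
    protected CompositeKeyComparator() {
        super(StockKey.class, true);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        StockKey k1 = (StockKey) a;
        StockKey k2 = (StockKey) b;
        int cmp = k1.getSymbol().compareTo(k2.getSymbol()); // natural key first
        if (cmp != 0) {
            return cmp;
        }
        return Double.compare(k1.getPrice(), k2.getPrice()); // then secondary field
    }
}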

From source file:edu.buffalo.cse.dic.mapreduce.WordCount.java

License:Apache License

@Override
public Map<String, Number> start(String inputFile) {
    try {
        LinkedHashMap<String, Number> topTen = new LinkedHashMap<>();
        Configuration conf = new Configuration();
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/core-site.xml"));
        conf.addResource(new Path("/usr/local/hadoop/etc/hadoop/hdfs-site.xml"));

        FileSystem fs = FileSystem.get(new URI("wordcount"), conf);
        fs.delete(new Path("wordcount"), true);

        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputFile));
        FileOutputFormat.setOutputPath(job, new Path("wordcount"));
        job.waitForCompletion(true);
        System.out.println("word count done");

        FileSystem fsa = FileSystem.get(new URI("wordcount"), conf);
        fsa.delete(new Path("wordcountfinal"), true);

        Job sortJob = new Job(conf, "sort reducer");
        sortJob.setJarByClass(SortReducerOutput.class);
        sortJob.setMapperClass(OutputBreaker.class);
        sortJob.setSortComparatorClass(ReverseComparator.class);
        sortJob.setReducerClass(SortByCount.class);
        sortJob.setOutputKeyClass(IntWritable.class);
        sortJob.setOutputValueClass(Text.class);
        sortJob.setPartitionerClass(TotalOrderPartitioner.class);
        Path partitionFile = new Path("trendcount", "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(sortJob.getConfiguration(), partitionFile);
        FileInputFormat.addInputPath(sortJob, new Path("wordcount/part-r-00000"));
        FileOutputFormat.setOutputPath(sortJob, new Path("wordcountfinal"));
        sortJob.waitForCompletion(true);
        System.out.println("sort word count");

        Path output = new Path("wordcountfinal/part-r-00000");
        FileSystem fileSystem = FileSystem.get(output.toUri(), conf);
        FileStatus[] items = fileSystem.listStatus(output);
        for (FileStatus item : items) {
            InputStream stream = null;
            // ignoring files like _SUCCESS
            if (item.getPath().getName().startsWith("_")) {
                continue;
            } else {
                stream = fileSystem.open(item.getPath());
            }
            Scanner scan = new Scanner(stream).useDelimiter("\\n");
            for (int i = 0; i < 10; i++) {
                if (scan.hasNext()) {
                    String data = scan.next();
                    topTen.put(data.split("\\t")[1], Integer.parseInt(data.split("\\t")[0]));
                }
            }
        }
        return topTen;
    } catch (IOException | ClassNotFoundException | InterruptedException | URISyntaxException e) {
        e.printStackTrace();
    }
    return null;
}

From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass",
            conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs",
            conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.isi.mavuno.app.mine.HarvestParaphraseCandidates.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorClass",
            conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorArgs",
            conf);
    String numResults = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.NumResults", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: HarvestParaphraseCandidates");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestParaphraseCandidates");

    // harvest all (context, pattern) triples
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusPath", corpusPath);
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusClass", corpusClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorClass", extractorClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorArgs", extractorArgs);
    conf.set("Mavuno.HarvestContextPatternPairs.MinMatches", minMatches);
    conf.set("Mavuno.HarvestContextPatternPairs.OutputPath", outputPath + "/triples");
    new HarvestContextPatternPairs(conf).run();

    FileInputFormat.addInputPath(job, new Path(outputPath + "/triples"));
    FileOutputFormat.setOutputPath(job, new Path(outputPath + "/patterns-all"));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    job.setMapOutputValueClass(TextLongPairWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    // combine scores
    //      conf.set("Mavuno.CombineScores.InputPath", outputPath + "/patterns-all");
    //      conf.set("Mavuno.CombineScores.OutputPath", outputPath + "/patterns");
    //      new CombineScores(conf).run();
    //            
    // only retain the top paraphrases
    conf.set("Mavuno.GetTopResults.InputPath", outputPath + "/patterns-all");
    conf.set("Mavuno.GetTopResults.OutputPath", outputPath + "/top-k");
    conf.set("Mavuno.GetTopResults.NumResults", numResults);
    conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);
    new GetTopResults(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/patterns-all");

    return 0;
}

From source file:edu.isi.mavuno.app.util.ExamplesToSequenceFile.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String contextPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.OutputPath", conf);

    sLogger.info("Tool name: ExamplesToSequenceFile");
    sLogger.info(" - Context path: " + contextPath);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("ExamplesToSequenceFile");

    FileInputFormat.addInputPath(job, new Path(contextPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.extract.CombineGlobalStats.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.OutputPath", conf);
    int numSplits = conf.getInt("Mavuno.CombineGlobalStats.TotalSplits", 1);

    sLogger.info("Tool name: CombineGlobalStats");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Number of splits: " + numSplits);

    Job job = new Job(conf);
    job.setJobName("CombineGlobalStats");

    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(inputPath + "/" + split));
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}
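
The Mavuno examples above all register ContextPatternWritable.Comparator, a comparator for a custom key type. The usual motivation for supplying such a class is that a RawComparator can order keys by comparing their serialized bytes, so the shuffle never has to deserialize them. A generic sketch of that pattern, mirroring what Hadoop's built-in Text.Comparator does rather than Mavuno's actual code:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

// Byte-level comparator: orders serialized keys without deserializing them.
// Assumes the key serializes as a vint length followed by UTF-8 bytes, as
// Text does; a custom Writable would adjust the offsets to its own layout.
public class RawTextKeyComparator extends WritableComparator {
    protected RawTextKeyComparator() {
        super(Text.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int n1 = WritableUtils.decodeVIntSize(b1[s1]); // size of the length prefix
        int n2 = WritableUtils.decodeVIntSize(b2[s2]);
        // Lexicographic comparison of the raw UTF-8 payloads.
        return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
    }
}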