Example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass

List of usage examples for org.apache.hadoop.mapreduce Job setSortComparatorClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass.

Prototype

public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Source Link

Document

Define the comparator that controls how the keys are sorted before they are passed to the Reducer .

Usage

From source file:edu.isi.mavuno.extract.CombineSplits.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf);
    String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf);
    String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase();
    int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf);

    sLogger.info("Tool name: CombineSplits");
    sLogger.info(" - Examples path: " + examplesPath);
    sLogger.info(" - Example stats path: " + exampleStatsPath);
    sLogger.info(" - Split key: " + splitKey);
    sLogger.info(" - Total splits: " + numSplits);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("CombineSplits");

    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split));
    }/*  w w  w.j  av  a2  s.com*/

    if (MavunoUtils.pathExists(conf, exampleStatsPath)) {
        FileInputFormat.addInputPath(job, new Path(exampleStatsPath));
    }

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);

    if ("pattern".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else if ("context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("pattern+context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {
        throw new RuntimeException("Invalid SplitKey in CombineSplits! -- " + splitKey);
    }

    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.extract.Extract.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorTarget", conf).toLowerCase();
    int minContextMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.Extract.MinMatches", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.OutputPath", conf);

    sLogger.info("Tool name: Extract");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Extractor target: " + extractorTarget);
    sLogger.info(" - Min context matches: " + minContextMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("Extract");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    if ("pattern".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    } else if ("context".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdPatternPartitioner.class);
    } else {//from   w  ww. ja  v a  2s  . c o m
        throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.extract.ExtractGlobalStats.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf)
            .toLowerCase();//from  w w w. java 2s .c o  m
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // split examples
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", outputPath + "/../split");
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    new Split(conf).run();

    // get splits
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split");
    int split = 0;
    for (FileStatus file : files) {
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }

        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        sLogger.info(" - Extractor class: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);

        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");

        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split));

        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);

        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        job.waitForCompletion(true);

        split++;
    }

    // combine splits
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    new CombineGlobalStats(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/../split");

    return 0;
}

From source file:edu.isi.mavuno.extract.Split.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.Split.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Split.OutputPath", conf);
    String splitKey = MavunoUtils.getRequiredParam("Mavuno.Split.SplitKey", conf);

    sLogger.info("Tool name: Split");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Split key: " + splitKey);

    Job job = new Job(conf);
    job.setJobName("Split");

    MavunoUtils.recursivelyAddInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setNumReduceTasks(1);//from w  w  w.  jav a 2  s .co  m

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.isi.mavuno.score.GetTopResults.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.OutputPath", conf);
    int numResults = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.GetTopResults.NumResults", conf));
    boolean sequenceFileOutputFormat = conf.getBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);

    sLogger.info("Tool name: GetTopResults");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Number of results: " + numResults);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("GetTopResults");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

    if (sequenceFileOutputFormat) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else {/*from  ww  w .  j  a v  a  2s .co  m*/
        job.setOutputFormatClass(TextOutputFormat.class);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:edu.isi.mavuno.score.ScoreContexts.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
    String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScoreContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context scorer class: " + contextScorerClass);
    sLogger.info(" - Context scorer args: " + contextScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScoreContexts");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);/*from   w w w  .  j a  v a2  s.co m*/

    return 0;
}

From source file:edu.isi.mavuno.score.ScorePatterns.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.OutputPath", conf);
    String patternScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String patternScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScorePatterns");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Pattern scorer class: " + patternScorerClass);
    sLogger.info(" - Pattern scorer args: " + patternScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScorePatterns");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);/* w ww. j  ava  2 s. c  om*/

    return 0;
}

From source file:edu.isi.mavuno.score.UpdateWeights.java

License:Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String statsPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.StatsPath", conf);
    String scoresPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ScoresPath", conf);
    String exampleType = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ExampleType", conf).toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.OutputPath", conf);

    sLogger.info("Tool name: UpdateWeights");
    sLogger.info(" - Stats path: " + statsPath);
    sLogger.info(" - Scores path: " + scoresPath);
    sLogger.info(" - Example type: " + exampleType);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("UpdateWeights");

    FileInputFormat.addInputPath(job, new Path(statsPath));
    FileInputFormat.addInputPath(job, new Path(scoresPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);

    if ("pattern".equals(exampleType)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("context".equals(exampleType)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {/*  ww  w  .j a va  2 s .com*/
        throw new RuntimeException("Invalid ExampleType in UpdateExampleWeight -- " + exampleType);
    }

    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file:edu.umd.shrawanraina.UserLocation.java

License:Apache License

private void runJob2(String basePath, boolean useCombiner, boolean useInMapperCombiner) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(UserLocation.class.getSimpleName());
    job.setJarByClass(UserLocation.class);

    // We need to actually count the number of part files to get the number
    // of partitions (because
    // the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(basePath))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;/*from   w ww.j av a 2  s  .  com*/
    }
    job.setNumReduceTasks(numPartitions);

    FileInputFormat.setInputPaths(job, new Path(basePath));
    String outputPath = basePath + "-out";
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(PairOfStringInt.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputKeyClass(PairOfStringInt.class);
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(MapClass2.class);
    //job.setCombinerClass(ReduceClass2.class);
    job.setReducerClass(ReduceClass2.class);

    //job.setPartitionerClass(CustomKeyPartitioner.class);
    job.setGroupingComparatorClass(CustomGroupingComparator.class);
    job.setSortComparatorClass(CustomKeyComparator.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //return 0;
}

From source file:io.aos.mapreduce.grep.GrepTool.java

License:Apache License

public int run(String[] args) throws Exception {

    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        org.apache.hadoop.util.Tool t;
        return 2;
    }/*from ww  w  .  j  a v  a2  s  . co  m*/

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = getConf();
    conf.set(RegexMapper.PATTERN, args[2]);

    if (args.length == 4) {
        conf.set(RegexMapper.GROUP, args[3]);
    }

    try {

        Job greJob = Job.getInstance(conf);
        greJob.setJobName("GrepSearch");

        FileInputFormat.setInputPaths(greJob, args[0]);

        greJob.setMapperClass(RegexMapper.class);
        greJob.setCombinerClass(LongSumReducer.class);
        greJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(greJob, tempDir);
        greJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        greJob.setOutputKeyClass(Text.class);
        greJob.setOutputValueClass(LongWritable.class);

        greJob.waitForCompletion(true);

        Job sortJob = Job.getInstance(conf);
        sortJob.setJobName("GrepSort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormatClass(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        // Write a single file
        sortJob.setNumReduceTasks(1);

        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setSortComparatorClass( // sort by decreasing freq

                LongWritable.DecreasingComparator.class);

        sortJob.waitForCompletion(true);

    }

    catch (Exception e) {
        return 2;
    }

    finally {
        FileSystem.get(conf).delete(tempDir, true);
    }

    return 0;

}