List of usage examples for org.apache.hadoop.mapreduce Job setSortComparatorClass
public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
From source file:edu.isi.mavuno.extract.CombineSplits.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf); String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf); String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase(); int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1); String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf); sLogger.info("Tool name: CombineSplits"); sLogger.info(" - Examples path: " + examplesPath); sLogger.info(" - Example stats path: " + exampleStatsPath); sLogger.info(" - Split key: " + splitKey); sLogger.info(" - Total splits: " + numSplits); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("CombineSplits"); for (int split = 0; split < numSplits; split++) { FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split)); }/* w w w.j av a2 s.com*/ if (MavunoUtils.pathExists(conf, exampleStatsPath)) { FileInputFormat.addInputPath(job, new Path(exampleStatsPath)); } FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); if ("pattern".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); } else if ("context".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); } else if ("pattern+context".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); } else { throw new RuntimeException("Invalid SplitKey in CombineSplits! 
-- " + splitKey); } job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.extract.Extract.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.InputPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorClass", conf); String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorArgs", conf); String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorTarget", conf).toLowerCase(); int minContextMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.Extract.MinMatches", conf)); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.OutputPath", conf); sLogger.info("Tool name: Extract"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor arguments: " + extractorArgs); sLogger.info(" - Extractor target: " + extractorTarget); sLogger.info(" - Min context matches: " + minContextMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("Extract"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); if ("pattern".equals(extractorTarget)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class); } else if ("context".equals(extractorTarget)) { job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); job.setPartitionerClass(ContextPatternWritable.IdPatternPartitioner.class); } else {//from w ww. ja v a 2s . c o m throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget); } job.setMapOutputKeyClass(ContextPatternWritable.class); job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.extract.ExtractGlobalStats.java
License:Apache License
/**
 * Runs the ExtractGlobalStats pipeline: split the examples, run one extraction job per
 * split, then combine the per-split results.
 *
 * <p>Steps, all driven by the tool configuration under {@code Mavuno.ExtractGlobalStats.*}:
 * <ol>
 *   <li>Run {@link Split} on the input path, writing to {@code <OutputPath>/../split}.</li>
 *   <li>For every file in that directory whose name ends in {@code .examples}, run an
 *       "ExtractGlobalStats" job over the corpus, writing block-compressed sequence files to
 *       {@code <OutputPath>/../split/<n>} where {@code n} counts the processed files.</li>
 *   <li>Run {@code CombineGlobalStats} over the per-split outputs into {@code OutputPath}
 *       and remove the intermediate split directory.</li>
 * </ol>
 *
 * <p>If the Split job or any per-split job fails, the method stops and returns 1 without
 * combining; the intermediate split directory is left in place in that case.
 *
 * @return 0 on success, 1 if the Split job or a per-split job failed
 * @throws ClassNotFoundException if the corpus input format class cannot be loaded
 * @throws InterruptedException propagated from job submission
 * @throws IOException propagated from job setup, submission or directory handling
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf)
            .toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // Intermediate directory holding the split example files and the per-split job outputs.
    String splitDir = outputPath + "/../split";

    // split examples
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", splitDir);
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    if (new Split(conf).run() != 0) {
        return 1;
    }

    // get splits
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, splitDir);

    int split = 0;
    for (FileStatus file : files) {
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }

        // Must be set before the Job is created so the job sees this split's examples path.
        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        sLogger.info(" - Extractor arguments: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);

        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");

        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(splitDir + "/" + split));

        // The corpus input format is only known by name at run time, hence the unchecked cast.
        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);

        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // Combining after a failed split would operate on incomplete data, so stop here.
        if (!job.waitForCompletion(true)) {
            return 1;
        }

        split++;
    }

    // combine splits
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", splitDir + "/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    new CombineGlobalStats(conf).run();

    MavunoUtils.removeDirectory(conf, splitDir);

    return 0;
}
From source file:edu.isi.mavuno.extract.Split.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.Split.InputPath", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Split.OutputPath", conf); String splitKey = MavunoUtils.getRequiredParam("Mavuno.Split.SplitKey", conf); sLogger.info("Tool name: Split"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Output path: " + outputPath); sLogger.info(" - Split key: " + splitKey); Job job = new Job(conf); job.setJobName("Split"); MavunoUtils.recursivelyAddInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1);//from w w w. jav a 2 s .co m job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.score.GetTopResults.java
License:Apache License
/**
 * Configures and runs the GetTopResults MapReduce job.
 *
 * <p>Reads {@code Mavuno.GetTopResults.*} from the tool configuration. Input is text; output
 * is a sequence file when {@code Mavuno.GetTopResults.SequenceFileOutputFormat} is true
 * (default false) and text otherwise.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws NumberFormatException if {@code Mavuno.GetTopResults.NumResults} is not an integer
 * @throws ClassNotFoundException propagated from job submission
 * @throws InterruptedException propagated from job submission
 * @throws IOException propagated from job setup or submission
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.OutputPath", conf);
    int numResults = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.GetTopResults.NumResults", conf));
    boolean sequenceFileOutputFormat = conf.getBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);

    sLogger.info("Tool name: GetTopResults");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Number of results: " + numResults);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("GetTopResults");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);

    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

    // Output format is selectable so the results can feed either humans or later jobs.
    if (sequenceFileOutputFormat) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormatClass(TextOutputFormat.class);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.score.ScoreContexts.java
License:Apache License
/**
 * Configures and runs the ScoreContexts MapReduce job.
 *
 * <p>Reads {@code Mavuno.ScoreContexts.InputPath}, {@code Mavuno.ScoreContexts.OutputPath},
 * {@code Mavuno.Scorer.Class} and {@code Mavuno.Scorer.Args} from the tool configuration.
 * Input is sequence files; output is block-compressed sequence files keyed by
 * {@code ContextPatternWritable} with {@code DoubleWritable} values.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws ClassNotFoundException propagated from job submission
 * @throws InterruptedException propagated from job submission
 * @throws IOException propagated from job setup or submission
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
    String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScoreContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context scorer class: " + contextScorerClass);
    sLogger.info(" - Context scorer args: " + contextScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScoreContexts");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    // Mappers emit ScoreWritable values; the reducer's final output value is a DoubleWritable.
    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.score.ScorePatterns.java
License:Apache License
/**
 * Configures and runs the ScorePatterns MapReduce job.
 *
 * <p>Reads {@code Mavuno.ScorePatterns.InputPath}, {@code Mavuno.ScorePatterns.OutputPath},
 * {@code Mavuno.Scorer.Class} and {@code Mavuno.Scorer.Args} from the tool configuration.
 * Input is sequence files; output is block-compressed sequence files keyed by
 * {@code ContextPatternWritable} with {@code DoubleWritable} values. Keys are sorted with
 * {@code ContextPatternWritable.IdPatternComparator}.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws ClassNotFoundException propagated from job submission
 * @throws InterruptedException propagated from job submission
 * @throws IOException propagated from job setup or submission
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.OutputPath", conf);
    String patternScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String patternScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScorePatterns");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Pattern scorer class: " + patternScorerClass);
    sLogger.info(" - Pattern scorer args: " + patternScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScorePatterns");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    // Mappers emit ScoreWritable values; the reducer's final output value is a DoubleWritable.
    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.score.UpdateWeights.java
License:Apache License
/**
 * Configures and runs the UpdateWeights MapReduce job.
 *
 * <p>Reads {@code Mavuno.UpdateWeights.*} from the tool configuration. Both the stats path
 * and the scores path are added as sequence-file inputs; output is sequence files. The sort
 * comparator is chosen from the (lower-cased) example type: {@code pattern} selects
 * {@code IdPatternComparator}, {@code context} selects {@code Comparator}.
 *
 * @return 0 if the job completed successfully, 1 if it failed
 * @throws RuntimeException if the example type is neither {@code pattern} nor {@code context}
 * @throws ClassNotFoundException propagated from job submission
 * @throws InterruptedException propagated from job submission
 * @throws IOException propagated from job setup or submission
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String statsPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.StatsPath", conf);
    String scoresPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ScoresPath", conf);
    String exampleType = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ExampleType", conf).toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.OutputPath", conf);

    sLogger.info("Tool name: UpdateWeights");
    sLogger.info(" - Stats path: " + statsPath);
    sLogger.info(" - Scores path: " + scoresPath);
    sLogger.info(" - Example type: " + exampleType);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("UpdateWeights");

    // Stats and scores are both fed to the same mapper.
    FileInputFormat.addInputPath(job, new Path(statsPath));
    FileInputFormat.addInputPath(job, new Path(scoresPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);

    if ("pattern".equals(exampleType)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("context".equals(exampleType)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {
        throw new RuntimeException("Invalid ExampleType in UpdateExampleWeight -- " + exampleType);
    }

    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.umd.shrawanraina.UserLocation.java
License:Apache License
private void runJob2(String basePath, boolean useCombiner, boolean useInMapperCombiner) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf); job.setJobName(UserLocation.class.getSimpleName()); job.setJarByClass(UserLocation.class); // We need to actually count the number of part files to get the number // of partitions (because // the directory might contain _log). int numPartitions = 0; for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(basePath))) { if (s.getPath().getName().contains("part-")) numPartitions++;/*from w ww.j av a 2 s . com*/ } job.setNumReduceTasks(numPartitions); FileInputFormat.setInputPaths(job, new Path(basePath)); String outputPath = basePath + "-out"; FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(PairOfStringInt.class); job.setMapOutputValueClass(NullWritable.class); job.setOutputKeyClass(PairOfStringInt.class); job.setOutputValueClass(NullWritable.class); job.setMapperClass(MapClass2.class); //job.setCombinerClass(ReduceClass2.class); job.setReducerClass(ReduceClass2.class); //job.setPartitionerClass(CustomKeyPartitioner.class); job.setGroupingComparatorClass(CustomGroupingComparator.class); job.setSortComparatorClass(CustomKeyComparator.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(conf).delete(outputDir, true); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); //return 0; }
From source file:io.aos.mapreduce.grep.GrepTool.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 3) { System.out.println("Grep <inDir> <outDir> <regex> [<group>]"); ToolRunner.printGenericCommandUsage(System.out); org.apache.hadoop.util.Tool t; return 2; }/*from ww w . j a v a2 s . co m*/ Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Configuration conf = getConf(); conf.set(RegexMapper.PATTERN, args[2]); if (args.length == 4) { conf.set(RegexMapper.GROUP, args[3]); } try { Job greJob = Job.getInstance(conf); greJob.setJobName("GrepSearch"); FileInputFormat.setInputPaths(greJob, args[0]); greJob.setMapperClass(RegexMapper.class); greJob.setCombinerClass(LongSumReducer.class); greJob.setReducerClass(LongSumReducer.class); FileOutputFormat.setOutputPath(greJob, tempDir); greJob.setOutputFormatClass(SequenceFileOutputFormat.class); greJob.setOutputKeyClass(Text.class); greJob.setOutputValueClass(LongWritable.class); greJob.waitForCompletion(true); Job sortJob = Job.getInstance(conf); sortJob.setJobName("GrepSort"); FileInputFormat.setInputPaths(sortJob, tempDir); sortJob.setInputFormatClass(SequenceFileInputFormat.class); sortJob.setMapperClass(InverseMapper.class); // Write a single file sortJob.setNumReduceTasks(1); FileOutputFormat.setOutputPath(sortJob, new Path(args[1])); sortJob.setSortComparatorClass( // sort by decreasing freq LongWritable.DecreasingComparator.class); sortJob.waitForCompletion(true); } catch (Exception e) { return 2; } finally { FileSystem.get(conf).delete(tempDir, true); } return 0; }