List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
From source file:edu.isi.mavuno.app.util.ExamplesToSequenceFile.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String contextPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.InputPath", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExamplesToSequenceFile.OutputPath", conf); sLogger.info("Tool name: ExamplesToSequenceFile"); sLogger.info(" - Context path: " + contextPath); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("ExamplesToSequenceFile"); FileInputFormat.addInputPath(job, new Path(contextPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);//from www. j a v a 2 s. c o m return 0; }
From source file:edu.isi.mavuno.extract.Extract.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.InputPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorClass", conf); String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorArgs", conf); String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorTarget", conf).toLowerCase(); int minContextMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.Extract.MinMatches", conf)); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.OutputPath", conf); sLogger.info("Tool name: Extract"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor arguments: " + extractorArgs); sLogger.info(" - Extractor target: " + extractorTarget); sLogger.info(" - Min context matches: " + minContextMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("Extract"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); if ("pattern".equals(extractorTarget)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class); } else if ("context".equals(extractorTarget)) { job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); job.setPartitionerClass(ContextPatternWritable.IdPatternPartitioner.class); } else {//from w w w .j a v a 2s .c om throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget); } job.setMapOutputKeyClass(ContextPatternWritable.class); job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.extract.ExtractGlobalStats.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf); String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf); String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf) .toLowerCase();/* ww w .j a va2 s .c om*/ String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf); // split examples conf.set("Mavuno.Split.InputPath", inputPath); conf.set("Mavuno.Split.OutputPath", outputPath + "/../split"); conf.set("Mavuno.Split.SplitKey", extractorTarget); new Split(conf).run(); // get splits FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split"); int split = 0; for (FileStatus file : files) { if (!file.getPath().getName().endsWith(".examples")) { continue; } conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString()); sLogger.info("Tool name: ExtractGlobalStats"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Examples path: " + file.getPath()); sLogger.info(" - Example split: " + split); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor class: " + extractorArgs); sLogger.info(" - Extractor target: " + extractorTarget); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("ExtractGlobalStats"); MavunoUtils.recursivelyAddInputPaths(job, 
corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); split++; } // combine splits conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split); conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/"); conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath); new CombineGlobalStats(conf).run(); MavunoUtils.removeDirectory(conf, outputPath + "/../split"); return 0; }
From source file:edu.isi.mavuno.extract.Split.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.Split.InputPath", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Split.OutputPath", conf); String splitKey = MavunoUtils.getRequiredParam("Mavuno.Split.SplitKey", conf); sLogger.info("Tool name: Split"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Output path: " + outputPath); sLogger.info(" - Split key: " + splitKey); Job job = new Job(conf); job.setJobName("Split"); MavunoUtils.recursivelyAddInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1);//from www. ja va2s .co m job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.score.GetTopResults.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String inputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.InputPath", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.GetTopResults.OutputPath", conf); int numResults = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.GetTopResults.NumResults", conf)); boolean sequenceFileOutputFormat = conf.getBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false); sLogger.info("Tool name: GetTopResults"); sLogger.info(" - Input path: " + inputPath); sLogger.info(" - Number of results: " + numResults); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("GetTopResults"); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(TextInputFormat.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class); if (sequenceFileOutputFormat) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else {//from w w w .ja va2 s .co m job.setOutputFormatClass(TextOutputFormat.class); } job.setMapOutputKeyClass(ContextPatternWritable.class); job.setMapOutputValueClass(DoubleWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(DoubleWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.score.ScoreContexts.java
License:Apache License
/**
 * Scores contexts with the configured scorer class, writing block-compressed
 * SequenceFile output keyed by {@link ContextPatternWritable}.
 *
 * @return 0 on success, 1 if the MapReduce job fails
 * @throws IOException on HDFS/path errors
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException if the job is interrupted
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    // Required tool parameters; getRequiredParam fails fast if missing.
    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
    String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScoreContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context scorer class: " + contextScorerClass);
    sLogger.info(" - Context scorer args: " + contextScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScoreContexts");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Fix: propagate job failure to the caller instead of always returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.score.ScorePatterns.java
License:Apache License
/**
 * Scores patterns with the configured scorer class, writing block-compressed
 * SequenceFile output keyed by {@link ContextPatternWritable}.
 *
 * @return 0 on success, 1 if the MapReduce job fails
 * @throws IOException on HDFS/path errors
 * @throws ClassNotFoundException if a job class cannot be resolved
 * @throws InterruptedException if the job is interrupted
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    // Required tool parameters; getRequiredParam fails fast if missing.
    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.OutputPath", conf);
    String patternScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String patternScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScorePatterns");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Pattern scorer class: " + patternScorerClass);
    sLogger.info(" - Pattern scorer args: " + patternScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScorePatterns");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    // Sort by (id, pattern) but partition by id only, so all patterns for an
    // id reach the same reducer in sorted order.
    job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Fix: propagate job failure to the caller instead of always returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.score.UpdateWeights.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String statsPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.StatsPath", conf); String scoresPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ScoresPath", conf); String exampleType = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.ExampleType", conf).toLowerCase(); String outputPath = MavunoUtils.getRequiredParam("Mavuno.UpdateWeights.OutputPath", conf); sLogger.info("Tool name: UpdateWeights"); sLogger.info(" - Stats path: " + statsPath); sLogger.info(" - Scores path: " + scoresPath); sLogger.info(" - Example type: " + exampleType); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("UpdateWeights"); FileInputFormat.addInputPath(job, new Path(statsPath)); FileInputFormat.addInputPath(job, new Path(scoresPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); if ("pattern".equals(exampleType)) { job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); } else if ("context".equals(exampleType)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); } else {//from w w w . j ava 2 s . co m throw new RuntimeException("Invalid ExampleType in UpdateExampleWeight -- " + exampleType); } job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class); job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.rosehulman.CollocDriver.java
License:Apache License
/** * pass1: generate collocations, ngrams/*from www . j a v a2 s . c o m*/ */ @SuppressWarnings("deprecation") private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams, int maxNGramSize, int reduceTasks, int minSupport) throws IOException, ClassNotFoundException, InterruptedException { Configuration con = new Configuration(baseConf); con.setBoolean(EMIT_UNIGRAMS, emitUnigrams); con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize); con.setInt(CollocReducer.MIN_SUPPORT, minSupport); Job job = new Job(con); job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input); job.setJarByClass(CollocDriver.class); job.setMapOutputKeyClass(GramKey.class); job.setMapOutputValueClass(Gram.class); job.setPartitionerClass(GramKeyPartitioner.class); job.setGroupingComparatorClass(GramKeyGroupComparator.class); job.setOutputKeyClass(Gram.class); job.setOutputValueClass(Gram.class); job.setCombinerClass(CollocCombiner.class); FileInputFormat.setInputPaths(job, input); Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY); FileOutputFormat.setOutputPath(job, outputPath); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(CollocMapper.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setReducerClass(CollocReducer.class); job.setNumReduceTasks(reduceTasks); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { throw new IllegalStateException("Job failed!"); } return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue(); }
From source file:edu.umd.cloud9.collection.wikipedia.ExtractWikipediaAnchorTextWithWindow.java
License:Apache License
private void task1(String inputPath, String outputPath) throws IOException, ClassNotFoundException, InterruptedException { LOG.info("Exracting anchor text (phase 1)..."); LOG.info(" - input: " + inputPath); LOG.info(" - output: " + outputPath); Job job = Job.getInstance(getConf()); job.setJarByClass(ExtractWikipediaAnchorTextWithWindow.class); job.setJobName(//from www .j ava 2 s .c om String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath)); // 10 reducers is reasonable. job.setNumReduceTasks(10); // increase heap job.getConfiguration().set("mapreduce.job.user.classpath.first", "true"); job.getConfiguration().set("mapreduce.map.memory.mb", "6144"); job.getConfiguration().set("mapreduce.reduce.memory.mb", "6144"); job.getConfiguration().set("mapreduce.map.java.opts", "-Xmx6144m"); job.getConfiguration().set("mapreduce.reduce.java.opts", "-Xmx6144m"); job.getConfiguration().set("mapreduce.job.user.classpath.first", "true"); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); // job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(PairOfStringInt.class); job.setMapOutputValueClass(PairOfStrings.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(PairOfIntString.class); job.setMapperClass(MyMapper1.class); job.setReducerClass(MyReducer1.class); job.setPartitionerClass(MyPartitioner1.class); // Delete the output directory if it exists already. FileSystem.get(job.getConfiguration()).delete(new Path(outputPath), true); job.waitForCompletion(true); }