List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException
From source file:edu.indiana.soic.ts.mapreduce.BulkDataLoader.java
License:Apache License
/**
 * Builds (but does not submit) the bulk-import job that reads text files from the
 * configured input directory and loads rows into the HBase stock table.
 *
 * @param configuration   Hadoop configuration the job is created from
 * @param tsConfiguration supplies the HDFS input directory
 * @return the fully configured, unsubmitted {@link Job}
 * @throws IOException if the job or its input/output paths cannot be set up
 */
public static Job configureInsertAllJob(Configuration configuration, TSConfiguration tsConfiguration)
        throws IOException {
    Job bulkImportJob = new Job(configuration, "Bulk Import data");
    bulkImportJob.setJarByClass(InsertAllMapper.class);
    bulkImportJob.setMapperClass(InsertAllMapper.class);
    // Wires the reducer to the HBase stock table (this also configures the table output format).
    TableMapReduceUtil.initTableReducerJob(Constants.STOCK_TABLE_NAME, InsertReducer.class, bulkImportJob);
    bulkImportJob.setMapOutputKeyClass(Text.class);
    bulkImportJob.setMapOutputValueClass(Text.class);
    bulkImportJob.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(bulkImportJob, new Path(tsConfiguration.getInputDir()));
    FileOutputFormat.setOutputPath(bulkImportJob, new Path(Constants.HDFS_OUTPUT_PATH));
    return bulkImportJob;
}
From source file:edu.indiana.soic.ts.mapreduce.DateLoader.java
License:Apache License
public static Job configureInsertAllJob(Configuration configuration, TSConfiguration tsConfiguration) throws IOException { Job job = new Job(configuration, "HBase Date Table"); job.setJarByClass(InsertDateMapper.class); job.setMapperClass(InsertDateMapper.class); TableMapReduceUtil.initTableReducerJob(Constants.STOCK_DATES_TABLE, InsertDateReducer.class, job); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.addInputPath(job, new Path(tsConfiguration.getInputDir())); FileOutputFormat.setOutputPath(job, new Path(Constants.HDFS_OUTPUT_PATH)); return job;// w w w .j a v a2 s. c o m }
From source file:edu.isi.mavuno.app.ie.ExtractRelations.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String typesPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.TypesPath", conf); String primaryTypes = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PrimaryTypes", conf); String patternsPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PatternsPath", conf); String instancesPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.InstancesPath", conf); String plaintextPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.PlaintextPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.CorpusPath", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorClass", conf); String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorArgs", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.OutputPath", conf); sLogger.info("Tool name: ExtractRelations"); sLogger.info(" - Types path: " + typesPath); sLogger.info(" - Primary types: " + primaryTypes); sLogger.info(" - Patterns path: " + patternsPath); if (instancesPath != null) { sLogger.info(" - Instances path: " + instancesPath); }//from w w w . j a va 2 s. 
c o m if (plaintextPath != null) { sLogger.info(" - Plaintext path: " + plaintextPath); } sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor arguments: " + extractorArgs); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("ExtractRelations"); FileInputFormat.addInputPath(job, new Path(corpusPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.app.ie.HarvestSAPInstances.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusClass", conf); int minMatches = Integer .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.MinMatches", conf)); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.OutputPath", conf); sLogger.info("Tool name: HarvestSAPInstances"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Minimum matches: " + minMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestSAPInstances"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);//from w ww . ja v a 2 s. c o m return 0; }
From source file:edu.isi.mavuno.app.ie.HarvestUDAPInstances.java
License:Apache License
/**
 * Runs the UDAP-instance harvesting MapReduce job.
 *
 * @return 0 on success, 1 if the MapReduce job fails
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException   if the job is interrupted while running
 * @throws IOException            on job-setup or HDFS errors
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.OutputPath", conf);

    sLogger.info("Tool name: HarvestUDAPInstances");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestUDAPInstances");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // Input format is chosen at runtime from the configured corpus class name.
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Map emits (Text, Text); the reducer converts values to DoubleWritable.
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // BUG FIX: waitForCompletion's result was previously ignored, so a failed job
    // still returned 0. Propagate failure as nonzero, per the Hadoop Tool convention.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass", conf);//from w w w . j a v a2 s .co m String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs", conf); String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf); sLogger.info("Tool name: HarvestContextPatternPairs"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor args: " + extractorArgs); sLogger.info(" - Min matches: " + minMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestContextPatternPairs"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.app.mine.HarvestParaphraseCandidates.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorClass", conf);//from ww w . ja v a2 s . co m String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorArgs", conf); String numResults = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.NumResults", conf); String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.MinMatches", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.OutputPath", conf); MavunoUtils.createDirectory(conf, outputPath); sLogger.info("Tool name: HarvestParaphraseCandidates"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor args: " + extractorArgs); sLogger.info(" - Min matches: " + minMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestParaphraseCandidates"); // harvest all (context, pattern) triples conf.set("Mavuno.HarvestContextPatternPairs.CorpusPath", corpusPath); conf.set("Mavuno.HarvestContextPatternPairs.CorpusClass", corpusClass); conf.set("Mavuno.HarvestContextPatternPairs.ExtractorClass", extractorClass); conf.set("Mavuno.HarvestContextPatternPairs.ExtractorArgs", extractorArgs); conf.set("Mavuno.HarvestContextPatternPairs.MinMatches", minMatches); conf.set("Mavuno.HarvestContextPatternPairs.OutputPath", outputPath + "/triples"); new HarvestContextPatternPairs(conf).run(); FileInputFormat.addInputPath(job, new Path(outputPath + "/triples")); 
FileOutputFormat.setOutputPath(job, new Path(outputPath + "/patterns-all")); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class); job.setMapOutputValueClass(TextLongPairWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); // combine scores // conf.set("Mavuno.CombineScores.InputPath", outputPath + "/patterns-all"); // conf.set("Mavuno.CombineScores.OutputPath", outputPath + "/patterns"); // new CombineScores(conf).run(); // // only retain the top paraphrases conf.set("Mavuno.GetTopResults.InputPath", outputPath + "/patterns-all"); conf.set("Mavuno.GetTopResults.OutputPath", outputPath + "/top-k"); conf.set("Mavuno.GetTopResults.NumResults", numResults); conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false); new GetTopResults(conf).run(); MavunoUtils.removeDirectory(conf, outputPath + "/patterns-all"); return 0; }
From source file:edu.isi.mavuno.app.mine.HarvestSentences.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String patternPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.PatternPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.OutputPath", conf); sLogger.info("Tool name: HarvestSentences"); sLogger.info(" - Pattern file: " + patternPath); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestSentences"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); job.setNumReduceTasks(0);// www.j a v a 2s . co m job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.app.nlp.HarvestParseGraph.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.OutputPath", conf); sLogger.info("Tool name: HarvestParseGraph"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestParseGraph"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);//from w w w . j a v a2 s. c om return 0; }
From source file:edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); // required parameters String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf); // optional parameters String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf); String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf); sLogger.info("Tool name: ProcessStanfordNLP"); sLogger.info(" - Input path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); if (suTime != null && Boolean.parseBoolean(suTime)) { sLogger.info("- SUTime enabled"); }/*from w w w . j a va 2 s .c om*/ boolean textOutputFormat = false; if (textOutput != null && Boolean.parseBoolean(textOutput)) { sLogger.info("- Text output format enabled"); textOutputFormat = true; } Job job = new Job(conf); job.setJobName("ProcessStanfordNLP"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); // output format -- either plain text or sequencefile (default) if (textOutputFormat) { job.setOutputFormatClass(TextOutputFormat.class); } else { job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); } job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(StanfordParsedDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(StanfordParsedDocument.class); job.setMapperClass(MyMapper.class); job.setJarByClass(ProcessStanfordNLP.class); // no reducers needed job.setNumReduceTasks(0); // run job job.waitForCompletion(true); // print job statistics Counters counters = job.getCounters(); sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue()); sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue()); sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue()); return 0; }