List of usage examples for the method org.apache.hadoop.mapreduce.Job#setJobName
public void setJobName(String name) throws IllegalStateException
From source file:edu.gslis.ts.hadoop.ThriftBulkLoader.java
License:Apache License
public int run(String[] args) throws Exception { String tableName = args[0];/*from w w w . j a v a 2 s .c o m*/ String inputPath = args[1]; String outputPath = args[2]; Path topicsFile = new Path(args[3]); Path vocabFile = new Path(args[4]); Path dateBinFile = new Path(args[5]); Configuration config = getConf(); config.set("hbase.table.name", tableName); HBaseConfiguration.addHbaseResources(config); Job job = Job.getInstance(config); job.setJarByClass(ThriftBulkLoader.class); job.setJobName("Bulk Loading HBase Table::" + tableName); job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapperClass(ThriftFilterMapper.class); Path output = new Path(outputPath); FileInputFormat.addInputPath(job, new Path(inputPath)); FileInputFormat.setInputDirRecursive(job, true); FileOutputFormat.setOutputPath(job, output); job.setMapOutputValueClass(Put.class); job.addCacheFile(topicsFile.toUri()); job.addCacheFile(vocabFile.toUri()); job.addCacheFile(dateBinFile.toUri()); job.getConfiguration().setBoolean("mapreduce.map.output.compress", true); job.getConfiguration().setClass("mapred.map.output.compression.codec", org.apache.hadoop.io.compress.SnappyCodec.class, org.apache.hadoop.io.compress.CompressionCodec.class); job.getConfiguration().set("hfile.compression", Compression.Algorithm.SNAPPY.getName()); //RegionLocator regionLocator = conn.getRegionLocator(tableName); //HFileOutputFormat2.configureIncrementalLoad(job, new HTable(config,tableName)); Connection con = ConnectionFactory.createConnection(config); TableName htableName = TableName.valueOf(tableName); HFileOutputFormat2.configureIncrementalLoad(job, con.getTable(htableName), con.getRegionLocator(htableName)); job.waitForCompletion(true); if (job.isSuccessful()) { // Couldn't find a better way to do this. The LoadIncrementalHFiles // seems to want 777 permissions on the output directory. 
try { Runtime rt = Runtime.getRuntime(); rt.exec("hadoop fs -chmod -R 777 " + output); } catch (Exception e) { e.printStackTrace(); } /* LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config); HTable htable = new HTable(config, tableName); loader.doBulkLoad(new Path(outputPath), htable); */ } else { throw new IOException("error with job"); } return 0; // - /* Job job = Job.getInstance(config); job.setJarByClass(ThriftBulkLoader.class); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(Put.class); job.setInputFormatClass(ThriftFileInputFormat.class); //HFileOutputFormat2.configureIncrementalLoad(job, htable); FileInputFormat.addInputPath(job, new Path(inputPath)); FileInputFormat.setInputDirRecursive(job, true); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.addCacheFile(topicsFile.toUri()); job.addCacheFile(vocabFile.toUri()); job.setMapperClass(ThriftFilterMapper.class); boolean b = job.waitForCompletion(true); if (!b) { throw new IOException("error with job"); } LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config); loader.doBulkLoad(new Path(outputPath), htable); return 0; */ }
From source file:edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 4) { printUsage();//from ww w .jav a 2 s .c o m } // all directories are in HDFS tokenizedDocDir = args[0]; dictDir = args[1]; outputDir = args[2]; numReducers = Integer.valueOf(args[3]); logger.info("PartialVectorsFromTokenizedDoc "); logger.info(" - tokenizedDocDir: " + tokenizedDocDir); logger.info(" - dictDir: " + dictDir); logger.info(" - outputDir: " + outputDir); logger.info(" - numReducers: " + numReducers); Path tokenizedDocPath = new Path(tokenizedDocDir); Path dictPath = new Path(dictDir); Path outputPath = new Path(outputDir); // get dimension Configuration conf = getConf(); int dimension = 0; for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true, conf)) { dimension++; } logger.info("dimension of a vector: " + dimension); // submit job long t0 = System.currentTimeMillis(); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); conf.setInt(PartialVectorMerger.DIMENSION, dimension); DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf); Job job = new Job(conf); job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir + ", dictionary-file: " + dictDir); job.setJarByClass(PartialVectorsFromTokenizedDoc.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(StringTuple.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileInputFormat.setInputPaths(job, tokenizedDocPath); FileOutputFormat.setOutputPath(job, outputPath); HadoopUtil.delete(conf, outputPath); job.setMapperClass(Mapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setReducerClass(TFPartialVectorReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setNumReduceTasks(numReducers); job.waitForCompletion(true); long t1 = 
System.currentTimeMillis(); logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds."); return 0; }
From source file:edu.isi.mavuno.app.ie.ExtractRelations.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String typesPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.TypesPath", conf); String primaryTypes = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PrimaryTypes", conf); String patternsPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PatternsPath", conf); String instancesPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.InstancesPath", conf); String plaintextPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.PlaintextPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.CorpusPath", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorClass", conf); String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorArgs", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.OutputPath", conf); sLogger.info("Tool name: ExtractRelations"); sLogger.info(" - Types path: " + typesPath); sLogger.info(" - Primary types: " + primaryTypes); sLogger.info(" - Patterns path: " + patternsPath); if (instancesPath != null) { sLogger.info(" - Instances path: " + instancesPath); }//from www . j av a2s . 
c om if (plaintextPath != null) { sLogger.info(" - Plaintext path: " + plaintextPath); } sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor arguments: " + extractorArgs); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("ExtractRelations"); FileInputFormat.addInputPath(job, new Path(corpusPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.app.ie.HarvestSAPInstances.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.CorpusClass", conf); int minMatches = Integer .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.MinMatches", conf)); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSAPInstances.OutputPath", conf); sLogger.info("Tool name: HarvestSAPInstances"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Minimum matches: " + minMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestSAPInstances"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyCombiner.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);//from www . j a va 2s. c o m return 0; }
From source file:edu.isi.mavuno.app.ie.HarvestUDAPInstances.java
License:Apache License
/**
 * Runs the HarvestUDAPInstances MapReduce job.
 *
 * <p>Reads its settings from the {@code Mavuno.HarvestUDAPInstances.*} configuration
 * keys. The input format class is loaded by name from the {@code CorpusClass} setting,
 * input paths are added recursively from the corpus path, and text output with
 * {@code Text} keys and {@code DoubleWritable} values is written to the output path.
 *
 * @return {@code 0} if the job completed successfully, {@code 1} otherwise
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws IOException if the job cannot be configured or submitted
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestUDAPInstances.OutputPath", conf);

    sLogger.info("Tool name: HarvestUDAPInstances");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestUDAPInstances");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Surface job failure through the exit code rather than always returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java
License:Apache License
/**
 * Runs the HarvestContextPatternPairs MapReduce job.
 *
 * <p>Reads its settings from the {@code Mavuno.HarvestContextPatternPairs.*}
 * configuration keys. The input format class is loaded by name from the
 * {@code CorpusClass} setting and input paths are added recursively from the corpus
 * path. Output is a block-compressed sequence file of
 * {@code ContextPatternWritable} keys and {@code LongWritable} values.
 *
 * @return {@code 0} if the job completed successfully, {@code 1} otherwise
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException if waiting for the job is interrupted
 * @throws IOException if the job cannot be configured or submitted
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass",
            conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    // Surface job failure through the exit code rather than always returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.isi.mavuno.app.mine.HarvestParaphraseCandidates.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusClass", conf); String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorClass", conf);/*www .java 2 s .c om*/ String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorArgs", conf); String numResults = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.NumResults", conf); String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.MinMatches", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.OutputPath", conf); MavunoUtils.createDirectory(conf, outputPath); sLogger.info("Tool name: HarvestParaphraseCandidates"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Extractor class: " + extractorClass); sLogger.info(" - Extractor args: " + extractorArgs); sLogger.info(" - Min matches: " + minMatches); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestParaphraseCandidates"); // harvest all (context, pattern) triples conf.set("Mavuno.HarvestContextPatternPairs.CorpusPath", corpusPath); conf.set("Mavuno.HarvestContextPatternPairs.CorpusClass", corpusClass); conf.set("Mavuno.HarvestContextPatternPairs.ExtractorClass", extractorClass); conf.set("Mavuno.HarvestContextPatternPairs.ExtractorArgs", extractorArgs); conf.set("Mavuno.HarvestContextPatternPairs.MinMatches", minMatches); conf.set("Mavuno.HarvestContextPatternPairs.OutputPath", outputPath + "/triples"); new HarvestContextPatternPairs(conf).run(); FileInputFormat.addInputPath(job, new Path(outputPath + "/triples")); 
FileOutputFormat.setOutputPath(job, new Path(outputPath + "/patterns-all")); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(ContextPatternWritable.class); job.setSortComparatorClass(ContextPatternWritable.Comparator.class); job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class); job.setMapOutputValueClass(TextLongPairWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); // combine scores // conf.set("Mavuno.CombineScores.InputPath", outputPath + "/patterns-all"); // conf.set("Mavuno.CombineScores.OutputPath", outputPath + "/patterns"); // new CombineScores(conf).run(); // // only retain the top paraphrases conf.set("Mavuno.GetTopResults.InputPath", outputPath + "/patterns-all"); conf.set("Mavuno.GetTopResults.OutputPath", outputPath + "/top-k"); conf.set("Mavuno.GetTopResults.NumResults", numResults); conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false); new GetTopResults(conf).run(); MavunoUtils.removeDirectory(conf, outputPath + "/patterns-all"); return 0; }
From source file:edu.isi.mavuno.app.mine.HarvestSentences.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String patternPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.PatternPath", conf); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestSentences.OutputPath", conf); sLogger.info("Tool name: HarvestSentences"); sLogger.info(" - Pattern file: " + patternPath); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestSentences"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); job.setNumReduceTasks(0);//from w w w . jav a 2 s .c o m job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.app.nlp.HarvestParseGraph.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParseGraph.OutputPath", conf); sLogger.info("Tool name: HarvestParseGraph"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("HarvestParseGraph"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass)); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true);// w ww .ja v a 2 s. co m return 0; }
From source file:edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); // required parameters String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf); // optional parameters String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf); String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf); sLogger.info("Tool name: ProcessStanfordNLP"); sLogger.info(" - Input path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); if (suTime != null && Boolean.parseBoolean(suTime)) { sLogger.info("- SUTime enabled"); }//from w w w . j av a 2s . c o m boolean textOutputFormat = false; if (textOutput != null && Boolean.parseBoolean(textOutput)) { sLogger.info("- Text output format enabled"); textOutputFormat = true; } Job job = new Job(conf); job.setJobName("ProcessStanfordNLP"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); // output format -- either plain text or sequencefile (default) if (textOutputFormat) { job.setOutputFormatClass(TextOutputFormat.class); } else { job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); } job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(StanfordParsedDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(StanfordParsedDocument.class); job.setMapperClass(MyMapper.class); job.setJarByClass(ProcessStanfordNLP.class); // no reducers needed job.setNumReduceTasks(0); // run job job.waitForCompletion(true); // print job statistics Counters counters = job.getCounters(); sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue()); sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue()); sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue()); return 0; }