List of usage examples for org.apache.hadoop.mapreduce Job setJarByClass
public void setJarByClass(Class<?> cls)
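setJarByClass locates the JAR that contains the given class and registers it as the job's JAR, so cluster nodes can load the job's mapper, reducer, and supporting classes. A minimal driver sketch of the pattern that recurs in every example below (the class name and the TokenizerMapper/IntSumReducer names are illustrative, in the style of Hadoop's classic WordCount, and are not taken from the sources on this page):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCount {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = Job.getInstance(conf, "word count");
            // Ship the JAR containing this class to the cluster nodes.
            job.setJarByClass(WordCount.class);
            job.setMapperClass(TokenizerMapper.class);   // hypothetical mapper
            job.setReducerClass(IntSumReducer.class);    // hypothetical reducer
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }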
From source file: com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License: Apache License
/**
 * Converts the input documents into a {@link StringTuple} token array.
 * The input documents have to be in the
 * {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in
 *          {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the {@link StringTuple} token array of
 *          each document has to be created
 * @param analyzerClass
 *          the Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LuceneTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(LuceneTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0); // map-only job
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file: com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License: Apache License
public static void dumpLabels(Path input, Path output, Configuration baseConf)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LabelDumper: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothLabelMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0); // map-only job
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file: com.digitalpebble.behemoth.mahout.DocumentProcessor.java
License: Apache License
/**
 * Converts the input documents into a {@link StringTuple} token array.
 * The input documents have to be in the
 * {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in
 *          {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the {@link StringTuple} token array of
 *          each document has to be created
 * @param type
 *          the annotation type representing the tokens
 * @param feature
 *          the name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0); // map-only job
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    // note: unlike the Behemoth variants above, the job result is not checked here
    job.waitForCompletion(true);
}
From source file: com.dlyle.testh20.H2OTest.java
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "H2O Test");
    job.setJarByClass(H2OTest.class);
    job.setMapperClass(H2OMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}
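The run(String[]) override and the getConf() call suggest this driver implements Hadoop's Tool interface, so it would normally be launched through ToolRunner, which parses generic options (-D, -files, and so on) before delegating to run(). A minimal entry point under that assumption (this main method is a sketch, not part of the source above):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;

    public static void main(String[] args) throws Exception {
        // ToolRunner strips the generic options and hands the rest to run().
        int exitCode = ToolRunner.run(new Configuration(), new H2OTest(), args);
        System.exit(exitCode);
    }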
From source file: com.elephantscale.hbase.book.chapter1.SimpleMR.java
License: Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: SimpleMR <in> <out>");
        return;
    }
    Job job = new Job(conf, "SimpleMR");
    job.setJarByClass(SimpleMR.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}
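Note the design choice of reusing IntSumReducer as both combiner and reducer: because integer summation is commutative and associative, pre-aggregating map output before the shuffle changes only the volume of intermediate data, not the final counts.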
From source file: com.elex.dmp.lda.CVB0Driver.java
License: Apache License
private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "Calculating perplexity for " + modelPath;
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CachingCVB0PerplexityMapper.class);
    job.setMapperClass(CachingCVB0PerplexityMapper.class);
    job.setCombinerClass(DualDoubleSumReducer.class);
    job.setReducerClass(DualDoubleSumReducer.class);
    job.setNumReduceTasks(1); // single reducer so the perplexity lands in one output file
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, corpusPath);
    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
    FileOutputFormat.setOutputPath(job, outputPath);
    setModelPaths(job, modelPath);
    HadoopUtil.delete(conf, outputPath);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelPath.getParent(), iteration);
}
From source file: com.elex.dmp.lda.CVB0Driver.java
License: Apache License
private static Job writeTopicModel(Configuration conf, Path modelInput, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CVB0Driver.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class);
    job.setNumReduceTasks(0); // map-only job
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, modelInput);
    FileOutputFormat.setOutputPath(job, output);
    job.submit(); // asynchronous: the caller monitors the returned Job handle
    return job;
}
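Unlike the earlier examples, this method calls job.submit() instead of waitForCompletion(true): the job is launched asynchronously and the Job handle is returned to the caller. A minimal sketch of how such a handle might be tracked (the polling loop below is illustrative, not taken from CVB0Driver):

    Job job = writeTopicModel(conf, modelInput, output);
    // Poll until the asynchronous job finishes, then check its outcome.
    while (!job.isComplete()) {
        Thread.sleep(1000L);
    }
    if (!job.isSuccessful()) {
        throw new IllegalStateException("Topic/term distribution job failed");
    }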
From source file: com.elex.dmp.lda.CVB0Driver.java
License: Apache License
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setMapperClass(CVB0DocInferenceMapper.class);
    job.setNumReduceTasks(0); // map-only job
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
    if (modelInput != null && fs.exists(modelInput)) {
        FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
        URI[] modelUris = new URI[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelUris[i] = statuses[i].getPath().toUri();
        }
        // Setting the cache files on conf has no effect on the job, which was
        // created from a copy of conf above; setModelPaths() below sets the
        // paths on the job's own configuration.
        DistributedCache.setCacheFiles(modelUris, conf);
    }
    setModelPaths(job, modelInput); // bug: MAHOUT-1147
    FileInputFormat.addInputPath(job, corpus);
    FileOutputFormat.setOutputPath(job, output);
    job.setJarByClass(CVB0Driver.class);
    job.submit();
    return job;
}
From source file: com.elex.dmp.lda.CVB0Driver.java
License: Apache License
public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
        int iterationNumber, int maxIterations, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations,
            modelInput);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CVB0Driver.class);
    job.setMapperClass(CachingCVB0Mapper.class);
    job.setCombinerClass(VectorSumReducer.class);
    job.setReducerClass(VectorSumReducer.class);
    job.setNumReduceTasks(numReduceTasks);
    job.setOutputKeyClass(Text.class); // was IntWritable in Mahout 0.7
    job.setOutputValueClass(VectorWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, corpusInput);
    FileOutputFormat.setOutputPath(job, modelOutput);
    setModelPaths(job, modelInput);
    HadoopUtil.delete(conf, modelOutput);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException(
                String.format("Failed to complete iteration %d stage 1", iterationNumber));
    }
}
From source file: com.elex.dmp.vectorizer.DictionaryVectorizer.java
License: Apache License
/**
 * Creates a partial vector using a chunk of features from the input documents.
 * The input documents have to be in the {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and their ids
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 *          cardinality of the output vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class); // identity mapper; the reducer does the work
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
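This job ships the dictionary chunk to every task through the DistributedCache, runs the identity Mapper, and leaves the vectorizing work to TFPartialVectorReducer. For illustration, a sketch of how a task could load such a cached chunk back, assuming (as in Mahout's dictionary chunks) a SequenceFile of Text terms mapped to IntWritable ids; the helper name is hypothetical and the real logic lives in TFPartialVectorReducer:

    import java.io.IOException;
    import java.net.URI;
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.filecache.DistributedCache;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    // Illustrative helper: load a cached dictionary chunk (term -> id),
    // roughly as a reducer's setup() might do.
    public static Map<String, Integer> loadDictionaryChunk(Configuration conf) throws IOException {
        URI[] cached = DistributedCache.getCacheFiles(conf);
        if (cached == null || cached.length == 0) {
            throw new IOException("No dictionary chunk found in the DistributedCache");
        }
        Path chunk = new Path(cached[0].getPath());
        Map<String, Integer> dictionary = new HashMap<String, Integer>();
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(conf), chunk, conf);
        try {
            Text term = new Text();
            IntWritable id = new IntWritable();
            while (reader.next(term, id)) {
                dictionary.put(term.toString(), id.get());
            }
        } finally {
            reader.close();
        }
        return dictionary;
    }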