List of usage examples for org.apache.hadoop.mapreduce Job setMapperClass
public void setMapperClass(Class<? extends Mapper> cls) throws IllegalStateException
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeTopicModel(Configuration conf, Path modelInput, Path output) throws IOException, InterruptedException, ClassNotFoundException { String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output); log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setJarByClass(CVB0Driver.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class); job.setNumReduceTasks(0);//w w w. j a va2 s. c o m job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileInputFormat.addInputPath(job, modelInput); FileOutputFormat.setOutputPath(job, output); job.submit(); return job; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output) throws IOException, ClassNotFoundException, InterruptedException { String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output); log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setMapperClass(CVB0DocInferenceMapper.class); job.setNumReduceTasks(0);/* www . j a v a2 s . co m*/ job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileSystem fs = FileSystem.get(corpus.toUri(), conf); if (modelInput != null && fs.exists(modelInput)) { FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter()); URI[] modelUris = new URI[statuses.length]; for (int i = 0; i < statuses.length; i++) { modelUris[i] = statuses[i].getPath().toUri(); } DistributedCache.setCacheFiles(modelUris, conf); } setModelPaths(job, modelInput);//bug:mahout-1147 FileInputFormat.addInputPath(job, corpus); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(CVB0Driver.class); job.submit(); return job; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput, int iterationNumber, int maxIterations, int numReduceTasks) throws IOException, ClassNotFoundException, InterruptedException { String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations, modelInput);/*from ww w. j av a 2 s .c o m*/ log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setJarByClass(CVB0Driver.class); job.setMapperClass(CachingCVB0Mapper.class); job.setCombinerClass(VectorSumReducer.class); job.setReducerClass(VectorSumReducer.class); job.setNumReduceTasks(numReduceTasks); job.setOutputKeyClass(Text.class);//0.7IntWritable job.setOutputValueClass(VectorWritable.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileInputFormat.addInputPath(job, corpusInput); FileOutputFormat.setOutputPath(job, modelOutput); setModelPaths(job, modelInput); HadoopUtil.delete(conf, modelOutput); if (!job.waitForCompletion(true)) { throw new InterruptedException( String.format("Failed to complete iteration %d stage 1", iterationNumber)); } }
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/** * Create a partial vector using a chunk of features from the input documents. The input documents has to be * in the {@link SequenceFile} format/*from w w w . j ava 2 s.c o m*/ * * @param input * input directory of the documents in {@link SequenceFile} format * @param baseConf * job configuration * @param maxNGramSize * maximum size of ngrams to generate * @param dictionaryFilePath * location of the chunk of features and the id's * @param output * output directory were the partial vectors have to be created * @param dimension * @param sequentialAccess * output vectors should be optimized for sequential access * @param namedVectors * output vectors should be named, retaining key (doc id) as a label * @param numReducers * the desired number of reducer tasks */ private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors, int numReducers) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(baseConf); // this conf parameter needs to be set enable serialisation of conf values conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); conf.setInt(PartialVectorMerger.DIMENSION, dimension); conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess); conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors); conf.setInt(MAX_NGRAMS, maxNGramSize); DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf); Job job = new Job(conf); job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath); job.setJarByClass(DictionaryVectorizer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(StringTuple.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(Mapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setReducerClass(TFPartialVectorReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setNumReduceTasks(numReducers); HadoopUtil.delete(conf, output); boolean succeeded = job.waitForCompletion(true); if (!succeeded) throw new IllegalStateException("Job failed!"); }
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/** * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in * {@link SequenceFile} format//w ww . j a va 2s. com */ private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(baseConf); // this conf parameter needs to be set enable serialisation of conf values conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); conf.setInt(MIN_SUPPORT, minSupport); Job job = new Job(conf); job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input); job.setJarByClass(DictionaryVectorizer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(TermCountMapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setCombinerClass(TermCountCombiner.class); job.setReducerClass(TermCountReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); HadoopUtil.delete(conf, output); boolean succeeded = job.waitForCompletion(true); if (!succeeded) throw new IllegalStateException("Job failed!"); }
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/** * Create a partial vector using a chunk of features from the input documents. The input documents has to be * in the {@link SequenceFile} format/*w ww. j av a 2 s. co m*/ * * @param input * input directory of the documents in {@link SequenceFile} format * @param baseConf * job configuration * @param maxNGramSize * maximum size of ngrams to generate * @param dictionaryFilePath * location of the chunk of features and the id's * @param output * output directory were the partial vectors have to be created * @param dimension * @param sequentialAccess * output vectors should be optimized for sequential access * @param namedVectors * output vectors should be named, retaining key (doc id) as a label * @param numReducers * the desired number of reducer tasks */ private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors, int numReducers) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(baseConf); // this conf parameter needs to be set enable serialisation of conf values conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); conf.setInt(PartialVectorMerger.DIMENSION, dimension); conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess); conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors); conf.setInt(MAX_NGRAMS, maxNGramSize); DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf); Job job = new Job(conf); job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath); job.setJarByClass(FixDictionaryVectorizer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(StringTuple.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(Mapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setReducerClass(TFPartialVectorReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setNumReduceTasks(numReducers); HadoopUtil.delete(conf, output); boolean succeeded = job.waitForCompletion(true); if (!succeeded) throw new IllegalStateException("Job failed!"); }
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/** * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in * {@link SequenceFile} format//from www .jav a 2 s .c o m */ private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(baseConf); // this conf parameter needs to be set enable serialisation of conf values conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); conf.setInt(MIN_SUPPORT, minSupport); Job job = new Job(conf); job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input); job.setJarByClass(FixDictionaryVectorizer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(TermCountMapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setCombinerClass(TermCountCombiner.class); job.setReducerClass(TermCountReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); HadoopUtil.delete(conf, output); boolean succeeded = job.waitForCompletion(true); if (!succeeded) throw new IllegalStateException("Job failed!"); }
From source file:com.elixir.hadoop.Chromo.FragmentCoverage.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: wordcount <in> [<in>...] <out>"); System.exit(2);/*from w ww . ja v a 2s.c o m*/ } Job job = Job.getInstance(conf, "position"); job.setJarByClass(FragmentCoverage.class); job.setMapperClass(CoverageMapper.class); job.setCombinerClass(IntSumReducer.class); job.setNumReduceTasks(5); job.setMapOutputKeyClass(com.elixir.hadoop.Chromo.SecondrySort.IntPair.class); //job.setSpeculativeExecution(true); job.setPartitionerClass(ChromoPartitioner.class); job.setGroupingComparatorClass(com.elixir.hadoop.Chromo.SecondrySort.FirstGroupingComparator.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // job.setOutputFormatClass(Text.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.elixir.hadoop.FragmentCoverage.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: wordcount <in> [<in>...] <out>"); System.exit(2);//from w w w. j av a2 s. co m } Job job = Job.getInstance(conf, "position"); job.setJarByClass(FragmentCoverage.class); job.setMapperClass(CoverageMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.elixir.hadoop.OddEven.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: wordcount <in> [<in>...] <out>"); System.exit(2);/*from w w w . j a v a2 s. c o m*/ } Job job = Job.getInstance(conf, "oddeven"); job.setJarByClass(OddEven.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }