List of usage examples for org.apache.hadoop.mapreduce.Job.setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
/**
 * Runs a single-reducer MapReduce job over the corpus with {@link CachingCVB0PerplexityMapper}
 * and {@link DualDoubleSumReducer}, then returns whatever {@code readPerplexity} reads back
 * for the given iteration.
 *
 * @param conf configuration used to build the job, delete the output path and read the result
 * @param corpusPath job input, read as a sequence file
 * @param modelPath model location; its parent directory is used to derive the output path
 * @param iteration iteration number passed to {@code perplexityPath} and {@code readPerplexity}
 * @return the value returned by {@code readPerplexity}
 * @throws IOException propagated from the Hadoop file-system and job calls
 * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
 * @throws InterruptedException if the job reports failure, or if waiting is interrupted
 */
private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    final String description = "Calculating perplexity for " + modelPath;
    log.info("About to run: " + description);

    final Job perplexityJob = new Job(conf, description);
    perplexityJob.setJarByClass(CachingCVB0PerplexityMapper.class);

    // Map / combine / reduce wiring; everything funnels into one reducer.
    perplexityJob.setMapperClass(CachingCVB0PerplexityMapper.class);
    perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
    perplexityJob.setReducerClass(DualDoubleSumReducer.class);
    perplexityJob.setNumReduceTasks(1);

    // Key/value types and on-disk formats.
    perplexityJob.setOutputKeyClass(DoubleWritable.class);
    perplexityJob.setOutputValueClass(DoubleWritable.class);
    perplexityJob.setInputFormatClass(SequenceFileInputFormat.class);
    perplexityJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.addInputPath(perplexityJob, corpusPath);
    final Path modelParent = modelPath.getParent();
    final Path perplexityOutput = perplexityPath(modelParent, iteration);
    FileOutputFormat.setOutputPath(perplexityJob, perplexityOutput);
    setModelPaths(perplexityJob, modelPath);

    // Remove the output location before running the job.
    HadoopUtil.delete(conf, perplexityOutput);

    final boolean completed = perplexityJob.waitForCompletion(true);
    if (!completed) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelParent, iteration);
}
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeTopicModel(Configuration conf, Path modelInput, Path output) throws IOException, InterruptedException, ClassNotFoundException { String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output); log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setJarByClass(CVB0Driver.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileInputFormat.addInputPath(job, modelInput); FileOutputFormat.setOutputPath(job, output); job.submit();//from w w w . j a va 2s . c o m return job; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output) throws IOException, ClassNotFoundException, InterruptedException { String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output); log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setMapperClass(CVB0DocInferenceMapper.class); job.setNumReduceTasks(0); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileSystem fs = FileSystem.get(corpus.toUri(), conf); if (modelInput != null && fs.exists(modelInput)) { FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter()); URI[] modelUris = new URI[statuses.length]; for (int i = 0; i < statuses.length; i++) { modelUris[i] = statuses[i].getPath().toUri(); }//from w w w . j a v a 2 s .c o m DistributedCache.setCacheFiles(modelUris, conf); } setModelPaths(job, modelInput);//bug:mahout-1147 FileInputFormat.addInputPath(job, corpus); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(CVB0Driver.class); job.submit(); return job; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput, int iterationNumber, int maxIterations, int numReduceTasks) throws IOException, ClassNotFoundException, InterruptedException { String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations, modelInput);//from w w w . ja v a2s .c om log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setJarByClass(CVB0Driver.class); job.setMapperClass(CachingCVB0Mapper.class); job.setCombinerClass(VectorSumReducer.class); job.setReducerClass(VectorSumReducer.class); job.setNumReduceTasks(numReduceTasks); job.setOutputKeyClass(Text.class);//0.7IntWritable job.setOutputValueClass(VectorWritable.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileInputFormat.addInputPath(job, corpusInput); FileOutputFormat.setOutputPath(job, modelOutput); setModelPaths(job, modelInput); HadoopUtil.delete(conf, modelOutput); if (!job.waitForCompletion(true)) { throw new InterruptedException( String.format("Failed to complete iteration %d stage 1", iterationNumber)); } }
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Builds partial vectors from the input documents using one chunk of the dictionary.
 * The input documents must be stored in {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration; it is copied, so the caller's instance is not modified here
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the ids; registered as a distributed-cache file
 * @param output
 *          output directory where the partial vectors are created; deleted before the job runs
 * @param dimension
 *          value stored in the job configuration under {@code PartialVectorMerger.DIMENSION}
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 * @throws IllegalStateException if the job does not complete successfully
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess,
        boolean namedVectors, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Configuration jobConf = new Configuration(baseConf);

    // Needed so that configuration values can be serialized.
    final String serializations = "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization";
    jobConf.set("io.serializations", serializations);

    jobConf.setInt(PartialVectorMerger.DIMENSION, dimension);
    jobConf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    jobConf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    jobConf.setInt(MAX_NGRAMS, maxNGramSize);

    // The cache must be configured before the Job is created from this configuration.
    final URI[] cacheFiles = { dictionaryFilePath.toUri() };
    DistributedCache.setCacheFiles(cacheFiles, jobConf);

    final Job vectorJob = new Job(jobConf);
    vectorJob.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
            + ", dictionary-file: " + dictionaryFilePath);
    vectorJob.setJarByClass(DictionaryVectorizer.class);

    vectorJob.setMapOutputKeyClass(Text.class);
    vectorJob.setMapOutputValueClass(StringTuple.class);
    vectorJob.setOutputKeyClass(Text.class);
    vectorJob.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(vectorJob, input);
    FileOutputFormat.setOutputPath(vectorJob, output);

    // The base Mapper class is used as the mapper; the reducer builds the partial vectors.
    vectorJob.setMapperClass(Mapper.class);
    vectorJob.setInputFormatClass(SequenceFileInputFormat.class);
    vectorJob.setReducerClass(TFPartialVectorReducer.class);
    vectorJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    vectorJob.setNumReduceTasks(numReducers);

    HadoopUtil.delete(jobConf, output);

    if (!vectorJob.waitForCompletion(true)) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/** * Create a partial vector using a chunk of features from the input documents. The input documents has to be * in the {@link SequenceFile} format/*ww w . j av a 2 s .c om*/ * * @param input * input directory of the documents in {@link SequenceFile} format * @param baseConf * job configuration * @param maxNGramSize * maximum size of ngrams to generate * @param dictionaryFilePath * location of the chunk of features and the id's * @param output * output directory were the partial vectors have to be created * @param dimension * @param sequentialAccess * output vectors should be optimized for sequential access * @param namedVectors * output vectors should be named, retaining key (doc id) as a label * @param numReducers * the desired number of reducer tasks */ private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors, int numReducers) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(baseConf); // this conf parameter needs to be set enable serialisation of conf values conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); conf.setInt(PartialVectorMerger.DIMENSION, dimension); conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess); conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors); conf.setInt(MAX_NGRAMS, maxNGramSize); DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf); Job job = new Job(conf); job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath); job.setJarByClass(FixDictionaryVectorizer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(StringTuple.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); 
FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(Mapper.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setReducerClass(TFPartialVectorReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setNumReduceTasks(numReducers); HadoopUtil.delete(conf, output); boolean succeeded = job.waitForCompletion(true); if (!succeeded) throw new IllegalStateException("Job failed!"); }
From source file:com.elixir.hadoop.Chromo.FragmentCoverage.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: wordcount <in> [<in>...] <out>"); System.exit(2);/*from w w w . j av a2 s.c o m*/ } Job job = Job.getInstance(conf, "position"); job.setJarByClass(FragmentCoverage.class); job.setMapperClass(CoverageMapper.class); job.setCombinerClass(IntSumReducer.class); job.setNumReduceTasks(5); job.setMapOutputKeyClass(com.elixir.hadoop.Chromo.SecondrySort.IntPair.class); //job.setSpeculativeExecution(true); job.setPartitionerClass(ChromoPartitioner.class); job.setGroupingComparatorClass(com.elixir.hadoop.Chromo.SecondrySort.FirstGroupingComparator.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // job.setOutputFormatClass(Text.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.example.bigtable.sample.CellCounter.java
License:Apache License
/** * Sets up the actual job./*from w ww .j av a2 s. c om*/ * * @param conf The current configuration. * @param args The command line parameters. * @return The newly created job. * @throws IOException When setting up the job fails. */ public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException { String tableName = args[0]; Path outputDir = new Path(args[1]); String reportSeparatorString = (args.length > 2) ? args[2] : ":"; conf.set("ReportSeparator", reportSeparatorString); Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName)); job.setJarByClass(CellCounter.class); Scan scan = getConfiguredScanForJob(conf, args); TableMapReduceUtil.initTableMapperJob(tableName, scan, CellCounterMapper.class, ImmutableBytesWritable.class, Result.class, job); job.setNumReduceTasks(1); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, outputDir); job.setReducerClass(IntSumReducer.class); return job; }
From source file:com.examples.ch03.ParseWeblogs_Ex_1.java
/**
 * Runs the map-only "Weblog Transformer" job, reading {@code apache_clf.txt} and writing
 * to the {@code output} directory. The {@code args} parameter is not used.
 *
 * @param args command-line arguments (ignored)
 * @return 0 if the job completes successfully, 1 otherwise
 * @throws Exception propagated from job setup or execution
 */
public int run(String[] args) throws Exception {
    final Configuration configuration = getConf();
    final Path source = new Path("apache_clf.txt");
    final Path destination = new Path("output");

    final Job job = Job.getInstance(configuration);
    job.setJobName("Weblog Transformer");
    job.setJarByClass(getClass());

    // Map-only: no reduce phase.
    job.setMapperClass(CLFMapper_Ex_1.class);
    job.setNumReduceTasks(0);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, source);
    FileOutputFormat.setOutputPath(job, destination);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.facebook.hiveio.mapreduce.output.WritingTool.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); handleCommandLine(args, conf);//from w ww. j a v a 2 s . c om HadoopUtils.setMapAttempts(conf, 1); adjustConfigurationForHive(conf); HiveTools.setupJob(conf); Job job = new Job(conf, "hive-io-writing"); if (job.getJar() == null) { job.setJarByClass(getClass()); } job.setMapperClass(SampleMapper.class); job.setInputFormatClass(SampleInputFormat.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(HiveWritableRecord.class); job.setOutputFormatClass(SampleOutputFormat.class); job.setNumReduceTasks(0); job.submit(); return job.waitForCompletion(true) ? 0 : 1; }