List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
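Job.setCombinerClass registers a combiner that pre-aggregates map output on the map side before the shuffle. Because the combiner both consumes and produces map-output records, its input and output key/value types must match the job's map output types. A minimal, self-contained sketch using Hadoop's stock TokenCounterMapper and IntSumReducer (the class name WordCountJob is illustrative, not from any source below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountJob {
    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "word count with combiner");
        job.setJarByClass(WordCountJob.class);
        job.setMapperClass(TokenCounterMapper.class); // emits <Text, IntWritable>
        // The combiner maps <Text, IntWritable> to <Text, IntWritable>, matching the
        // map output types, so IntSumReducer can serve as both combiner and reducer.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}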
From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java
License:Apache License
/**
 * Gets a job driver. (For Hadoop 0.19.)
 *
 * @throws java.io.IOException
 * @deprecated to be removed soon
 */
public static Job getJobDriver(Class<?> jobClass, String jobName, int numReducers,
    String inputPath, String outputPath,
    Class<? extends InputFormat> inputFormat, Class<? extends OutputFormat> outputFormat,
    Class<? extends Mapper> mapper, Class<? extends Reducer> combiner,
    Class<? extends Reducer> reducer,
    Class<? extends WritableComparable> outputkey, Class<? extends Writable> outputValue)
    throws IOException {
    Job job = new Job(new Configuration());

    // job name and jar
    job.setJarByClass(jobClass);
    job.setJobName(jobName);

    // number of reducers
    if (numReducers > -1) {
        job.setNumReduceTasks(numReducers);
    }

    // file paths
    if (null != inputPath) {
        FileInputFormat.addInputPaths(job, inputPath);
    }
    if (null != outputPath) {
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    }

    // file formats
    if (null != inputFormat) {
        job.setInputFormatClass(inputFormat);
    }
    if (null != outputFormat) {
        job.setOutputFormatClass(outputFormat);
    }

    // mapper, combiner, reducer
    if (null != mapper) {
        job.setMapperClass(mapper);
    }
    if (null != combiner) {
        job.setCombinerClass(combiner);
    }
    if (null != reducer) {
        job.setReducerClass(reducer);
    }

    // map output key and value classes
    if (null != outputkey) {
        job.setMapOutputKeyClass(outputkey);
    }
    if (null != outputValue) {
        job.setMapOutputValueClass(outputValue);
    }

    return job;
}
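A hypothetical call site for this helper (MyDriver, MyMapper, and MySumReducer are placeholder class names for illustration, not taken from the Mahout source):

// Hypothetical usage sketch; MyDriver, MyMapper, and MySumReducer are placeholders.
Job job = MapReduceUtil.getJobDriver(MyDriver.class, "svm-feature-count", 4,
    "/data/in", "/data/out",
    TextInputFormat.class, SequenceFileOutputFormat.class,
    MyMapper.class, MySumReducer.class, MySumReducer.class, // same class as combiner and reducer
    Text.class, LongWritable.class);
job.waitForCompletion(true);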
From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java
License:Apache License
/**
 * Sets the static parameters (input, output, map, combine, reduce) related to a job.
 *
 * @throws java.io.IOException
 */
public static void setJobStaticParameters(Job job, Class<?> jobClass, String jobName,
    Class<? extends InputFormat> inputFormat, Class<? extends OutputFormat> outputFormat,
    Class<? extends Mapper> mapper, Class<? extends Reducer> combiner,
    Class<? extends Reducer> reducer,
    Class<? extends WritableComparable> mapOutputKey, Class<? extends Writable> mapOutputValue)
    throws IOException {
    // job (name and class)
    job.setJobName(jobName);
    job.setJarByClass(jobClass);

    // formats (input and output)
    if (null != inputFormat) {
        job.setInputFormatClass(inputFormat);
    }
    if (null != outputFormat) {
        job.setOutputFormatClass(outputFormat);
    }

    // mapper and map output types
    if (null != mapper) {
        job.setMapperClass(mapper);
    }
    if (null != mapOutputKey) {
        job.setMapOutputKeyClass(mapOutputKey);
    }
    if (null != mapOutputValue) {
        job.setMapOutputValueClass(mapOutputValue);
    }

    // combiner
    if (null != combiner) {
        job.setCombinerClass(combiner);
    }

    // reducer
    if (null != reducer) {
        job.setReducerClass(reducer);
    }

    // job.setOutputKeyClass(jobClass);
    // partitioner
}
From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java
License:Apache License
private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "Calculating perplexity for " + modelPath;
    log.info("About to run: {}", jobName);
    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
    // prepareJob (inherited from Mahout's AbstractJob) wires up the input/output
    // paths, the mapper and reducer classes, and their output key/value types.
    Job job = prepareJob(corpusPath, outputPath, CachingCVB0PerplexityMapper.class,
        DoubleWritable.class, DoubleWritable.class, DualDoubleSumReducer.class,
        DoubleWritable.class, DoubleWritable.class);
    job.setJobName(jobName);
    job.setCombinerClass(DualDoubleSumReducer.class);
    job.setNumReduceTasks(1);
    setModelPaths(job, modelPath);
    HadoopUtil.delete(conf, outputPath);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelPath.getParent(), iteration);
}
From source file:org.apache.mahout.clustering.lda.cvb.CVB0Driver.java
License:Apache License
public void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
    int iterationNumber, int maxIterations, int numReduceTasks)
    throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Iteration %d of %d, input path: %s",
        iterationNumber, maxIterations, modelInput);
    log.info("About to run: {}", jobName);
    Job job = prepareJob(corpusInput, modelOutput, CachingCVB0Mapper.class,
        IntWritable.class, VectorWritable.class, VectorSumReducer.class,
        IntWritable.class, VectorWritable.class);
    job.setCombinerClass(VectorSumReducer.class);
    job.setNumReduceTasks(numReduceTasks);
    job.setJobName(jobName);
    setModelPaths(job, modelInput);
    HadoopUtil.delete(conf, modelOutput);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException(
            String.format("Failed to complete iteration %d stage 1", iterationNumber));
    }
}
From source file:org.apache.mahout.clustering.lda.LDADriver.java
License:Apache License
/**
 * Run the job using supplied arguments.
 *
 * @param input the directory pathname for input points
 * @param stateIn the directory pathname for input state
 * @param stateOut the directory pathname for output state
 * @param numTopics the number of clusters
 * @param numReducers the number of reducers desired
 */
private double runIteration(Path input, Path stateIn, Path stateOut, int numTopics, int numWords,
    double topicSmoothing, int numReducers)
    throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set(STATE_IN_KEY, stateIn.toString());
    conf.set(NUM_TOPICS_KEY, Integer.toString(numTopics));
    conf.set(NUM_WORDS_KEY, Integer.toString(numWords));
    conf.set(TOPIC_SMOOTHING_KEY, Double.toString(topicSmoothing));

    Job job = new Job(conf);
    job.setOutputKeyClass(IntPairWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPaths(job, input.toString());
    FileOutputFormat.setOutputPath(job, stateOut);
    job.setMapperClass(LDAMapper.class);
    job.setReducerClass(LDAReducer.class);
    job.setCombinerClass(LDAReducer.class);
    job.setNumReduceTasks(numReducers);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setJarByClass(LDADriver.class);

    job.waitForCompletion(true);
    return findLL(stateOut, conf);
}
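Several of these examples pass one Reducer class to both setCombinerClass and setReducerClass (LDAReducer here, VectorSumReducer and DualDoubleSumReducer earlier). That reuse is only correct when the reduce operation is commutative and associative and its input key/value types equal its output types, since combiner output travels through the shuffle back into the reducer. A minimal sketch of such a dual-use reducer (SumCombinableReducer is an illustrative name, not a Mahout class):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Safe to register as both combiner and reducer: summing is commutative and
// associative, and the input types <Text, LongWritable> equal the output types.
public class SumCombinableReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    private final LongWritable result = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        result.set(sum);
        // Partial sums emitted by the combiner are simply re-summed in the reducer.
        context.write(key, result);
    }
}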
From source file:org.apache.mahout.clustering.spectral.eigencuts.EigencutsAffinityCutsJob.java
License:Apache License
/**
 * Runs a single iteration of defining cluster boundaries, based on
 * previous calculations and the formation of the "cut matrix".
 *
 * @param currentAffinity Path to the current affinity matrix.
 * @param cutMatrix Path to the sensitivity matrix.
 * @param nextAffinity Output path for the new affinity matrix.
 */
public static long runjob(Path currentAffinity, Path cutMatrix, Path nextAffinity, Configuration conf)
    throws IOException, ClassNotFoundException, InterruptedException {
    // These options allow us to differentiate between the two vectors
    // in the mapper and reducer - we'll know from the working path
    // which SequenceFile we're accessing.
    conf.set(EigencutsKeys.AFFINITY_PATH, currentAffinity.getName());
    conf.set(EigencutsKeys.CUTMATRIX_PATH, cutMatrix.getName());

    Job job = new Job(conf, "EigencutsAffinityCutsJob");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VertexWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setMapperClass(EigencutsAffinityCutsMapper.class);
    job.setCombinerClass(EigencutsAffinityCutsCombiner.class);
    job.setReducerClass(EigencutsAffinityCutsReducer.class);

    //FileInputFormat.addInputPath(job, currentAffinity);
    FileInputFormat.addInputPath(job, cutMatrix);
    FileOutputFormat.setOutputPath(job, nextAffinity);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
    return job.getCounters().findCounter(CUTSCOUNTER.NUM_CUTS).getValue();
}
From source file:org.apache.mahout.feature.mrmr.MRMRDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.targetColumnOption().create());
    addOption(DefaultOptionCreator.rowNumberOption().create());
    addOption(DefaultOptionCreator.columnNumberOption().create());
    addOption(DefaultOptionCreator.featureNumberOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);

    Path input = getInputPath();
    Path output = getOutputPath();
    Path temp = getTempPath();

    int targetColumn = Integer.parseInt(getOption(DefaultOptionCreator.TARGET_COLUMN));
    int rowNumber = Integer.parseInt(getOption(DefaultOptionCreator.ROW_NUMBER));
    int columnNumber = Integer.parseInt(getOption(DefaultOptionCreator.COLUMN_NUMBER));
    int featureNumber = Integer.parseInt(getOption(DefaultOptionCreator.FEATURE_NUMBER));

    log.info("Feature selection algorithm: MRMR");

    Path tempMax = null;
    for (int i = 0; i < featureNumber; i++) {
        // Candidate-generation job for this stage.
        log.info("Generating candidates at stage " + i + "th");
        Path tempFeature = new Path(featureJobTempUri(temp, i));

        Configuration confFeature = getConf();
        confFeature.set(DefaultOptionCreator.TARGET_INDEX, "" + (targetColumn - 1));
        confFeature.set(DefaultOptionCreator.COLUMN_NUMBER, "" + columnNumber);
        confFeature.set(DefaultOptionCreator.ROW_NUMBER, "" + rowNumber);

        Job jobFeature = HadoopUtil.prepareJob(input, tempFeature, TextInputFormat.class,
            MRMRMapper.class, IntWritable.class, Text.class,
            MRMRReducer.class, LongWritable.class, Text.class,
            SequenceFileOutputFormat.class, confFeature);
        jobFeature.setJobName("Feature Candidate stage " + i + "th");

        if (i > 0) {
            DistributedCache.addCacheFile(new Path(cacheFileUri(temp, i)).toUri(),
                jobFeature.getConfiguration());
        }

        boolean succeededFeature = jobFeature.waitForCompletion(true);
        if (!succeededFeature) {
            return -1;
        }

        // Job to select the best candidate of this stage.
        log.info("Selecting the best candidate at stage " + i + "th");
        Configuration confMax = getConf();
        confMax.set(DefaultOptionCreator.COLUMN_NUMBER, "" + columnNumber);

        tempMax = new Path(maxJobTempUri(temp, i));
        if (i == featureNumber - 1) {
            tempMax = outputPath;
        }

        Job jobMax = HadoopUtil.prepareJob(tempFeature, tempMax, SequenceFileInputFormat.class,
            Mapper.class, LongWritable.class, Text.class,
            MaxReducer.class, LongWritable.class, Text.class,
            TextOutputFormat.class, confMax);
        jobMax.setJobName("Best candidate at stage " + i + "th");
        jobMax.setCombinerClass(MaxCombiner.class);

        if (i > 0) {
            DistributedCache.addCacheFile(new Path(cacheFileUri(temp, i)).toUri(),
                jobMax.getConfiguration());
        }

        boolean succeededMax = jobMax.waitForCompletion(true);
        if (!succeededMax) {
            return -1;
        }

        try {
            FileSystem hdfs = FileSystem.get(confMax);
            hdfs.delete(tempFeature, true);
        } catch (IOException e) {
            // Best-effort cleanup of the intermediate output; ignore failures.
        }
    }
    return 0;
}
From source file:org.apache.mahout.fpm.pfpgrowth.dataset.KeyBasedStringTupleGrouper.java
License:Apache License
public static void startJob(Parameters params)
    throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("job.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");

    String input = params.get("input");
    Job job = new Job(conf, "Generating dataset from input " + input);
    job.setJarByClass(KeyBasedStringTupleGrouper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"));
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(KeyBasedStringTupleMapper.class);
    job.setCombinerClass(KeyBasedStringTupleCombiner.class);
    job.setReducerClass(KeyBasedStringTupleReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
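The mapred.* compression keys used above (and in the next two examples) are the Hadoop 1.x names; on Hadoop 2.x and later they are deprecated aliases that still work through Hadoop's deprecation mapping. A sketch of the modern equivalents, assuming the same intent:

// Hadoop 2.x+ equivalents of the deprecated mapred.* keys above:
conf.set("mapreduce.map.output.compress", "true");
conf.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");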
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Run the aggregation job: aggregate the per-group TopK patterns, group each
 * pattern by the features it contains, and thus compute the final top-k
 * frequent patterns for each feature.
 *
 * @param params
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startAggregating(Parameters params)
    throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    params.set("fList", "");
    params.set("gList", "");
    conf.set("pfp.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get("output") + "/fpgrowth";
    Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"), "frequentPatterns");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(AggregatorMapper.class);
    job.setCombinerClass(AggregatorReducer.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.overwriteOutput(outPath);
    job.waitForCompletion(true);
}
From source file:org.apache.mahout.fpm.pfpgrowth.PFPGrowth.java
License:Apache License
/**
 * Count the frequencies of various features in parallel using Map/Reduce.
 *
 * @param params
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startParallelCounting(Parameters params)
    throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("pfp.parameters", params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get("input");
    Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get("output"), "parallelcounting");
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.overwriteOutput(outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.waitForCompletion(true);
}