List of usage examples for org.apache.hadoop.mapred JobConf setCombinerClass
public void setCombinerClass(Class<? extends Reducer> theClass)
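For orientation before the examples, here is a minimal word-count sketch wired up with setCombinerClass (WordCountMapper and WordCountReducer are hypothetical placeholder classes, not part of any example below). The combiner must be a Reducer whose output key/value types match the map output types, because Hadoop may apply it zero or more times to map output before the shuffle; the operation it performs should therefore be commutative and associative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCountJob {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCountJob.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(WordCountMapper.class);    // hypothetical Mapper
        // The combiner pre-aggregates map output locally; its output replaces
        // map output, so its key/value types must match the map output types.
        conf.setCombinerClass(WordCountReducer.class); // hypothetical Reducer
        conf.setReducerClass(WordCountReducer.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

As in most of the examples below, the same Reducer class serves as both combiner and reducer, which is valid whenever the reduce operation's input and output types coincide.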
From source file:org.apache.avro.mapred.TestWordCountSpecific.java
License:Apache License
@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputSpecific(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputSpecific(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.examples.GridHadoopWordCount1.java
License:Apache License
/**
 * Sets task classes, along with related info where needed, into the configuration object.
 *
 * @param jobConf Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 */
public static void setTasksClasses(JobConf jobConf, boolean setMapper, boolean setCombiner,
    boolean setReducer) {
    if (setMapper) {
        jobConf.setMapperClass(GridHadoopWordCount1Map.class);
        jobConf.setInputFormat(TextInputFormat.class);
    }

    if (setCombiner)
        jobConf.setCombinerClass(GridHadoopWordCount1Reduce.class);

    if (setReducer) {
        jobConf.setReducerClass(GridHadoopWordCount1Reduce.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.examples.HadoopWordCount1.java
License:Apache License
/**
 * Sets task classes, along with related info where needed, into the configuration object.
 *
 * @param jobConf Configuration to change.
 * @param setMapper Option to set mapper and input format classes.
 * @param setCombiner Option to set combiner class.
 * @param setReducer Option to set reducer and output format classes.
 */
public static void setTasksClasses(JobConf jobConf, boolean setMapper, boolean setCombiner,
    boolean setReducer) {
    if (setMapper) {
        jobConf.setMapperClass(HadoopWordCount1Map.class);
        jobConf.setInputFormat(TextInputFormat.class);
    }

    if (setCombiner)
        jobConf.setCombinerClass(HadoopWordCount1Reduce.class);

    if (setReducer) {
        jobConf.setReducerClass(HadoopWordCount1Reduce.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
    }
}
From source file:org.apache.mahout.avro.text.mapred.AvroDocumentsWordCount.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf();
    if (args.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 2; // non-zero exit code for a usage error
    }

    conf.setStrings("io.serializations",
        new String[] { WritableSerialization.class.getName(), AvroSpecificSerialization.class.getName(),
            AvroReflectSerialization.class.getName(), AvroGenericSerialization.class.getName() });

    conf.setJarByClass(AvroDocumentsWordCount.class);
    conf.setMapperClass(TokenizerMapper.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setInputFormat(AvroInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    FileSystem fs = FileSystem.get(conf);
    fs.delete(output, true);

    AvroInputFormat.setAvroInputClass(conf, AvroDocument.class);
    FileInputFormat.addInputPath(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();

    return job.isSuccessful() ? 0 : 1; // 0 signals success, per the Tool convention
}
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like this can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path sigmaKFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaKSigmaJFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigmaKSigmaJFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retVocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retVocabCount);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 * @param gramSize the n-gram size
 */
public static void runJob(String input, String output, int gramSize) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureReducer.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like this can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    DefaultStringifier<Integer> intStringifier = new DefaultStringifier<Integer>(conf, Integer.class);
    String gramSizeString = intStringifier.toString(gramSize);

    log.info("{}", intStringifier.fromString(gramSizeString));
    conf.set("bayes.gramSize", gramSizeString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesTfIdfDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-featureCount"));
    Path outPath = new Path(output + "/trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like this can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path interimFile = new Path(output + "/trainer-docCount/part-*");
    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile, conf);

    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelDocumentCounts));
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);

    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.common.BayesWeightSummerDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-weights");
    FileOutputFormat.setOutputPath(conf, outPath);
    //conf.setNumReduceTasks(1);
    conf.setNumMapTasks(100);
    conf.setMapperClass(BayesWeightSummerMapper.class);
    // See the javadoc for the file input format spec: the first token is the key,
    // the rest is the value, with the whole document on one line.
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesWeightSummerReducer.class);
    conf.setReducerClass(BayesWeightSummerReducer.class);
    conf.setOutputFormat(BayesWeightSummerOutputFormat.class);

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param params The Job parameters containing the gramSize, input/output folders, defaultCat, encoding
 */
public static void runJob(Parameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesClassifierDriver.class);
    conf.setJobName("Bayes Classifier Driver running over input: " + params.get("testDirPath"));

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(params.get("testDirPath")));
    Path outPath = new Path(params.get("testDirPath") + "-output");
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setMapperClass(BayesClassifierMapper.class);
    conf.setCombinerClass(BayesClassifierReducer.class);
    conf.setReducerClass(BayesClassifierReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    HadoopUtil.overwriteOutput(outPath);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);

    Path outputFiles = new Path(outPath, "part*");
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    ConfusionMatrix matrix = readResult(dfs, outputFiles, conf, params);
    log.info("{}", matrix.summarize());
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like this can make or break a piece of code.

    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retVocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retVocabCount);

    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}