List of usage examples for org.apache.hadoop.mapred.JobConf.setCombinerClass
public void setCombinerClass(Class<? extends Reducer> theClass)
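Before the per-project examples, here is a minimal sketch of the call in context with the old org.apache.hadoop.mapred API. WordCountMapper and WordCountReducer are hypothetical placeholder classes, not part of any source below; reusing the reducer as the combiner, as most of the examples below do, is only safe when the reduce operation is commutative and associative (e.g. summing counts).

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCountJob {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCountJob.class);
        conf.setJobName("wordcount");

        // key/value types emitted by the map (and consumed by combiner and reducer)
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(WordCountMapper.class);    // hypothetical Mapper emitting <Text, IntWritable>
        conf.setCombinerClass(WordCountReducer.class); // run the reducer locally on map output before the shuffle
        conf.setReducerClass(WordCountReducer.class);  // hypothetical Reducer<Text, IntWritable, Text, IntWritable>

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}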
From source file:org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
    conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like io.serializations can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesFeatureDriver.class);
    conf.setJobName("Bayes Feature Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);
    conf.setPartitionerClass(FeaturePartitioner.class);
    conf.setOutputKeyComparatorClass(FeatureLabelComparator.class);
    FileInputFormat.setInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, output);

    conf.setMapperClass(BayesFeatureMapper.class);
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setCombinerClass(BayesFeatureCombiner.class);
    conf.setReducerClass(BayesFeatureReducer.class);
    conf.setOutputFormat(BayesFeatureOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // this conf parameter needs to be set to enable serialisation of conf values

    HadoopUtil.overwriteOutput(output);
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("TfIdf Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-featureCount"));
    Path outPath = new Path(output, "trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    conf.setJarByClass(BayesTfIdfDriver.class);
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like io.serializations can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path interimFile = new Path(output, "trainer-docCount/part-*");
    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile, conf);

    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelDocumentCounts));
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);

    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
    log.info(params.print());

    if (params.get("dataSource").equals("hbase")) {
        String tableName = output.toString();
        HBaseConfiguration hc = new HBaseConfiguration(new Configuration());
        HTableDescriptor ht = new HTableDescriptor(tableName);
        HColumnDescriptor hcd = new HColumnDescriptor(BayesConstants.HBASE_COLUMN_FAMILY + ':');
        hcd.setBloomfilter(true);
        hcd.setInMemory(true);
        hcd.setMaxVersions(1);
        hcd.setBlockCacheEnabled(true);
        ht.addFamily(hcd);

        log.info("Connecting to hbase...");
        HBaseAdmin hba = new HBaseAdmin(hc);
        log.info("Creating Table {}", output);

        if (hba.tableExists(tableName)) {
            hba.disableTable(tableName);
            hba.deleteTable(tableName);
            hba.majorCompact(".META.");
        }
        hba.createTable(ht);
        conf.set("output.table", tableName);
    }
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesWeightSummerDriver.java
License:Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("Bayes Weight Summer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-weights");
    FileOutputFormat.setOutputPath(conf, outPath);
    HadoopUtil.overwriteOutput(outPath);
    // conf.setNumReduceTasks(1);
    // conf.setNumMapTasks(100);
    conf.setMapperClass(BayesWeightSummerMapper.class);
    // see the javadoc for the spec for file input formats: first token is key,
    // rest is input. Whole document on one line
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesWeightSummerReducer.class);
    conf.setReducerClass(BayesWeightSummerReducer.class);
    conf.setOutputFormat(BayesWeightSummerOutputFormat.class);

    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.cbayes.CBayesNormalizedWeightDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-theta"));
    Path outPath = new Path(output + "/trainer-weight");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesNormalizedWeightMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesNormalizedWeightReducer.class);
    conf.setReducerClass(CBayesNormalizedWeightReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like io.serializations can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path thetaNormalizationsFiles = new Path(output + "/trainer-thetaNormalizer/part*");
    Map<String, Double> thetaNormalizer = SequenceFileModelReader.readLabelSums(dfs, thetaNormalizationsFiles, conf);
    double perLabelWeightSumNormalisationFactor = Double.MAX_VALUE;

    for (Map.Entry<String, Double> stringDoubleEntry1 : thetaNormalizer.entrySet()) {
        double Sigma_W_ij = stringDoubleEntry1.getValue();
        if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
            perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
        }
    }

    for (Map.Entry<String, Double> stringDoubleEntry : thetaNormalizer.entrySet()) {
        double Sigma_W_ij = stringDoubleEntry.getValue();
        thetaNormalizer.put(stringDoubleEntry.getKey(), Sigma_W_ij / perLabelWeightSumNormalisationFactor);
    }

    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(thetaNormalizer));
    String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
    Map<String, Double> c = mapStringifier.fromString(thetaNormalizationsString);
    log.info("{}", c);
    conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.classifier.cbayes.CBayesThetaNormalizerDriver.java
License:Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output + "/trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    //conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this: Hadoop conf parameters like io.serializations can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path Sigma_kFiles = new Path(output + "/trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, Sigma_kFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigma_kSigma_jFile = new Path(output + "/trainer-weights/Sigma_kSigma_j/*");
    double sigma_jSigma_k = SequenceFileModelReader.readSigma_jSigma_k(dfs, sigma_kSigma_jFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigma_jSigma_kString = stringifier.toString(sigma_jSigma_k);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigma_jSigma_k = stringifier.fromString(sigma_jSigma_kString);
    log.info("{}", retSigma_jSigma_k);
    conf.set("cnaivebayes.sigma_jSigma_k", sigma_jSigma_kString);

    Path vocabCountFile = new Path(output + "/trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file:org.apache.mahout.math.hadoop.MatrixMultiplicationJob.java
License:Apache License
public static Configuration createMatrixMultiplyJobConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality) {
    JobConf conf = new JobConf(initialConf, MatrixMultiplicationJob.class);
    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
        CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));
    conf.setInt(OUT_CARD, outCardinality);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(MatrixMultiplyMapper.class);
    conf.setCombinerClass(MatrixMultiplicationReducer.class);
    conf.setReducerClass(MatrixMultiplicationReducer.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);
    return conf;
}
From source file:org.apache.nutch.crawl.CrawlDbReader.java
License:Apache License
public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics start: " + crawlDb);
    }
    Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis());

    JobConf job = new NutchJob(config);
    job.setJobName("stats " + crawlDb);
    job.setBoolean("db.reader.stats.sort", sort);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDbStatMapper.class);
    job.setCombinerClass(CrawlDbStatCombiner.class);
    job.setReducerClass(CrawlDbStatReducer.class);

    FileOutputFormat.setOutputPath(job, tmpFolder);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // https://issues.apache.org/jira/browse/NUTCH-1029
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    JobClient.runJob(job);

    // reading the result
    FileSystem fileSystem = FileSystem.get(config);
    SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);

    Text key = new Text();
    LongWritable value = new LongWritable();

    TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
    for (int i = 0; i < readers.length; i++) {
        SequenceFile.Reader reader = readers[i];
        while (reader.next(key, value)) {
            String k = key.toString();
            LongWritable val = stats.get(k);
            if (val == null) {
                val = new LongWritable();
                if (k.equals("scx"))
                    val.set(Long.MIN_VALUE);
                if (k.equals("scn"))
                    val.set(Long.MAX_VALUE);
                stats.put(k, val);
            }
            if (k.equals("scx")) {
                if (val.get() < value.get())
                    val.set(value.get());
            } else if (k.equals("scn")) {
                if (val.get() > value.get())
                    val.set(value.get());
            } else {
                val.set(val.get() + value.get());
            }
        }
        reader.close();
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Statistics for CrawlDb: " + crawlDb);
        LongWritable totalCnt = stats.get("T");
        stats.remove("T");
        LOG.info("TOTAL urls:\t" + totalCnt.get());
        for (Map.Entry<String, LongWritable> entry : stats.entrySet()) {
            String k = entry.getKey();
            LongWritable val = entry.getValue();
            if (k.equals("scn")) {
                LOG.info("min score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("scx")) {
                LOG.info("max score:\t" + (float) (val.get() / 1000.0f));
            } else if (k.equals("sct")) {
                LOG.info("avg score:\t" + (float) ((((double) val.get()) / totalCnt.get()) / 1000.0));
            } else if (k.startsWith("status")) {
                String[] st = k.split(" ");
                int code = Integer.parseInt(st[1]);
                if (st.length > 2)
                    LOG.info(" " + st[2] + " :\t" + val);
                else
                    LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val);
            } else
                LOG.info(k + ":\t" + val);
        }
    }
    // removing the tmp folder
    fileSystem.delete(tmpFolder, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb statistics: done");
    }
}
From source file:org.apache.nutch.crawl.LinkDb.java
License:Apache License
private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(LinkDb.class);
    job.setCombinerClass(LinkDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(linkDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("LinkDb createJob: " + e);
        }
    }
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    return job;
}
From source file:org.apache.nutch.scoring.webgraph.LinkRank.java
License:Apache License
/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 *
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {
    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process, if none, webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete temp file and convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}