List of usage examples for org.apache.hadoop.io DefaultStringifier toString
@Override public String toString(T obj) throws IOException
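Before the per-project examples, here is a minimal, self-contained sketch of the round trip every snippet below performs: toString(T) serializes the value and Base64-encodes the bytes so they can ride along in the job configuration, and fromString(String) rebuilds the value on the task side. The class name StringifierRoundTrip and the key "example.counts" are illustrative only; they do not come from any of the examples below.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.util.GenericsUtil;

public class StringifierRoundTrip {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // DefaultStringifier picks a serializer for the value's class from
        // "io.serializations"; JavaSerialization covers any Serializable type.
        conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        Map<String, Double> counts = new HashMap<String, Double>();
        counts.put("label1", 42.0);

        DefaultStringifier<Map<String, Double>> stringifier =
            new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(counts));

        String encoded = stringifier.toString(counts); // Base64 of the serialized map
        conf.set("example.counts", encoded);           // hypothetical key, stashed in the conf

        Map<String, Double> decoded = stringifier.fromString(conf.get("example.counts"));
        System.out.println(decoded);                   // {label1=42.0}
    }
}

For a single value keyed in a Configuration, the static helpers DefaultStringifier.store(conf, item, keyName) and DefaultStringifier.load(conf, keyName, itemClass) wrap the same toString()/fromString() calls.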
From source file: org.apache.mahout.classifier.bayes.common.BayesTfIdfMapper.java
License: Apache License
@Override
public void configure(JobConf job) {
    try {
        if (labelDocumentCounts == null) {
            labelDocumentCounts = new HashMap<String, Double>();
            DefaultStringifier<Map<String, Double>> mapStringifier =
                new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelDocumentCounts));
            String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);
            labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
            labelDocumentCounts = mapStringifier.fromString(labelDocumentCountString);
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
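Note the idiom, which recurs in every configure()/setup() method in this list: the mapper first stringifies an empty map so that the result of toString() can serve as the fallback for job.get(key, default); fromString() then rebuilds either the driver-supplied map or, if the key was never set, the empty one.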
From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerDriver.java
License: Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesThetaNormalizerDriver.class);
    conf.setJobName("Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(BayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesThetaNormalizerReducer.class);
    conf.setReducerClass(BayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf
    // parameters can make or break a piece of code.

    HadoopUtil.overwriteOutput(outPath);
    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaJSigmaKFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaJSigmaKFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
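The "io.serializations" setting the comment insists on is what makes these DefaultStringifier calls possible: java.util.HashMap is not a Writable, so the stringifier relies on JavaSerialization being registered to find a serializer for the map, and without it the toString()/fromString() calls fail at runtime.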
From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerMapper.java
License: Apache License
@Override
public void configure(JobConf job) {
    try {
        labelWeightSum.clear();
        Map<String, Double> labelWeightSumTemp = new HashMap<String, Double>();
        DefaultStringifier<Map<String, Double>> mapStringifier =
            new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelWeightSumTemp));
        String labelWeightSumString = job.get("cnaivebayes.sigma_k", mapStringifier.toString(labelWeightSumTemp));
        labelWeightSumTemp = mapStringifier.fromString(labelWeightSumString);
        for (Map.Entry<String, Double> stringDoubleEntry : labelWeightSumTemp.entrySet()) {
            this.labelWeightSum.put(stringDoubleEntry.getKey(), stringDoubleEntry.getValue());
        }

        DefaultStringifier<Double> stringifier =
            new DefaultStringifier<Double>(job, GenericsUtil.getClass(sigmaJSigmaK));
        String sigmaJSigmaKString = job.get("cnaivebayes.sigma_jSigma_k", stringifier.toString(sigmaJSigmaK));
        sigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);

        String vocabCountString = stringifier.toString(vocabCount);
        vocabCountString = job.get("cnaivebayes.vocabCount", vocabCountString);
        vocabCount = stringifier.fromString(vocabCountString);

        Parameters params = Parameters.fromString(job.get("bayes.parameters", ""));
        alphaI = Double.valueOf(params.get("alpha_i", "1.0"));
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file: org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerDriver.java
License: Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(CBayesThetaNormalizerDriver.class);
    conf.setJobName("Complementary Bayes Theta Normalizer Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-weights/Sigma_j"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-tfIdf/trainer-tfIdf"));
    Path outPath = new Path(output, "trainer-thetaNormalizer");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesThetaNormalizerMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesThetaNormalizerReducer.class);
    conf.setReducerClass(CBayesThetaNormalizerReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf
    // parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path sigmaKFiles = new Path(output, "trainer-weights/Sigma_k/*");
    Map<String, Double> labelWeightSum = SequenceFileModelReader.readLabelSums(dfs, sigmaKFiles, conf);
    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelWeightSum));
    String labelWeightSumString = mapStringifier.toString(labelWeightSum);

    log.info("Sigma_k for Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelWeightSumString);
    log.info("{}", c);
    conf.set("cnaivebayes.sigma_k", labelWeightSumString);

    Path sigmaKSigmaJFile = new Path(output, "trainer-weights/Sigma_kSigma_j/*");
    double sigmaJSigmaK = SequenceFileModelReader.readSigmaJSigmaK(dfs, sigmaKSigmaJFile, conf);
    DefaultStringifier<Double> stringifier = new DefaultStringifier<Double>(conf, Double.class);
    String sigmaJSigmaKString = stringifier.toString(sigmaJSigmaK);

    log.info("Sigma_kSigma_j for each Label and for each Features");
    double retSigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);
    log.info("{}", retSigmaJSigmaK);
    conf.set("cnaivebayes.sigma_jSigma_k", sigmaJSigmaKString);

    Path vocabCountFile = new Path(output, "trainer-tfIdf/trainer-vocabCount/*");
    double vocabCount = SequenceFileModelReader.readVocabCount(dfs, vocabCountFile, conf);
    String vocabCountString = stringifier.toString(vocabCount);

    log.info("Vocabulary Count");
    conf.set("cnaivebayes.vocabCount", vocabCountString);
    double retvocabCount = stringifier.fromString(vocabCountString);
    log.info("{}", retvocabCount);

    conf.set("bayes.parameters", params.toString());
    conf.set("output.table", output.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file: org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerMapper.java
License: Apache License
@Override
public void configure(JobConf job) {
    try {
        labelWeightSum.clear();
        Map<String, Double> labelWeightSumTemp = new HashMap<String, Double>();
        DefaultStringifier<Map<String, Double>> mapStringifier =
            new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelWeightSumTemp));
        String labelWeightSumString = job.get("cnaivebayes.sigma_k", mapStringifier.toString(labelWeightSumTemp));
        labelWeightSumTemp = mapStringifier.fromString(labelWeightSumString);
        for (Map.Entry<String, Double> stringDoubleEntry : labelWeightSumTemp.entrySet()) {
            this.labelWeightSum.put(stringDoubleEntry.getKey(), stringDoubleEntry.getValue());
        }

        DefaultStringifier<Double> stringifier =
            new DefaultStringifier<Double>(job, GenericsUtil.getClass(sigmaJSigmaK));
        String sigmaJSigmaKString = job.get("cnaivebayes.sigma_jSigma_k", stringifier.toString(sigmaJSigmaK));
        sigmaJSigmaK = stringifier.fromString(sigmaJSigmaKString);

        String vocabCountString = job.get("cnaivebayes.vocabCount", stringifier.toString(vocabCount));
        vocabCount = stringifier.fromString(vocabCountString);

        Parameters params = Parameters.fromString(job.get("bayes.parameters", ""));
        alphaI = Double.valueOf(params.get("alpha_i", "1.0"));
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file: org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfDriver.java
License: Apache License
@Override
public void runJob(Path input, Path output, BayesParameters params) throws IOException {
    Configurable client = new JobClient();
    JobConf conf = new JobConf(BayesWeightSummerDriver.class);
    conf.setJobName("TfIdf Driver running over input: " + input);

    conf.setOutputKeyClass(StringTuple.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output, "trainer-termDocCount"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-wordFreq"));
    FileInputFormat.addInputPath(conf, new Path(output, "trainer-featureCount"));
    Path outPath = new Path(output, "trainer-tfIdf");
    FileOutputFormat.setOutputPath(conf, outPath);
    // conf.setNumMapTasks(100);
    conf.setJarByClass(BayesTfIdfDriver.class);
    conf.setMapperClass(BayesTfIdfMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(BayesTfIdfReducer.class);
    conf.setReducerClass(BayesTfIdfReducer.class);
    conf.setOutputFormat(BayesTfIdfOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf
    // parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    HadoopUtil.overwriteOutput(outPath);

    Path interimFile = new Path(output, "trainer-docCount/part-*");
    Map<String, Double> labelDocumentCounts = SequenceFileModelReader.readLabelDocumentCounts(dfs, interimFile, conf);

    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(labelDocumentCounts));
    String labelDocumentCountString = mapStringifier.toString(labelDocumentCounts);

    log.info("Counts of documents in Each Label");
    Map<String, Double> c = mapStringifier.fromString(labelDocumentCountString);
    log.info("{}", c);
    conf.set("cnaivebayes.labelDocumentCounts", labelDocumentCountString);
    log.info(params.print());

    if (params.get("dataSource").equals("hbase")) {
        String tableName = output.toString();
        HBaseConfiguration hc = new HBaseConfiguration(new Configuration());
        HTableDescriptor ht = new HTableDescriptor(tableName);
        HColumnDescriptor hcd = new HColumnDescriptor(BayesConstants.HBASE_COLUMN_FAMILY + ':');
        hcd.setBloomfilter(true);
        hcd.setInMemory(true);
        hcd.setMaxVersions(1);
        hcd.setBlockCacheEnabled(true);
        ht.addFamily(hcd);

        log.info("Connecting to hbase...");
        HBaseAdmin hba = new HBaseAdmin(hc);

        log.info("Creating Table {}", output);
        if (hba.tableExists(tableName)) {
            hba.disableTable(tableName);
            hba.deleteTable(tableName);
            hba.majorCompact(".META.");
        }
        hba.createTable(ht);
        conf.set("output.table", tableName);
    }
    conf.set("bayes.parameters", params.toString());

    client.setConf(conf);
    JobClient.runJob(conf);
}
From source file: org.apache.mahout.classifier.bayes.mapreduce.common.BayesTfIdfMapper.java
License: Apache License
@Override
public void configure(JobConf job) {
    try {
        this.labelDocumentCounts.clear();
        Map<String, Double> labelDocCountTemp = new HashMap<String, Double>();
        DefaultStringifier<Map<String, Double>> mapStringifier =
            new DefaultStringifier<Map<String, Double>>(job, GenericsUtil.getClass(labelDocCountTemp));
        String labelDocumentCountString = job.get("cnaivebayes.labelDocumentCounts",
            mapStringifier.toString(labelDocCountTemp));
        labelDocCountTemp = mapStringifier.fromString(labelDocumentCountString);
        for (Map.Entry<String, Double> stringDoubleEntry : labelDocCountTemp.entrySet()) {
            this.labelDocumentCounts.put(stringDoubleEntry.getKey(), stringDoubleEntry.getValue());
        }
    } catch (IOException ex) {
        log.warn(ex.toString(), ex);
    }
}
From source file: org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver.java
License: Apache License
/**
 * Run the job
 *
 * @param input
 *          the input pathname String
 * @param output
 *          the output pathname String
 * @param catFile
 *          the file containing the Wikipedia categories
 * @param exactMatchOnly
 *          if true, then the Wikipedia category must match exactly instead of simply containing the
 *          category string
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void runJob(String input, String output, String catFile, boolean exactMatchOnly,
        Class<? extends Analyzer> analyzerClass) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("key.value.separator.in.input.line", " ");
    conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
    conf.set("xmlinput.end", "</text>");
    conf.setBoolean("exact.match.only", exactMatchOnly);
    conf.set("analyzer.class", analyzerClass.getName());
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf
    // parameters can make or break a piece of code.

    Set<String> categories = new HashSet<String>();
    for (String line : new FileLineIterable(new File(catFile))) {
        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
    }

    DefaultStringifier<Set<String>> setStringifier =
        new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

    String categoriesStr = setStringifier.toString(categories);
    conf.set("wikipedia.categories", categoriesStr);

    Job job = new Job(conf);
    if (log.isInfoEnabled()) {
        log.info("Input: {} Out: {} Categories: {}", new Object[] { input, output, catFile });
    }
    job.setJarByClass(WikipediaDatasetCreatorDriver.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WikipediaDatasetCreatorMapper.class);
    // TODO: job.setNumMapTasks(100);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setReducerClass(WikipediaDatasetCreatorReducer.class);
    job.setOutputFormatClass(WikipediaDatasetCreatorOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(input));
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(job, outPath);
    HadoopUtil.overwriteOutput(outPath);
    job.waitForCompletion(true);
}
From source file: org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorMapper.java
License: Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    try {
        if (inputCategories == null) {
            Set<String> newCategories = new HashSet<String>();
            DefaultStringifier<Set<String>> setStringifier =
                new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories));
            String categoriesStr = setStringifier.toString(newCategories);
            categoriesStr = conf.get("wikipedia.categories", categoriesStr);
            inputCategories = setStringifier.fromString(categoriesStr);
        }
        exactMatchOnly = conf.getBoolean("exact.match.only", false);
        if (analyzer == null) {
            String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName());
            Class<? extends Analyzer> analyzerClass = Class.forName(analyzerStr).asSubclass(Analyzer.class);
            analyzer = analyzerClass.newInstance();
        }
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    } catch (ClassNotFoundException e) {
        throw new IllegalStateException(e);
    } catch (IllegalAccessException e) {
        throw new IllegalStateException(e);
    } catch (InstantiationException e) {
        throw new IllegalStateException(e);
    }
    log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}",
        new Object[] { inputCategories.size(), exactMatchOnly, analyzer.getClass().getName() });
}
From source file: org.apache.mahout.classifier.cbayes.CBayesNormalizedWeightDriver.java
License: Apache License
/**
 * Run the job
 *
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(CBayesNormalizedWeightDriver.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(conf, new Path(output + "/trainer-theta"));
    Path outPath = new Path(output + "/trainer-weight");
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setNumMapTasks(100);
    // conf.setNumReduceTasks(1);
    conf.setMapperClass(CBayesNormalizedWeightMapper.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setCombinerClass(CBayesNormalizedWeightReducer.class);
    conf.setReducerClass(CBayesNormalizedWeightReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    // Don't ever forget this. People should keep track of how hadoop conf
    // parameters can make or break a piece of code.

    FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
    if (dfs.exists(outPath)) {
        dfs.delete(outPath, true);
    }

    Path thetaNormalizationsFiles = new Path(output + "/trainer-thetaNormalizer/part*");
    Map<String, Double> thetaNormalizer = SequenceFileModelReader.readLabelSums(dfs, thetaNormalizationsFiles, conf);

    double perLabelWeightSumNormalisationFactor = Double.MAX_VALUE;
    for (Map.Entry<String, Double> stringDoubleEntry1 : thetaNormalizer.entrySet()) {
        double Sigma_W_ij = stringDoubleEntry1.getValue();
        if (perLabelWeightSumNormalisationFactor > Math.abs(Sigma_W_ij)) {
            perLabelWeightSumNormalisationFactor = Math.abs(Sigma_W_ij);
        }
    }
    for (Map.Entry<String, Double> stringDoubleEntry : thetaNormalizer.entrySet()) {
        double Sigma_W_ij = stringDoubleEntry.getValue();
        thetaNormalizer.put(stringDoubleEntry.getKey(), Sigma_W_ij / perLabelWeightSumNormalisationFactor);
    }

    DefaultStringifier<Map<String, Double>> mapStringifier =
        new DefaultStringifier<Map<String, Double>>(conf, GenericsUtil.getClass(thetaNormalizer));
    String thetaNormalizationsString = mapStringifier.toString(thetaNormalizer);
    Map<String, Double> c = mapStringifier.fromString(thetaNormalizationsString);
    log.info("{}", c);
    conf.set("cnaivebayes.thetaNormalizations", thetaNormalizationsString);

    client.setConf(conf);
    JobClient.runJob(conf);
}
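Worth noting in this last example: before the hand-off through DefaultStringifier, the driver rescales the per-label sums by dividing each Sigma_W_ij by the smallest absolute value among them, so the normalized map it stringifies into "cnaivebayes.thetaNormalizations" has its smallest-magnitude entry at exactly 1 or -1.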