List of usage examples for the org.apache.hadoop.io.DoubleWritable constructor
public DoubleWritable(double value)
From source file:org.apache.mahout.classifier.bayes.common.BayesWeightSummerReducer.java
License:Apache License
@Override public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output, Reporter reporter) throws IOException { //Key is label,word, value is the tfidf of the feature of times we've seen this label word per local node. Output is the same double sum = 0.0; while (values.hasNext()) { sum += values.next().get();//from w ww . jav a 2 s . co m } output.collect(key, new DoubleWritable(sum)); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierMapper.java
License:Apache License
/** * Parallel Classification/*www . jav a 2 s.c o m*/ * * @param key * The label * @param value * the features (all unique) associated w/ this label * @param output * The OutputCollector to write the results to * @param reporter * Reports status back to hadoop */ @Override public void map(Text key, Text value, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { String label = key.toString(); List<String> ngrams = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel(); try { ClassifierResult result = classifier.classifyDocument(ngrams.toArray(new String[ngrams.size()]), defaultCategory); String correctLabel = label; String classifiedLabel = result.getLabel(); StringTuple outputTuple = new StringTuple(BayesConstants.CLASSIFIER_TUPLE); outputTuple.add(correctLabel); outputTuple.add(classifiedLabel); output.collect(outputTuple, new DoubleWritable(1.0)); } catch (InvalidDatastoreException e) { throw new IOException(e.toString()); } }
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierReducer.java
License:Apache License
@Override public void reduce(StringTuple key, Iterator<DoubleWritable> values, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { // Key is label,word, value is the number of times we've seen this label word per local node. Output is // the same// w ww. java2s . c o m double sum = 0.0; while (values.hasNext()) { reporter.setStatus("Classifier Reducer:" + key); sum += values.next().get(); } reporter.setStatus("Bayes Classifier Reducer: " + key + " => " + sum); output.collect(key, new DoubleWritable(sum)); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerMapper.java
License:Apache License
/**
 * Emits one term of the theta-normalization factor for a label: the natural log of
 * {@code (value + alphaI) / (labelWeightSum[label] + vocabCount)}, keyed by
 * {@code [LABEL_THETA_NORMALIZER, label]}.
 *
 * @param key
 *          The label,feature pair; the label is read from position 1
 * @param value
 *          The tfIdf of the pair
 * @param output
 *          collector that receives the (normalizer tuple, log weight) pair
 * @param reporter
 *          receives a status line naming the label being processed
 * @throws IOException
 *           if the collector fails to accept the result
 */
@Override
public void map(StringTuple key,
                DoubleWritable value,
                OutputCollector<StringTuple, DoubleWritable> output,
                Reporter reporter) throws IOException {
  String label = key.stringAt(1);
  reporter.setStatus("Bayes Theta Normalizer Mapper: " + label);

  double numerator = value.get() + alphaI;
  double denominator = labelWeightSum.get(label) + vocabCount;
  double logWeight = Math.log(numerator / denominator);

  StringTuple normalizerKey = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
  normalizerKey.add(label);
  output.collect(normalizerKey, new DoubleWritable(logWeight));
}
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerReducer.java
License:Apache License
@Override public void reduce(StringTuple key, Iterator<DoubleWritable> values, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { // Key is label,word, value is the number of times we've seen this label // word per local node. Output is the same // String token = key.toString(); double weightSumPerLabel = 0.0; while (values.hasNext()) { reporter.setStatus("Bayes Theta Normalizer Reducer: " + key); weightSumPerLabel += values.next().get(); }/*from w w w .j ava2 s . c o m*/ reporter.setStatus("Bayes Theta Normalizer Reducer: " + key + " => " + weightSumPerLabel); if (useHbase) { if (key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) { String label = key.stringAt(1); Put bu = new Put(Bytes.toBytes(BayesConstants.LABEL_THETA_NORMALIZER)); bu.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes.toBytes(label), Bytes.toBytes(weightSumPerLabel)); table.put(bu); } } output.collect(key, new DoubleWritable(weightSumPerLabel)); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerMapper.java
License:Apache License
/** * We need to calculate the idf of each feature in each label * //from w w w. j av a 2 s. co m * @param key * The label,feature pair (can either be the freq Count or the term Document count */ @Override public void map(StringTuple key, final DoubleWritable value, final OutputCollector<StringTuple, DoubleWritable> output, final Reporter reporter) throws IOException { if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) { // if it is from // the Sigma_j // folder labelWeightSum.forEachPair(new ObjectDoubleProcedure<String>() { @Override public boolean apply(String label, double sigmaJ) { double weight = Math.log((value.get() + alphaI) / (sigmaJSigmaK - sigmaJ + vocabCount)); reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight); StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER); normalizerTuple.add(label); try { output.collect(normalizerTuple, new DoubleWritable(weight)); } catch (IOException e) { throw new IllegalStateException(e); } // output Sigma_j return true; } }); } else { String label = key.stringAt(1); double dIJ = value.get(); double denominator = 0.5 * (sigmaJSigmaK / vocabCount + dIJ * this.labelWeightSum.size()); double weight = Math.log(1.0 - dIJ / denominator); reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight); StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER); normalizerTuple.add(label); // output -D_ij output.collect(normalizerTuple, new DoubleWritable(weight)); } }
From source file:org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerReducer.java
License:Apache License
@Override public void reduce(StringTuple key, Iterator<DoubleWritable> values, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { // Key is label,word, value is the number of times we've seen this label // word per local node. Output is the same double weightSumPerLabel = 0.0; while (values.hasNext()) { reporter.setStatus("Complementary Bayes Theta Normalizer Reducer: " + key); weightSumPerLabel += values.next().get(); }/*from w ww.ja v a 2s . c o m*/ reporter.setStatus("Complementary Bayes Theta Normalizer Reducer: " + key + " => " + weightSumPerLabel); if (useHbase) { if (key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) { String label = key.stringAt(1); Put bu = new Put(Bytes.toBytes(BayesConstants.LABEL_THETA_NORMALIZER)); bu.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes.toBytes(label), Bytes.toBytes(weightSumPerLabel)); table.put(bu); } } output.collect(key, new DoubleWritable(weightSumPerLabel)); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureCombiner.java
License:Apache License
@Override public void reduce(StringTuple key, Iterator<DoubleWritable> values, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { // Key is label,word, value is the number of times we've seen this label // word per local node. Output is the same double sum = 0.0; while (values.hasNext()) { reporter.setStatus("Feature Combiner:" + key); sum += values.next().get();//www .ja v a 2 s .c om } reporter.setStatus("Bayes Feature Combiner: " + key + " => " + sum); output.collect(key, new DoubleWritable(sum)); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper.java
License:Apache License
/** * We need to count the number of times we've seen a term with a given label and we need to output that. But * this Mapper does more than just outputing the count. It first does weight normalisation. Secondly, it * outputs for each unique word in a document value 1 for summing up as the Term Document Frequency. Which * later is used to calculate the Idf Thirdly, it outputs for each label the number of times a document was * seen(Also used in Idf Calculation)/* w w w. j ava2s.c om*/ * * @param key * The label * @param value * the features (all unique) associated w/ this label in stringtuple format * @param output * The OutputCollector to write the results to * @param reporter * Not used */ @Override public void map(Text key, Text value, final OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { // String line = value.toString(); final String label = key.toString(); String[] tokens = SPACE_PATTERN.split(value.toString()); OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize); if (gramSize > 1) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)), gramSize); do { String term = (sf.getAttribute(TermAttribute.class)).term(); if (term.length() > 0) { if (wordList.containsKey(term)) { wordList.put(term, 1 + wordList.get(term)); } else { wordList.put(term, 1); } } } while (sf.incrementToken()); } else { for (String term : tokens) { if (wordList.containsKey(term)) { wordList.put(term, 1 + wordList.get(term)); } else { wordList.put(term, 1); } } } final MutableDouble lengthNormalisationMut = new MutableDouble(0.0); wordList.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String word, int dKJ) { lengthNormalisationMut.add(dKJ * dKJ); return true; } }); final double lengthNormalisation = Math.sqrt(lengthNormalisationMut.doubleValue()); // Output Length Normalized + TF Transformed Frequency per Word per Class // Log(1 
+ D_ij)/SQRT( SIGMA(k, D_kj) ) wordList.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String token, int dKJ) { try { StringTuple tuple = new StringTuple(); tuple.add(BayesConstants.WEIGHT); tuple.add(label); tuple.add(token); DoubleWritable f = new DoubleWritable(Math.log(1.0 + dKJ) / lengthNormalisation); output.collect(tuple, f); } catch (IOException e) { throw new IllegalStateException(e); } return true; } }); reporter.setStatus("Bayes Feature Mapper: Document Label: " + label); // Output Document Frequency per Word per Class // Corpus Document Frequency (FEATURE_COUNT) // Corpus Term Frequency (FEATURE_TF) wordList.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String token, int dKJ) { try { StringTuple dfTuple = new StringTuple(); dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY); dfTuple.add(label); dfTuple.add(token); output.collect(dfTuple, ONE); StringTuple tokenCountTuple = new StringTuple(); tokenCountTuple.add(BayesConstants.FEATURE_COUNT); tokenCountTuple.add(token); output.collect(tokenCountTuple, ONE); StringTuple tokenTfTuple = new StringTuple(); tokenTfTuple.add(BayesConstants.FEATURE_TF); tokenTfTuple.add(token); output.collect(tokenTfTuple, new DoubleWritable(dKJ)); } catch (IOException e) { throw new IllegalStateException(e); } return true; } }); // output that we have seen the label to calculate the Count of Document per // class StringTuple labelCountTuple = new StringTuple(); labelCountTuple.add(BayesConstants.LABEL_COUNT); labelCountTuple.add(label); output.collect(labelCountTuple, ONE); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureReducer.java
License:Apache License
@Override public void reduce(StringTuple key, Iterator<DoubleWritable> values, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { // StringTuple key is either: // type, word for type=FEATURE_COUNT, FEATURE_TF or WEIGHT tuples // type, label for type=LABEL_COUNT_TUPLES // type, label, word for type=DOCUMENT_FREQUENCY tuples double sum = 0.0; while (values.hasNext()) { reporter.setStatus("Feature Reducer:" + key); sum += values.next().get();//from ww w . j a v a 2s . c o m } reporter.setStatus("Bayes Feature Reducer: " + key + " => " + sum); if (2 > key.length() || key.length() > 3) { throw new IllegalArgumentException("StringTuple length out of bounds, not (2 < length < 3)"); } int featureIndex = key.length() == 2 ? 1 : 2; // FeatureLabelComparator guarantees that for a given label, we will // see FEATURE_TF items first, FEATURE_COUNT items second, // DOCUMENT_FREQUENCY items next and finally WEIGHT items, while // the FeaturePartitioner guarantees that all tuples containing a given term // will be handled by the same reducer. if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) { /* no-op, just collect */ } else if (key.stringAt(0).equals(BayesConstants.FEATURE_TF)) { currentDfFeature = key.stringAt(1); currentCorpusTf = sum; currentCorpusDf = -1; if (0 < minSupport && currentCorpusTf < minSupport) { reporter.incrCounter("skipped", "less_than_minSupport", 1); } return; // never emit FEATURE_TF tuples. } else if (!key.stringAt(featureIndex).equals(currentDfFeature)) { throw new IllegalStateException("Found feature data " + key + " prior to feature tf"); } else if (0 < minSupport && currentCorpusTf < minSupport) { reporter.incrCounter("skipped", "less_than_minSupport_label-term", 1); return; // skip items that have less than a specified frequency. 
} else if (key.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) { currentCorpusDf = sum; if (0 < minDf && currentCorpusDf < minDf) { reporter.incrCounter("skipped", "less_than_minDf", 1); return; // skip items that have less than the specified minSupport. } } else if (currentCorpusDf == -1) { throw new IllegalStateException("Found feature data " + key + " prior to feature count"); } else if (0 < minDf && currentCorpusDf < minDf) { reporter.incrCounter("skipped", "less_than_minDf_label-term", 1); return; // skip items that have less than a specified frequency. } output.collect(key, new DoubleWritable(sum)); }