Example usage for the org.apache.hadoop.io.DoubleWritable constructor DoubleWritable(double)

Introduction

On this page you can find example usages of the org.apache.hadoop.io.DoubleWritable constructor DoubleWritable(double value).

Prototype

public DoubleWritable(double value)

Source Link

Usage

From source file:org.apache.mahout.classifier.bayes.common.BayesWeightSummerReducer.java

License:Apache License

/**
 * Adds up every value received for {@code key} and emits one total for that key.
 *
 * @param key
 *          the grouping key; it is written back unchanged
 * @param values
 *          the partial weights to accumulate
 * @param output
 *          receives the single (key, total) pair
 * @param reporter
 *          Hadoop status reporter (not used by this method)
 * @throws IOException
 *           if the collector fails to accept the result
 */
@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    double total = 0.0;
    for (; values.hasNext();) {
        DoubleWritable partial = values.next();
        total += partial.get();
    }
    DoubleWritable result = new DoubleWritable(total);
    output.collect(key, result);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierMapper.java

License:Apache License

/**
 * Parallel Classification/*www  . jav a 2  s.c  o  m*/
 * 
 * @param key
 *          The label
 * @param value
 *          the features (all unique) associated w/ this label
 * @param output
 *          The OutputCollector to write the results to
 * @param reporter
 *          Reports status back to hadoop
 */
@Override
public void map(Text key, Text value, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String label = key.toString();

    List<String> ngrams = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();

    try {
        ClassifierResult result = classifier.classifyDocument(ngrams.toArray(new String[ngrams.size()]),
                defaultCategory);

        String correctLabel = label;
        String classifiedLabel = result.getLabel();

        StringTuple outputTuple = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
        outputTuple.add(correctLabel);
        outputTuple.add(classifiedLabel);

        output.collect(outputTuple, new DoubleWritable(1.0));
    } catch (InvalidDatastoreException e) {
        throw new IOException(e.toString());
    }
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierReducer.java

License:Apache License

/**
 * Totals the values seen for one key and emits the total under the same key.
 *
 * <p>The reporter status is refreshed before each value is consumed and once more with the final total.
 *
 * @param key
 *          the tuple being reduced; it is written back unchanged
 * @param values
 *          the partial counts to accumulate
 * @param output
 *          receives the single (key, total) pair
 * @param reporter
 *          receives the status updates
 * @throws IOException
 *           if the collector fails to accept the result
 */
@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    double total = 0.0;
    for (; values.hasNext();) {
        reporter.setStatus("Classifier Reducer:" + key);
        DoubleWritable partial = values.next();
        total += partial.get();
    }

    reporter.setStatus("Bayes Classifier Reducer: " + key + " => " + total);
    DoubleWritable result = new DoubleWritable(total);
    output.collect(key, result);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerMapper.java

License:Apache License

/**
 * Emits, for the label found at position 1 of the key, the log weight that contributes to that label's
 * theta normalization factor: {@code log((value + alphaI) / (labelWeightSum[label] + vocabCount))}.
 *
 * @param key
 *          The label,feature pair
 * @param value
 *          The tfIdf of the pair
 * @param output
 *          receives a (LABEL_THETA_NORMALIZER, label) tuple with the computed weight
 * @param reporter
 *          receives a status line naming the label
 * @throws IOException
 *           if the collector fails to accept the result
 */
@Override
public void map(StringTuple key, DoubleWritable value, OutputCollector<StringTuple, DoubleWritable> output,
        Reporter reporter) throws IOException {
    String label = key.stringAt(1);
    reporter.setStatus("Bayes Theta Normalizer Mapper: " + label);

    double numerator = value.get() + alphaI;
    double denominator = labelWeightSum.get(label) + vocabCount;
    double weight = Math.log(numerator / denominator);

    StringTuple normalizerKey = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
    normalizerKey.add(label);

    DoubleWritable normalizerValue = new DoubleWritable(weight);
    output.collect(normalizerKey, normalizerValue);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerReducer.java

License:Apache License

/**
 * Sums the weights received for one key and emits the sum under the same key.
 *
 * <p>When {@code useHbase} is set and the key's first element is
 * {@code BayesConstants.LABEL_THETA_NORMALIZER}, the sum is also written to {@code table}: row
 * LABEL_THETA_NORMALIZER, column family HBASE_COLUMN_FAMILY, qualifier = the label at key position 1.
 *
 * @param key
 *          the tuple being reduced; it is written back unchanged
 * @param values
 *          the partial weights to accumulate
 * @param output
 *          receives the single (key, sum) pair
 * @param reporter
 *          receives a status line per value and one with the final sum
 * @throws IOException
 *           if the table write or the collector fails
 */
@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    double total = 0.0;
    for (; values.hasNext();) {
        reporter.setStatus("Bayes Theta Normalizer Reducer: " + key);
        total += values.next().get();
    }
    reporter.setStatus("Bayes Theta Normalizer Reducer: " + key + " => " + total);

    // Short-circuit keeps the key lookup from running unless HBase output is enabled.
    if (useHbase && key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) {
        String label = key.stringAt(1);
        byte[] row = Bytes.toBytes(BayesConstants.LABEL_THETA_NORMALIZER);
        Put put = new Put(row);
        put.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes.toBytes(label),
                Bytes.toBytes(total));
        table.put(put);
    }

    output.collect(key, new DoubleWritable(total));
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerMapper.java

License:Apache License

/**
 * Emits log weights keyed by (LABEL_THETA_NORMALIZER, label).
 *
 * <p>If the key's first element is {@code BayesConstants.FEATURE_SUM}, one weight is emitted for every
 * label in {@code labelWeightSum}:
 * {@code log((value + alphaI) / (sigmaJSigmaK - sigmaJ + vocabCount))}. Otherwise a single weight is
 * emitted for the label at key position 1, computed from the value {@code dIJ} as
 * {@code log(1 - dIJ / (0.5 * (sigmaJSigmaK / vocabCount + dIJ * labelCount)))}.
 *
 * @param key
 *          The label,feature pair (can either be the freq Count or the term Document count
 * @param value
 *          the number attached to the key
 * @param output
 *          receives the normalizer tuples
 * @param reporter
 *          receives a status line for every emitted weight
 * @throws IOException
 *           if the collector fails outside the per-label callback
 * @throws IllegalStateException
 *           wrapping an IOException raised by the collector inside the per-label callback
 */
@Override
public void map(StringTuple key, final DoubleWritable value,
        final OutputCollector<StringTuple, DoubleWritable> output, final Reporter reporter) throws IOException {

    boolean isFeatureSum = key.stringAt(0).equals(BayesConstants.FEATURE_SUM);

    if (!isFeatureSum) {
        String label = key.stringAt(1);

        double dIJ = value.get();
        double denominator = 0.5 * (sigmaJSigmaK / vocabCount + dIJ * this.labelWeightSum.size());
        double weight = Math.log(1.0 - dIJ / denominator);

        reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);

        // output -D_ij
        StringTuple singleKey = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
        singleKey.add(label);
        output.collect(singleKey, new DoubleWritable(weight));
        return;
    }

    // Key comes from the Sigma_j folder: fan the value out to every known label.
    labelWeightSum.forEachPair(new ObjectDoubleProcedure<String>() {

        @Override
        public boolean apply(String label, double sigmaJ) {
            double weight = Math.log((value.get() + alphaI) / (sigmaJSigmaK - sigmaJ + vocabCount));

            reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);

            StringTuple perLabelKey = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
            perLabelKey.add(label);
            try {
                // output Sigma_j
                output.collect(perLabelKey, new DoubleWritable(weight));
            } catch (IOException e) {
                // The callback signature cannot declare IOException, so rethrow unchecked.
                throw new IllegalStateException(e);
            }
            return true;
        }
    });
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerReducer.java

License:Apache License

/**
 * Sums the weights received for one key and emits the sum under the same key.
 *
 * <p>When {@code useHbase} is set and the key's first element is
 * {@code BayesConstants.LABEL_THETA_NORMALIZER}, the sum is also written to {@code table}: row
 * LABEL_THETA_NORMALIZER, column family HBASE_COLUMN_FAMILY, qualifier = the label at key position 1.
 *
 * @param key
 *          the tuple being reduced; it is written back unchanged
 * @param values
 *          the partial weights to accumulate
 * @param output
 *          receives the single (key, sum) pair
 * @param reporter
 *          receives a status line per value and one with the final sum
 * @throws IOException
 *           if the table write or the collector fails
 */
@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    double total = 0.0;
    for (; values.hasNext();) {
        reporter.setStatus("Complementary Bayes Theta Normalizer Reducer: " + key);
        total += values.next().get();
    }
    reporter.setStatus("Complementary Bayes Theta Normalizer Reducer: " + key + " => " + total);

    // Short-circuit keeps the key lookup from running unless HBase output is enabled.
    if (useHbase && key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) {
        String label = key.stringAt(1);
        byte[] row = Bytes.toBytes(BayesConstants.LABEL_THETA_NORMALIZER);
        Put put = new Put(row);
        put.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes.toBytes(label),
                Bytes.toBytes(total));
        table.put(put);
    }

    output.collect(key, new DoubleWritable(total));
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureCombiner.java

License:Apache License

/**
 * Combines the values seen for one key into a single total and emits it under the same key.
 *
 * <p>The reporter status is refreshed before each value is consumed and once more with the final total.
 *
 * @param key
 *          the tuple being combined; it is written back unchanged
 * @param values
 *          the partial counts to accumulate
 * @param output
 *          receives the single (key, total) pair
 * @param reporter
 *          receives the status updates
 * @throws IOException
 *           if the collector fails to accept the result
 */
@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    double total = 0.0;
    for (; values.hasNext();) {
        reporter.setStatus("Feature Combiner:" + key);
        DoubleWritable partial = values.next();
        total += partial.get();
    }

    reporter.setStatus("Bayes Feature Combiner: " + key + " => " + total);
    DoubleWritable result = new DoubleWritable(total);
    output.collect(key, result);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper.java

License:Apache License

/**
 * We need to count the number of times we've seen a term with a given label and we need to output that. But
 * this Mapper does more than just outputing the count. It first does weight normalisation. Secondly, it
 * outputs for each unique word in a document value 1 for summing up as the Term Document Frequency. Which
 * later is used to calculate the Idf Thirdly, it outputs for each label the number of times a document was
 * seen(Also used in Idf Calculation)/*  w  w w. j ava2s.c  om*/
 * 
 * @param key
 *          The label
 * @param value
 *          the features (all unique) associated w/ this label in stringtuple format
 * @param output
 *          The OutputCollector to write the results to
 * @param reporter
 *          Not used
 */
@Override
public void map(Text key, Text value, final OutputCollector<StringTuple, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // String line = value.toString();
    final String label = key.toString();
    String[] tokens = SPACE_PATTERN.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);

    if (gramSize > 1) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)),
                gramSize);
        do {
            String term = (sf.getAttribute(TermAttribute.class)).term();
            if (term.length() > 0) {
                if (wordList.containsKey(term)) {
                    wordList.put(term, 1 + wordList.get(term));
                } else {
                    wordList.put(term, 1);
                }
            }
        } while (sf.incrementToken());
    } else {
        for (String term : tokens) {
            if (wordList.containsKey(term)) {
                wordList.put(term, 1 + wordList.get(term));
            } else {
                wordList.put(term, 1);
            }
        }
    }
    final MutableDouble lengthNormalisationMut = new MutableDouble(0.0);
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String word, int dKJ) {
            lengthNormalisationMut.add(dKJ * dKJ);
            return true;
        }
    });

    final double lengthNormalisation = Math.sqrt(lengthNormalisationMut.doubleValue());

    // Output Length Normalized + TF Transformed Frequency per Word per Class
    // Log(1 + D_ij)/SQRT( SIGMA(k, D_kj) )
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String token, int dKJ) {
            try {
                StringTuple tuple = new StringTuple();
                tuple.add(BayesConstants.WEIGHT);
                tuple.add(label);
                tuple.add(token);
                DoubleWritable f = new DoubleWritable(Math.log(1.0 + dKJ) / lengthNormalisation);
                output.collect(tuple, f);
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            return true;
        }
    });
    reporter.setStatus("Bayes Feature Mapper: Document Label: " + label);

    // Output Document Frequency per Word per Class
    // Corpus Document Frequency (FEATURE_COUNT)
    // Corpus Term Frequency (FEATURE_TF)
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String token, int dKJ) {
            try {
                StringTuple dfTuple = new StringTuple();
                dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY);
                dfTuple.add(label);
                dfTuple.add(token);
                output.collect(dfTuple, ONE);

                StringTuple tokenCountTuple = new StringTuple();
                tokenCountTuple.add(BayesConstants.FEATURE_COUNT);
                tokenCountTuple.add(token);
                output.collect(tokenCountTuple, ONE);

                StringTuple tokenTfTuple = new StringTuple();
                tokenTfTuple.add(BayesConstants.FEATURE_TF);
                tokenTfTuple.add(token);
                output.collect(tokenTfTuple, new DoubleWritable(dKJ));
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            return true;
        }
    });

    // output that we have seen the label to calculate the Count of Document per
    // class
    StringTuple labelCountTuple = new StringTuple();
    labelCountTuple.add(BayesConstants.LABEL_COUNT);
    labelCountTuple.add(label);
    output.collect(labelCountTuple, ONE);
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureReducer.java

License:Apache License

/**
 * Sums the values for one key and emits the total unless the feature is filtered out by the
 * {@code minSupport} (corpus term frequency) or {@code minDf} (corpus document frequency) thresholds.
 *
 * <p>The method keeps state between calls in {@code currentDfFeature}, {@code currentCorpusTf} and
 * {@code currentCorpusDf}: a FEATURE_TF key records the feature and its corpus term frequency (and is
 * never emitted), a later FEATURE_COUNT key records the corpus document frequency, and the remaining
 * keys for that feature are checked against both recorded values. LABEL_COUNT keys are always emitted.
 *
 * @param key
 *          a 2- or 3-element tuple whose first element is the tuple type
 * @param values
 *          the partial counts or weights to accumulate
 * @param output
 *          receives the (key, total) pair when the key is not skipped
 * @param reporter
 *          receives status lines and the "skipped" counters
 * @throws IOException
 *           if the collector fails to accept the result
 * @throws IllegalArgumentException
 *           if the key does not have 2 or 3 elements
 * @throws IllegalStateException
 *           if feature data arrives before the FEATURE_TF or FEATURE_COUNT record it depends on
 */
@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {

    // StringTuple key is either:
    // type, word        for type=FEATURE_COUNT or FEATURE_TF tuples
    // type, label       for type=LABEL_COUNT tuples
    // type, label, word for type=DOCUMENT_FREQUENCY tuples
    // NOTE(review): BayesFeatureMapper appears to emit WEIGHT as (type, label, word) - confirm.

    double sum = 0.0;
    while (values.hasNext()) {
        reporter.setStatus("Feature Reducer:" + key);
        sum += values.next().get();
    }
    reporter.setStatus("Bayes Feature Reducer: " + key + " => " + sum);

    if (2 > key.length() || key.length() > 3) {
        // The accepted range is inclusive: lengths 2 and 3 are both valid.
        throw new IllegalArgumentException(
                "StringTuple length out of bounds, not (2 <= length <= 3): " + key.length());
    }

    // The feature is the last element: index 1 for 2-element keys, index 2 for 3-element keys.
    int featureIndex = key.length() == 2 ? 1 : 2;

    // FeatureLabelComparator guarantees that for a given label, we will
    // see FEATURE_TF items first, FEATURE_COUNT items second,
    // DOCUMENT_FREQUENCY items next and finally WEIGHT items, while
    // the FeaturePartitioner guarantees that all tuples containing a given term
    // will be handled by the same reducer.
    if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) {
        /* no-op, just collect */
    } else if (key.stringAt(0).equals(BayesConstants.FEATURE_TF)) {
        currentDfFeature = key.stringAt(1);
        currentCorpusTf = sum;
        currentCorpusDf = -1; // sentinel: no FEATURE_COUNT seen yet for this feature

        if (0 < minSupport && currentCorpusTf < minSupport) {
            reporter.incrCounter("skipped", "less_than_minSupport", 1);
        }
        return; // never emit FEATURE_TF tuples.
    } else if (!key.stringAt(featureIndex).equals(currentDfFeature)) {
        throw new IllegalStateException("Found feature data " + key + " prior to feature tf");
    } else if (0 < minSupport && currentCorpusTf < minSupport) {
        reporter.incrCounter("skipped", "less_than_minSupport_label-term", 1);
        return; // skip items whose corpus term frequency is below minSupport.
    } else if (key.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) {
        currentCorpusDf = sum;

        if (0 < minDf && currentCorpusDf < minDf) {
            reporter.incrCounter("skipped", "less_than_minDf", 1);
            return; // skip items whose corpus document frequency is below minDf.
        }
    } else if (currentCorpusDf == -1) {
        throw new IllegalStateException("Found feature data " + key + " prior to feature count");
    } else if (0 < minDf && currentCorpusDf < minDf) {
        reporter.incrCounter("skipped", "less_than_minDf_label-term", 1);
        return; // skip items whose corpus document frequency is below minDf.
    }
    output.collect(key, new DoubleWritable(sum));
}