Example usage for org.apache.hadoop.io DoubleWritable DoubleWritable

Introduction

On this page you can find example usages of the org.apache.hadoop.io.DoubleWritable constructor DoubleWritable(double value). All of the examples below come from the Apache Mahout Bayes classifier code.

Prototype

public DoubleWritable(double value) 
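
Before the real-world examples below, here is a minimal, self-contained sketch (not taken from any of the sources on this page) showing what the one-argument constructor does and how the wrapped value round-trips through Hadoop serialization:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableDemo {
    public static void main(String[] args) throws IOException {
        // The constructor under discussion: wraps a primitive double.
        DoubleWritable original = new DoubleWritable(3.25);

        // Serialize the way the MapReduce framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and verify the round trip.
        DoubleWritable copy = new DoubleWritable(0.0);
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(original.get() == copy.get()); // prints: true
    }
}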

Usage

From source file: org.apache.mahout.classifier.bayes.common.BayesWeightSummerReducer.java

License: Apache License

@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the tf-idf weight of this label,word pair per local node. Output is the same

    double sum = 0.0;
    while (values.hasNext()) {
        sum += values.next().get();
    }
    output.collect(key, new DoubleWritable(sum));
}
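
The reducer above allocates a fresh DoubleWritable per output key. A common Hadoop idiom, shown here as a hypothetical variant rather than how the Mahout source is actually written, is to reuse one mutable instance via set(double); this is safe because OutputCollector.collect() serializes the value before returning:

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class SumReducer extends MapReduceBase implements Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    // One shared output instance, mutated per key instead of reallocated.
    private final DoubleWritable result = new DoubleWritable(0.0);

    @Override
    public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
            Reporter reporter) throws IOException {
        double sum = 0.0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        result.set(sum);             // mutate in place
        output.collect(key, result); // the value is serialized here, so reuse is safe
    }
}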

From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierMapper.java

License: Apache License

/**
 * Parallel Classification
 * 
 * @param key
 *          The label
 * @param value
 *          the features (all unique) associated w/ this label
 * @param output
 *          The OutputCollector to write the results to
 * @param reporter
 *          Reports status back to hadoop
 */
@Override
public void map(Text key, Text value, OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String label = key.toString();

    List<String> ngrams = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();

    try {
        ClassifierResult result = classifier.classifyDocument(ngrams.toArray(new String[ngrams.size()]),
                defaultCategory);

        String correctLabel = label;
        String classifiedLabel = result.getLabel();

        StringTuple outputTuple = new StringTuple(BayesConstants.CLASSIFIER_TUPLE);
        outputTuple.add(correctLabel);
        outputTuple.add(classifiedLabel);

        output.collect(outputTuple, new DoubleWritable(1.0));
    } catch (InvalidDatastoreException e) {
        throw new IOException(e.toString());
    }
}
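
Since this mapper always emits the constant 1.0, the per-call allocation can be hoisted into a shared immutable instance; this is the same pattern as the ONE constant used by BayesFeatureMapper further down this page. A small runnable sketch of the idea (the class name is hypothetical):

import org.apache.hadoop.io.DoubleWritable;

public class ConstantWritableDemo {
    // One shared instance for the constant count; never mutated after construction,
    // so it can safely be passed to collect() from every map() call.
    private static final DoubleWritable ONE = new DoubleWritable(1.0);

    public static void main(String[] args) {
        System.out.println(ONE.get()); // prints: 1.0
    }
}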

From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierReducer.java

License: Apache License

@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    // Key is label,word, value is the number of times we've seen this label word per local node. Output is
    // the same

    double sum = 0.0;
    while (values.hasNext()) {
        reporter.setStatus("Classifier Reducer:" + key);
        sum += values.next().get();
    }
    reporter.setStatus("Bayes Classifier Reducer: " + key + " => " + sum);
    output.collect(key, new DoubleWritable(sum));
}

From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerMapper.java

License: Apache License

/**
 * We need to calculate the thetaNormalization factor of each label
 *
 * @param key
 *          The label,feature pair
 * @param value
 *          The tfIdf of the pair
 */
@Override
public void map(StringTuple key, DoubleWritable value, OutputCollector<StringTuple, DoubleWritable> output,
        Reporter reporter) throws IOException {

    String label = key.stringAt(1);

    reporter.setStatus("Bayes Theta Normalizer Mapper: " + label);

    double weight = Math.log((value.get() + alphaI) / (labelWeightSum.get(label) + vocabCount));
    StringTuple thetaNormalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
    thetaNormalizerTuple.add(label);
    output.collect(thetaNormalizerTuple, new DoubleWritable(weight));
}
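
Reading value.get() as the tf-idf weight of the label,feature pair, alphaI as the Laplace smoothing parameter, labelWeightSum.get(label) as the total feature weight of the label, and vocabCount as the vocabulary size (an interpretation inferred from the variable names, not stated on this page), the weight above is the smoothed log-likelihood:

\[
  w = \log \frac{\mathrm{tfidf}(\ell, f) + \alpha_I}{\Sigma_{\ell} + |V|}
\]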

From source file: org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesThetaNormalizerReducer.java

License: Apache License

@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    // Key is label,word, value is the number of times we've seen this label
    // word per local node. Output is the same

    // String token = key.toString();

    double weightSumPerLabel = 0.0;

    while (values.hasNext()) {
        reporter.setStatus("Bayes Theta Normalizer Reducer: " + key);
        weightSumPerLabel += values.next().get();
    }
    reporter.setStatus("Bayes Theta Normalizer Reducer: " + key + " => " + weightSumPerLabel);
    if (useHbase) {
        if (key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) {
            String label = key.stringAt(1);
            Put bu = new Put(Bytes.toBytes(BayesConstants.LABEL_THETA_NORMALIZER));
            bu.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes.toBytes(label),
                    Bytes.toBytes(weightSumPerLabel));
            table.put(bu);
        }
    }
    output.collect(key, new DoubleWritable(weightSumPerLabel));

}

From source file: org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerMapper.java

License: Apache License

/**
 * We need to calculate the idf of each feature in each label
 *
 * @param key
 *          The label,feature pair (can be either the frequency count or the term-document count)
 */
@Override
public void map(StringTuple key, final DoubleWritable value,
        final OutputCollector<StringTuple, DoubleWritable> output, final Reporter reporter) throws IOException {

    if (key.stringAt(0).equals(BayesConstants.FEATURE_SUM)) { // if it is from
        // the Sigma_j
        // folder
        labelWeightSum.forEachPair(new ObjectDoubleProcedure<String>() {

            @Override
            public boolean apply(String label, double sigmaJ) {
                double weight = Math.log((value.get() + alphaI) / (sigmaJSigmaK - sigmaJ + vocabCount));

                reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);
                StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
                normalizerTuple.add(label);
                try {
                    output.collect(normalizerTuple, new DoubleWritable(weight));
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                } // output Sigma_j
                return true;
            }
        });

    } else {
        String label = key.stringAt(1);

        double dIJ = value.get();
        double denominator = 0.5 * (sigmaJSigmaK / vocabCount + dIJ * this.labelWeightSum.size());
        double weight = Math.log(1.0 - dIJ / denominator);

        reporter.setStatus("Complementary Bayes Theta Normalizer Mapper: " + label + " => " + weight);

        StringTuple normalizerTuple = new StringTuple(BayesConstants.LABEL_THETA_NORMALIZER);
        normalizerTuple.add(label);

        // output -D_ij
        output.collect(normalizerTuple, new DoubleWritable(weight));

    }

}
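
The two branches above compute different contributions to the same normalizer tuple. Writing sigmaJSigmaK as \Sigma_{jk}, vocabCount as |V|, labelWeightSum.size() as the number of labels L, and the incoming value.get() as v (or d_{ij} in the per-document branch), a sketch of the math, inferred directly from the expressions in the code, is:

\[
  w_{\Sigma_j} = \log \frac{v + \alpha_I}{\Sigma_{jk} - \Sigma_j + |V|},
  \qquad
  w_{-D_{ij}} = \log\left(1 - \frac{d_{ij}}{\tfrac{1}{2}\left(\Sigma_{jk}/|V| + d_{ij}\,L\right)}\right)
\]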

From source file: org.apache.mahout.classifier.bayes.mapreduce.cbayes.CBayesThetaNormalizerReducer.java

License: Apache License

@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    // Key is label,word, value is the number of times we've seen this label
    // word per local node. Output is the same

    double weightSumPerLabel = 0.0;

    while (values.hasNext()) {
        reporter.setStatus("Complementary Bayes Theta Normalizer Reducer: " + key);
        weightSumPerLabel += values.next().get();
    }
    reporter.setStatus("Complementary Bayes Theta Normalizer Reducer: " + key + " => " + weightSumPerLabel);

    if (useHbase) {
        if (key.stringAt(0).equals(BayesConstants.LABEL_THETA_NORMALIZER)) {
            String label = key.stringAt(1);
            Put bu = new Put(Bytes.toBytes(BayesConstants.LABEL_THETA_NORMALIZER));
            bu.add(Bytes.toBytes(BayesConstants.HBASE_COLUMN_FAMILY), Bytes.toBytes(label),
                    Bytes.toBytes(weightSumPerLabel));
            table.put(bu);
        }
    }
    output.collect(key, new DoubleWritable(weightSumPerLabel));

}

From source file: org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureCombiner.java

License: Apache License

@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {
    // Key is label,word, value is the number of times we've seen this label
    // word per local node. Output is the same

    double sum = 0.0;
    while (values.hasNext()) {
        reporter.setStatus("Feature Combiner:" + key);
        sum += values.next().get();
    }
    reporter.setStatus("Bayes Feature Combiner: " + key + " => " + sum);
    output.collect(key, new DoubleWritable(sum));
}
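
Because this combiner only sums, Hadoop is free to run it zero or more times between the map and reduce phases without changing the result. A hedged sketch of the old-API job wiring (the driver class name and import paths are assumptions; this code is not shown on this page):

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureCombiner;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper;
import org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureReducer;

public class FeatureJobSketch {
    public static JobConf configure() {
        JobConf conf = new JobConf(FeatureJobSketch.class);
        conf.setMapperClass(BayesFeatureMapper.class);
        conf.setCombinerClass(BayesFeatureCombiner.class); // may run 0..n times per map output
        conf.setReducerClass(BayesFeatureReducer.class);
        conf.setOutputKeyClass(StringTuple.class);
        conf.setOutputValueClass(DoubleWritable.class);
        return conf;
    }
}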

From source file: org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper.java

License: Apache License

/**
 * We need to count the number of times we've seen a term with a given label, and we need to output that
 * count. But this Mapper does more than just output the count. First, it performs weight normalisation.
 * Second, for each unique word in a document it outputs the value 1, to be summed up as the term-document
 * frequency, which is later used to calculate the idf. Third, for each label it outputs the number of
 * times a document was seen (also used in the idf calculation).
 * 
 * @param key
 *          The label
 * @param value
 *          the features (all unique) associated w/ this label in stringtuple format
 * @param output
 *          The OutputCollector to write the results to
 * @param reporter
 *          Not used
 */
@Override
public void map(Text key, Text value, final OutputCollector<StringTuple, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // String line = value.toString();
    final String label = key.toString();
    String[] tokens = SPACE_PATTERN.split(value.toString());
    OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);

    if (gramSize > 1) {
        ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)),
                gramSize);
        // Note: incrementToken() is called at the bottom of this do-while, so
        // the first pass reads an empty term; the length check below skips it.
        do {
            String term = (sf.getAttribute(TermAttribute.class)).term();
            if (term.length() > 0) {
                if (wordList.containsKey(term)) {
                    wordList.put(term, 1 + wordList.get(term));
                } else {
                    wordList.put(term, 1);
                }
            }
        } while (sf.incrementToken());
    } else {
        for (String term : tokens) {
            if (wordList.containsKey(term)) {
                wordList.put(term, 1 + wordList.get(term));
            } else {
                wordList.put(term, 1);
            }
        }
    }
    final MutableDouble lengthNormalisationMut = new MutableDouble(0.0);
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String word, int dKJ) {
            lengthNormalisationMut.add(dKJ * dKJ);
            return true;
        }
    });

    final double lengthNormalisation = Math.sqrt(lengthNormalisationMut.doubleValue());

    // Output the length-normalized, TF-transformed frequency per word per class:
    // Log(1 + D_ij) / SQRT( SIGMA(k, D_kj^2) )
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String token, int dKJ) {
            try {
                StringTuple tuple = new StringTuple();
                tuple.add(BayesConstants.WEIGHT);
                tuple.add(label);
                tuple.add(token);
                DoubleWritable f = new DoubleWritable(Math.log(1.0 + dKJ) / lengthNormalisation);
                output.collect(tuple, f);
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            return true;
        }
    });
    reporter.setStatus("Bayes Feature Mapper: Document Label: " + label);

    // Output Document Frequency per Word per Class
    // Corpus Document Frequency (FEATURE_COUNT)
    // Corpus Term Frequency (FEATURE_TF)
    wordList.forEachPair(new ObjectIntProcedure<String>() {
        @Override
        public boolean apply(String token, int dKJ) {
            try {
                StringTuple dfTuple = new StringTuple();
                dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY);
                dfTuple.add(label);
                dfTuple.add(token);
                output.collect(dfTuple, ONE);

                StringTuple tokenCountTuple = new StringTuple();
                tokenCountTuple.add(BayesConstants.FEATURE_COUNT);
                tokenCountTuple.add(token);
                output.collect(tokenCountTuple, ONE);

                StringTuple tokenTfTuple = new StringTuple();
                tokenTfTuple.add(BayesConstants.FEATURE_TF);
                tokenTfTuple.add(token);
                output.collect(tokenTfTuple, new DoubleWritable(dKJ));
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            return true;
        }
    });

    // output that we have seen the label to calculate the Count of Document per
    // class
    StringTuple labelCountTuple = new StringTuple();
    labelCountTuple.add(BayesConstants.LABEL_COUNT);
    labelCountTuple.add(label);
    output.collect(labelCountTuple, ONE);
}
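
The weight comment inside map() corresponds to the value actually computed: with d_{k,j} the count of term k in document j, each emitted weight is the TF-transformed count divided by the L2 norm of the document's count vector:

\[
  w_{k,j} = \frac{\log(1 + d_{k,j})}{\sqrt{\sum_{k} d_{k,j}^{2}}}
\]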

From source file: org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureReducer.java

License: Apache License

@Override
public void reduce(StringTuple key, Iterator<DoubleWritable> values,
        OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException {

    // StringTuple key is either:
    // type, word        for type=FEATURE_COUNT, FEATURE_TF or WEIGHT tuples
    // type, label       for type=LABEL_COUNT_TUPLES
    // type, label, word for type=DOCUMENT_FREQUENCY tuples

    double sum = 0.0;
    while (values.hasNext()) {
        reporter.setStatus("Feature Reducer:" + key);
        sum += values.next().get();
    }
    reporter.setStatus("Bayes Feature Reducer: " + key + " => " + sum);

    if (key.length() < 2 || key.length() > 3) {
        throw new IllegalArgumentException("StringTuple length out of bounds, expected 2 <= length <= 3");
    }

    int featureIndex = key.length() == 2 ? 1 : 2;

    // FeatureLabelComparator guarantees that for a given label, we will
    // see FEATURE_TF items first, FEATURE_COUNT items second, 
    // DOCUMENT_FREQUENCY items next and finally WEIGHT items, while
    // the FeaturePartitioner guarantees that all tuples containing a given term
    // will be handled by the same reducer.
    if (key.stringAt(0).equals(BayesConstants.LABEL_COUNT)) {
        /* no-op, just collect */
    } else if (key.stringAt(0).equals(BayesConstants.FEATURE_TF)) {
        currentDfFeature = key.stringAt(1);
        currentCorpusTf = sum;
        currentCorpusDf = -1;

        if (0 < minSupport && currentCorpusTf < minSupport) {
            reporter.incrCounter("skipped", "less_than_minSupport", 1);
        }
        return; // never emit FEATURE_TF tuples.
    } else if (!key.stringAt(featureIndex).equals(currentDfFeature)) {
        throw new IllegalStateException("Found feature data " + key + " prior to feature tf");
    } else if (0 < minSupport && currentCorpusTf < minSupport) {
        reporter.incrCounter("skipped", "less_than_minSupport_label-term", 1);
        return; // skip items that have less than a specified frequency.
    } else if (key.stringAt(0).equals(BayesConstants.FEATURE_COUNT)) {
        currentCorpusDf = sum;

        if (0 < minDf && currentCorpusDf < minDf) {
            reporter.incrCounter("skipped", "less_than_minDf", 1);
            return; // skip items that have less than the specified minDf.
        }
    } else if (currentCorpusDf == -1) {
        throw new IllegalStateException("Found feature data " + key + " prior to feature count");
    } else if (0 < minDf && currentCorpusDf < minDf) {
        reporter.incrCounter("skipped", "less_than_minDf_label-term", 1);
        return; // skip items that have less than a specified frequency.
    }
    output.collect(key, new DoubleWritable(sum));
}
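
Finally, although every example on this page uses DoubleWritable as a MapReduce value, the class is a WritableComparable, so instances built with DoubleWritable(double) also order and compare by the wrapped value (a standalone sketch, unrelated to the Mahout code above):

import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableCompareDemo {
    public static void main(String[] args) {
        DoubleWritable a = new DoubleWritable(1.5);
        DoubleWritable b = new DoubleWritable(2.5);
        System.out.println(a.compareTo(b) < 0);                // true: ordered by the wrapped double
        System.out.println(a.equals(new DoubleWritable(1.5))); // true: value equality
    }
}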