List of usage examples for the org.apache.hadoop.io.DoubleWritable constructor
public DoubleWritable(double value)
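Before the examples from real projects, here is a minimal self-contained sketch (not taken from any of the sources below) showing the DoubleWritable(double) constructor and a Writable round trip; DoubleWritableDemo is an illustrative name, while the Hadoop calls themselves are standard API.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableDemo {
    public static void main(String[] args) throws Exception {
        DoubleWritable similarity = new DoubleWritable(0.9);

        // Serialize the value as Hadoop would between map and reduce.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        similarity.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and read the value back.
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.get()); // prints 0.9
    }
}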
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityTest.java
License:Apache License
/** tests {@link MostSimilarItemPairsMapper} */
public void testMostSimilarItemsPairsMapper() throws Exception {
    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    indexItemIDMap.put(12, 12L);
    indexItemIDMap.put(34, 34L);
    indexItemIDMap.put(56, 56L);

    Mapper<IntWritable, VectorWritable, EntityEntityWritable, DoubleWritable>.Context context = EasyMock
            .createMock(Mapper.Context.class);
    context.write(new EntityEntityWritable(34L, 56L), new DoubleWritable(0.9));
    EasyMock.replay(context);

    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
    vector.set(12, 0.2);
    vector.set(34, 1.0);
    vector.set(56, 0.9);

    MostSimilarItemPairsMapper mapper = new MostSimilarItemPairsMapper();
    setField(mapper, "indexItemIDMap", indexItemIDMap);
    setField(mapper, "maxSimilarItemsPerItem", 1);

    mapper.map(new IntWritable(34), new VectorWritable(vector), context);

    EasyMock.verify(context);
}
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityTest.java
License:Apache License
/** tests {@link MostSimilarItemPairsReducer} */
public void testMostSimilarItemPairsReducer() throws Exception {
    Reducer<EntityEntityWritable, DoubleWritable, EntityEntityWritable, DoubleWritable>.Context context = EasyMock
            .createMock(Reducer.Context.class);
    context.write(new EntityEntityWritable(123L, 456L), new DoubleWritable(0.5));
    EasyMock.replay(context);

    new MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L, 456L),
            Arrays.asList(new DoubleWritable(0.5), new DoubleWritable(0.5)), context);

    EasyMock.verify(context);
}
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.MostSimilarItemPairsMapper.java
License:Apache License
@Override
protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
        throws IOException, InterruptedException {

    int itemIDIndex = itemIDIndexWritable.get();

    Queue<SimilarItem> topMostSimilarItems = new PriorityQueue<SimilarItem>(maxSimilarItemsPerItem + 1,
            Collections.reverseOrder(SimilarItem.COMPARE_BY_SIMILARITY));

    Iterator<Vector.Element> similarityVectorIterator = similarityVector.get().iterateNonZero();
    while (similarityVectorIterator.hasNext()) {
        Vector.Element element = similarityVectorIterator.next();
        int index = element.index();
        double value = element.get();
        /* ignore self similarities */
        if (index != itemIDIndex) {
            if (topMostSimilarItems.size() < maxSimilarItemsPerItem) {
                topMostSimilarItems.add(new SimilarItem(indexItemIDMap.get(index), value));
            } else if (value > topMostSimilarItems.peek().getSimilarity()) {
                topMostSimilarItems.add(new SimilarItem(indexItemIDMap.get(index), value));
                topMostSimilarItems.poll();
            }
        }
    }

    if (!topMostSimilarItems.isEmpty()) {
        List<SimilarItem> mostSimilarItems = new ArrayList<SimilarItem>(topMostSimilarItems.size());
        mostSimilarItems.addAll(topMostSimilarItems);
        Collections.sort(mostSimilarItems, SimilarItem.COMPARE_BY_SIMILARITY);

        long itemID = indexItemIDMap.get(itemIDIndex);
        for (SimilarItem similarItem : mostSimilarItems) {
            long otherItemID = similarItem.getItemID();
            if (itemID < otherItemID) {
                ctx.write(new EntityEntityWritable(itemID, otherItemID),
                        new DoubleWritable(similarItem.getSimilarity()));
            } else {
                ctx.write(new EntityEntityWritable(otherItemID, itemID),
                        new DoubleWritable(similarItem.getSimilarity()));
            }
        }
    }
}
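The mapper above keeps only the maxSimilarItemsPerItem most similar items by bounding a PriorityQueue whose head is the current minimum: a candidate is added only when the queue is not yet full or when it beats the head, and the head is evicted to hold the size fixed. A standalone sketch of that pattern with plain doubles and illustrative values (TopNSketch is not a Mahout class):

import java.util.PriorityQueue;
import java.util.Queue;

public class TopNSketch {
    public static void main(String[] args) {
        int n = 3; // keep only the n largest similarities
        Queue<Double> topN = new PriorityQueue<Double>(); // natural order: smallest at head
        for (double similarity : new double[] { 0.2, 0.9, 0.5, 0.7, 0.1 }) {
            if (topN.size() < n) {
                topN.add(similarity);
            } else if (similarity > topN.peek()) { // beats the current minimum
                topN.add(similarity);
                topN.poll(); // evict the smallest so only the n largest remain
            }
        }
        System.out.println(topN); // contains 0.5, 0.7, 0.9 (heap order may vary)
    }
}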
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.SimilarityReducer.java
License:Apache License
@Override
public void reduce(ItemPairWritable pair, Iterator<CoRating> coRatings,
        OutputCollector<EntityEntityWritable, DoubleWritable> output, Reporter reporter) throws IOException {

    double similarity = distributedItemSimilarity.similarity(coRatings, pair.getItemAWeight(),
            pair.getItemBWeight(), numberOfUsers);

    if (!Double.isNaN(similarity)) {
        output.collect(pair.getItemItemWritable(), new DoubleWritable(similarity));
    }
}
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerMapper.java
License:Apache License
/**
 * We need to calculate the thetaNormalization factor of each label.
 *
 * @param key   the label,feature pair
 * @param value the tfIdf of the pair
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String labelFeaturePair = key.toString();
    double alpha_i = 1.0;

    String label = labelFeaturePair.split(",")[0];
    double weight = Math.log((value.get() + alpha_i) / (labelWeightSum.get(label) + vocabCount));

    output.collect(new Text(('_' + label).trim()), new DoubleWritable(weight));
}
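For a feel of the numbers, here is a tiny sketch of the same smoothed log-ratio the mapper computes, with entirely hypothetical inputs (ThetaNormalizerSketch and all values are made up for illustration):

public class ThetaNormalizerSketch {
    public static void main(String[] args) {
        double tfIdf = 0.35;           // hypothetical tf-idf for one label,feature pair
        double alphaI = 1.0;           // Laplace smoothing parameter, as in the mapper
        double labelWeightSum = 250.0; // hypothetical total weight of the label
        double vocabCount = 10000.0;   // hypothetical vocabulary size
        // Same formula the mapper above applies per pair.
        System.out.println(Math.log((tfIdf + alphaI) / (labelWeightSum + vocabCount))); // ≈ -8.93
    }
}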
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word
    // pair per local node. Output is the same.
    double weightSumPerLabel = 0.0;
    while (values.hasNext()) {
        weightSumPerLabel += values.next().get();
    }
    output.collect(key, new DoubleWritable(weightSumPerLabel));
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureMapper.java
License:Apache License
/**
 * We need to count the number of times we've seen a term with a given label, and we need to output that.
 * But this Mapper does more than just output the count. First, it does weight normalisation. Second, for
 * each unique word in a document it outputs the value 1, to be summed up as the term document frequency,
 * which is later used to calculate the IDF. Third, for each label it outputs the number of times a
 * document was seen (also used in the IDF calculation).
 *
 * @param key      the label
 * @param value    the features (all unique) associated with this label
 * @param output   the OutputCollector to write the results to
 * @param reporter not used
 */
@Override
public void map(Text key, Text value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String label = key.toString();
    int keyLen = label.length();

    Map<String, int[]> wordList = new HashMap<String, int[]>(1000);

    StringBuilder builder = new StringBuilder(label);
    builder.ensureCapacity(32); // make sure we have a reasonably sized buffer to begin with

    List<String> ngrams = Model.generateNGramsWithoutLabel(value.toString(), gramSize);
    for (String ngram : ngrams) {
        int[] count = wordList.get(ngram);
        if (count == null) {
            count = new int[1];
            count[0] = 0;
            wordList.put(ngram, count);
        }
        count[0]++;
    }

    double lengthNormalisation = 0.0;
    for (int[] D_kj : wordList.values()) { // key is label,word
        double dkjValue = (double) D_kj[0];
        lengthNormalisation += dkjValue * dkjValue;
    }
    lengthNormalisation = Math.sqrt(lengthNormalisation);

    // Output the length-normalized, TF-transformed frequency per word per class:
    // log(1 + D_ij) / sqrt(SIGMA(k, D_kj^2))
    for (Map.Entry<String, int[]> entry : wordList.entrySet()) { // key is label,word
        String token = entry.getKey();
        builder.append(',').append(token);
        labelWord.set(builder.toString());
        DoubleWritable f = new DoubleWritable(Math.log(1.0 + entry.getValue()[0]) / lengthNormalisation);
        output.collect(labelWord, f);
        builder.setLength(keyLen); // truncate back
    }

    // Output the document frequency per word per class.
    String dflabel = '-' + label;
    int dfKeyLen = dflabel.length();
    builder = new StringBuilder(dflabel);
    for (String token : wordList.keySet()) { // key is label,word
        builder.append(',').append(token);
        labelWord.set(builder.toString());
        output.collect(labelWord, one);
        output.collect(new Text(',' + token), one);
        builder.setLength(dfKeyLen); // truncate back
    }

    // Output that we have seen the label, to calculate the count of documents per class.
    output.collect(new Text('_' + label), one);
}
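The normalisation above divides the damped count log(1 + D_ij) by the L2 norm of the document's raw counts. A small standalone sketch of just that arithmetic, with made-up counts (TfNormalisationSketch is illustrative, not Mahout code):

import java.util.HashMap;
import java.util.Map;

public class TfNormalisationSketch {
    public static void main(String[] args) {
        // Hypothetical raw term counts D_kj for one document.
        Map<String, Integer> counts = new HashMap<String, Integer>();
        counts.put("hadoop", 4);
        counts.put("mahout", 2);
        counts.put("bayes", 1);

        // L2 norm of the raw counts: sqrt(4^2 + 2^2 + 1^2) = sqrt(21)
        double norm = 0.0;
        for (int dkj : counts.values()) {
            norm += (double) dkj * dkj;
        }
        norm = Math.sqrt(norm);

        // Length-normalized, TF-transformed frequency per term: log(1 + D_ij) / norm
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            System.out.printf("%s -> %.4f%n", e.getKey(), Math.log(1.0 + e.getValue()) / norm);
        }
    }
}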
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word
    // pair per local node. Output is the same.
    double sum = 0.0;
    while (values.hasNext()) {
        sum += values.next().get();
    }
    output.collect(key, new DoubleWritable(sum));
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfMapper.java
License:Apache License
/**
 * We need to calculate the Tf-Idf of each feature in each label.
 *
 * @param key the label,feature pair (can be either the frequency count or the term document count)
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String labelFeaturePair = key.toString();

    char firstChar = labelFeaturePair.charAt(0);
    switch (firstChar) {
    case '-': { // if it is the termDocumentCount
        labelFeaturePair = labelFeaturePair.substring(1);
        // e.g. -17th_century_mathematicians_anderson__alexander,1582
        int idx = labelFeaturePair.indexOf(',');
        if (idx != -1) {
            String label = labelFeaturePair.substring(0, idx);
            Double labelDocumentCount = labelDocumentCounts.get(label);
            if (labelDocumentCount == null) {
                throw new IOException("Invalid label: " + label);
            }
            double logIdf = Math.log(labelDocumentCount / value.get());
            output.collect(new Text(labelFeaturePair), new DoubleWritable(logIdf));
        } else {
            throw new IOException("Invalid label,feature pair: " + labelFeaturePair);
        }
        break;
    }
    case ',': {
        output.collect(new Text("*vocabCount"), new DoubleWritable(1.0));
        break;
    }
    default: {
        output.collect(key, value);
        break;
    }
    }
}
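The termDocumentCount branch above emits log(labelDocumentCount / termDocumentCount) as the IDF. A one-method sketch with hypothetical counts (IdfSketch is illustrative only):

public class IdfSketch {
    public static void main(String[] args) {
        double labelDocumentCount = 1582.0; // hypothetical documents seen for a label
        double termDocumentCount = 12.0;    // hypothetical documents containing the term
        // Rare terms get a large IDF; a term in every document gets log(1) = 0.
        System.out.println(Math.log(labelDocumentCount / termDocumentCount)); // ≈ 4.88
    }
}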
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word
    // pair per local node. Output is the same.
    String token = key.toString();
    if (token.startsWith("*vocabCount")) {
        double vocabCount = 0.0;
        while (values.hasNext()) {
            vocabCount += values.next().get();
        }
        log.info("{}\t{}", token, vocabCount);
        output.collect(key, new DoubleWritable(vocabCount));
    } else {
        // Multiply the values for this key together; exactly two values
        // (the normalized TF and the IDF) are expected per label,word pair.
        double idfTimes_D_ij = 1.0;
        while (values.hasNext()) {
            idfTimes_D_ij *= values.next().get();
        }
        output.collect(key, new DoubleWritable(idfTimes_D_ij));
    }
}