Example usage for org.apache.hadoop.io DoubleWritable DoubleWritable

Introduction

On this page you can find example usage for the org.apache.hadoop.io DoubleWritable constructor DoubleWritable(double value).

Prototype

public DoubleWritable(double value) 
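
Before the real-world usages below, here is a minimal, self-contained sketch of the constructor in action (the class name DoubleWritableDemo is purely illustrative, not from any of the examples below). It shows the typical Writable round trip: construct with a value, serialize with write(), and read it back with readFields().

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import org.apache.hadoop.io.DoubleWritable;

// Hypothetical demo class, for illustration only.
public class DoubleWritableDemo {
    public static void main(String[] args) throws Exception {
        // Construct with an initial value; get() reads it back, set() replaces it.
        DoubleWritable similarity = new DoubleWritable(0.9);

        // Serialize the value the same way Hadoop does between map and reduce.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        similarity.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and verify the round trip.
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.get()); // prints 0.9
    }
}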

Usage

From source file: org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityTest.java

License: Apache License

/**
 * tests {@link MostSimilarItemPairsMapper}
 */
public void testMostSimilarItemsPairsMapper() throws Exception {

    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    indexItemIDMap.put(12, 12L);
    indexItemIDMap.put(34, 34L);
    indexItemIDMap.put(56, 56L);

    Mapper<IntWritable, VectorWritable, EntityEntityWritable, DoubleWritable>.Context context = EasyMock
            .createMock(Mapper.Context.class);

    context.write(new EntityEntityWritable(34L, 56L), new DoubleWritable(0.9));

    EasyMock.replay(context);

    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
    vector.set(12, 0.2);
    vector.set(34, 1.0);
    vector.set(56, 0.9);

    MostSimilarItemPairsMapper mapper = new MostSimilarItemPairsMapper();
    setField(mapper, "indexItemIDMap", indexItemIDMap);
    setField(mapper, "maxSimilarItemsPerItem", 1);

    mapper.map(new IntWritable(34), new VectorWritable(vector), context);

    EasyMock.verify(context);
}

From source file: org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityTest.java

License: Apache License

/**
 * tests {@link MostSimilarItemPairsReducer}
 */
public void testMostSimilarItemPairsReducer() throws Exception {
    Reducer<EntityEntityWritable, DoubleWritable, EntityEntityWritable, DoubleWritable>.Context context = EasyMock
            .createMock(Reducer.Context.class);

    context.write(new EntityEntityWritable(123L, 456L), new DoubleWritable(0.5));

    EasyMock.replay(context);

    new MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L, 456L),
            Arrays.asList(new DoubleWritable(0.5), new DoubleWritable(0.5)), context);

    EasyMock.verify(context);
}

From source file: org.apache.mahout.cf.taste.hadoop.similarity.item.MostSimilarItemPairsMapper.java

License: Apache License

@Override
protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
        throws IOException, InterruptedException {

    int itemIDIndex = itemIDIndexWritable.get();

    Queue<SimilarItem> topMostSimilarItems = new PriorityQueue<SimilarItem>(maxSimilarItemsPerItem + 1,
            Collections.reverseOrder(SimilarItem.COMPARE_BY_SIMILARITY));

    Iterator<Vector.Element> similarityVectorIterator = similarityVector.get().iterateNonZero();

    while (similarityVectorIterator.hasNext()) {
        Vector.Element element = similarityVectorIterator.next();
        int index = element.index();
        double value = element.get();
        /* ignore self similarities */
        if (index != itemIDIndex) {
            if (topMostSimilarItems.size() < maxSimilarItemsPerItem) {
                topMostSimilarItems.add(new SimilarItem(indexItemIDMap.get(index), value));
            } else if (value > topMostSimilarItems.peek().getSimilarity()) {
                topMostSimilarItems.add(new SimilarItem(indexItemIDMap.get(index), value));
                topMostSimilarItems.poll();
            }
        }
    }

    if (!topMostSimilarItems.isEmpty()) {
        List<SimilarItem> mostSimilarItems = new ArrayList<SimilarItem>(topMostSimilarItems.size());
        mostSimilarItems.addAll(topMostSimilarItems);
        Collections.sort(mostSimilarItems, SimilarItem.COMPARE_BY_SIMILARITY);

        long itemID = indexItemIDMap.get(itemIDIndex);
        for (SimilarItem similarItem : mostSimilarItems) {
            long otherItemID = similarItem.getItemID();
            if (itemID < otherItemID) {
                ctx.write(new EntityEntityWritable(itemID, otherItemID),
                        new DoubleWritable(similarItem.getSimilarity()));
            } else {
                ctx.write(new EntityEntityWritable(otherItemID, itemID),
                        new DoubleWritable(similarItem.getSimilarity()));
            }
        }
    }
}

From source file: org.apache.mahout.cf.taste.hadoop.similarity.item.SimilarityReducer.java

License: Apache License

@Override
public void reduce(ItemPairWritable pair, Iterator<CoRating> coRatings,
        OutputCollector<EntityEntityWritable, DoubleWritable> output, Reporter reporter) throws IOException {

    double similarity = distributedItemSimilarity.similarity(coRatings, pair.getItemAWeight(),
            pair.getItemBWeight(), numberOfUsers);

    if (!Double.isNaN(similarity)) {
        output.collect(pair.getItemItemWritable(), new DoubleWritable(similarity));
    }
}

From source file: org.apache.mahout.classifier.bayes.BayesThetaNormalizerMapper.java

License: Apache License

/**
 * We need to calculate the theta normalization factor of each label.
 *
 * @param key   The label,feature pair
 * @param value The tfIdf of the pair
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {

    String labelFeaturePair = key.toString();
    double alpha_i = 1.0;

    String label = labelFeaturePair.split(",")[0];
    double weight = Math.log((value.get() + alpha_i) / (labelWeightSum.get(label) + vocabCount));
    output.collect(new Text(('_' + label).trim()), new DoubleWritable(weight));
}

From source file: org.apache.mahout.classifier.bayes.BayesThetaNormalizerReducer.java

License: Apache License

@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this
    // label,word pair per local node. Output is the same.

    double weightSumPerLabel = 0.0;

    while (values.hasNext()) {
        weightSumPerLabel += values.next().get();
    }
    output.collect(key, new DoubleWritable(weightSumPerLabel));
}

From source file: org.apache.mahout.classifier.bayes.common.BayesFeatureMapper.java

License: Apache License

/**
 * We need to count the number of times we've seen a term with a given label, and we need to output that count.
 * But this Mapper does more than just output the count. First, it performs weight normalisation. Second, for
 * each unique word in a document it outputs the value 1, which is summed up as the term document frequency and
 * later used to calculate the IDF. Third, for each label it outputs the number of times a document was seen
 * (also used in the IDF calculation).
 *
 * @param key      The label
 * @param value    the features (all unique) associated w/ this label
 * @param output   The OutputCollector to write the results to
 * @param reporter Not used
 */
@Override
public void map(Text key, Text value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String label = key.toString();
    int keyLen = label.length();

    Map<String, int[]> wordList = new HashMap<String, int[]>(1000);

    StringBuilder builder = new StringBuilder(label);
    builder.ensureCapacity(32); // make sure we begin with a reasonably sized buffer
    List<String> ngrams = Model.generateNGramsWithoutLabel(value.toString(), gramSize);
    for (String ngram : ngrams) {
        int[] count = wordList.get(ngram);
        if (count == null) {
            count = new int[1];
            count[0] = 0;
            wordList.put(ngram, count);
        }
        count[0]++;
    }
    double lengthNormalisation = 0.0;
    for (int[] D_kj : wordList.values()) {
        // key is label,word
        double dkjValue = (double) D_kj[0];
        lengthNormalisation += dkjValue * dkjValue;
    }
    lengthNormalisation = Math.sqrt(lengthNormalisation);

    // Output length-normalized, TF-transformed frequency per word per class:
    // log(1 + D_ij) / sqrt(SIGMA_k D_kj^2)
    for (Map.Entry<String, int[]> entry : wordList.entrySet()) {
        // key is label,word
        String token = entry.getKey();
        builder.append(',').append(token);
        labelWord.set(builder.toString());
        DoubleWritable f = new DoubleWritable(Math.log(1.0 + entry.getValue()[0]) / lengthNormalisation);
        output.collect(labelWord, f);
        builder.setLength(keyLen); // truncate back
    }

    // Output Document Frequency per Word per Class
    String dflabel = '-' + label;
    int dfKeyLen = dflabel.length();
    builder = new StringBuilder(dflabel);
    for (String token : wordList.keySet()) {
        // key is label,word
        builder.append(',').append(token);
        labelWord.set(builder.toString());
        output.collect(labelWord, one);
        output.collect(new Text(',' + token), one);
        builder.setLength(dfKeyLen); // truncate back
    }

    // output that we have seen the label to calculate the Count of Document per
    // class
    output.collect(new Text('_' + label), one);
}

From source file: org.apache.mahout.classifier.bayes.common.BayesFeatureReducer.java

License: Apache License

@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word pair per local node. Output is the same.

    double sum = 0.0;
    while (values.hasNext()) {
        sum += values.next().get();
    }
    output.collect(key, new DoubleWritable(sum));
}

From source file: org.apache.mahout.classifier.bayes.common.BayesTfIdfMapper.java

License: Apache License

/**
 * We need to calculate the Tf-Idf of each feature in each label.
 *
 * @param key The label,feature pair (the value can be either the frequency count or the term document count)
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {

    String labelFeaturePair = key.toString();

    char firstChar = labelFeaturePair.charAt(0);
    switch (firstChar) {
    case '-': {// if it is the termDocumentCount
        labelFeaturePair = labelFeaturePair.substring(1);
        //-17th_century_mathematicians_anderson__alexander,1582
        int idx = labelFeaturePair.indexOf(",");
        if (idx != -1) {
            String label = labelFeaturePair.substring(0, idx);

            Double labelDocumentCount = labelDocumentCounts.get(label);
            if (labelDocumentCount == null) {
                throw new IOException("Invalid label: " + label);
            }
            double logIdf = Math.log(labelDocumentCount / value.get());
            output.collect(new Text(labelFeaturePair), new DoubleWritable(logIdf));
        } else {
            throw new IOException("Invalid ");
        }
        break;
    }
    case ',': {
        output.collect(new Text("*vocabCount"), new DoubleWritable(1.0));
        break;
    }
    default: {
        output.collect(key, value);
        break;
    }
    }
}

From source file: org.apache.mahout.classifier.bayes.common.BayesTfIdfReducer.java

License: Apache License

@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word pair per local node. Output is the same.
    String token = key.toString();
    if (token.startsWith("*vocabCount")) {
        double vocabCount = 0.0;
        while (values.hasNext()) {
            vocabCount += values.next().get();
        }
        log.info("{}\t{}", token, vocabCount);
        output.collect(key, new DoubleWritable(vocabCount));
    } else {
        double idfTimes_D_ij = 1.0;
        while (values.hasNext()) {
            idfTimes_D_ij *= values.next().get();
        }

        output.collect(key, new DoubleWritable(idfTimes_D_ij));
    }
}