List of usage examples for the org.apache.hadoop.io.DoubleWritable constructor
public DoubleWritable(double value)
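Before the examples from real projects, here is a minimal self-contained sketch (not taken from any of the sources below) showing the DoubleWritable(double) constructor and a Writable round trip; DoubleWritableDemo is an illustrative name, while the Hadoop calls themselves are standard API.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableDemo {
    public static void main(String[] args) throws Exception {
        DoubleWritable similarity = new DoubleWritable(0.9);

        // Serialize the value as Hadoop would between map and reduce.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        similarity.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and read the value back.
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.get()); // prints 0.9
    }
}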
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityTest.java
License:Apache License
/** tests {@link MostSimilarItemPairsMapper} */
public void testMostSimilarItemsPairsMapper() throws Exception {
    OpenIntLongHashMap indexItemIDMap = new OpenIntLongHashMap();
    indexItemIDMap.put(12, 12L);
    indexItemIDMap.put(34, 34L);
    indexItemIDMap.put(56, 56L);

    Mapper<IntWritable, VectorWritable, EntityEntityWritable, DoubleWritable>.Context context = EasyMock
            .createMock(Mapper.Context.class);
    context.write(new EntityEntityWritable(34L, 56L), new DoubleWritable(0.9));
    EasyMock.replay(context);

    Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
    vector.set(12, 0.2);
    vector.set(34, 1.0);
    vector.set(56, 0.9);

    MostSimilarItemPairsMapper mapper = new MostSimilarItemPairsMapper();
    setField(mapper, "indexItemIDMap", indexItemIDMap);
    setField(mapper, "maxSimilarItemsPerItem", 1);

    mapper.map(new IntWritable(34), new VectorWritable(vector), context);

    EasyMock.verify(context);
}
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityTest.java
License:Apache License
/** tests {@link MostSimilarItemPairsReducer} */
public void testMostSimilarItemPairsReducer() throws Exception {
    Reducer<EntityEntityWritable, DoubleWritable, EntityEntityWritable, DoubleWritable>.Context context = EasyMock
            .createMock(Reducer.Context.class);
    context.write(new EntityEntityWritable(123L, 456L), new DoubleWritable(0.5));
    EasyMock.replay(context);

    new MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L, 456L),
            Arrays.asList(new DoubleWritable(0.5), new DoubleWritable(0.5)), context);

    EasyMock.verify(context);
}
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.MostSimilarItemPairsMapper.java
License:Apache License
@Override
protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
        throws IOException, InterruptedException {

    int itemIDIndex = itemIDIndexWritable.get();

    Queue<SimilarItem> topMostSimilarItems = new PriorityQueue<SimilarItem>(maxSimilarItemsPerItem + 1,
            Collections.reverseOrder(SimilarItem.COMPARE_BY_SIMILARITY));

    Iterator<Vector.Element> similarityVectorIterator = similarityVector.get().iterateNonZero();
    while (similarityVectorIterator.hasNext()) {
        Vector.Element element = similarityVectorIterator.next();
        int index = element.index();
        double value = element.get();
        /* ignore self similarities */
        if (index != itemIDIndex) {
            if (topMostSimilarItems.size() < maxSimilarItemsPerItem) {
                topMostSimilarItems.add(new SimilarItem(indexItemIDMap.get(index), value));
            } else if (value > topMostSimilarItems.peek().getSimilarity()) {
                topMostSimilarItems.add(new SimilarItem(indexItemIDMap.get(index), value));
                topMostSimilarItems.poll();
            }
        }
    }

    if (!topMostSimilarItems.isEmpty()) {
        List<SimilarItem> mostSimilarItems = new ArrayList<SimilarItem>(topMostSimilarItems.size());
        mostSimilarItems.addAll(topMostSimilarItems);
        Collections.sort(mostSimilarItems, SimilarItem.COMPARE_BY_SIMILARITY);

        long itemID = indexItemIDMap.get(itemIDIndex);
        for (SimilarItem similarItem : mostSimilarItems) {
            long otherItemID = similarItem.getItemID();
            if (itemID < otherItemID) {
                ctx.write(new EntityEntityWritable(itemID, otherItemID),
                        new DoubleWritable(similarItem.getSimilarity()));
            } else {
                ctx.write(new EntityEntityWritable(otherItemID, itemID),
                        new DoubleWritable(similarItem.getSimilarity()));
            }
        }
    }
}
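The mapper above keeps only the maxSimilarItemsPerItem most similar items by bounding a PriorityQueue whose head is the current minimum: a candidate is added only when the queue is not yet full or when it beats the head, and the head is evicted to hold the size fixed. A standalone sketch of that pattern with plain doubles and illustrative values (TopNSketch is not a Mahout class):

import java.util.PriorityQueue;
import java.util.Queue;

public class TopNSketch {
    public static void main(String[] args) {
        int n = 3; // keep only the n largest similarities
        Queue<Double> topN = new PriorityQueue<Double>(); // natural order: smallest at head
        for (double similarity : new double[] { 0.2, 0.9, 0.5, 0.7, 0.1 }) {
            if (topN.size() < n) {
                topN.add(similarity);
            } else if (similarity > topN.peek()) { // beats the current minimum
                topN.add(similarity);
                topN.poll(); // evict the smallest so only the n largest remain
            }
        }
        System.out.println(topN); // contains 0.5, 0.7, 0.9 (heap order may vary)
    }
}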
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.SimilarityReducer.java
License:Apache License
@Override
public void reduce(ItemPairWritable pair, Iterator<CoRating> coRatings,
        OutputCollector<EntityEntityWritable, DoubleWritable> output, Reporter reporter) throws IOException {

    double similarity = distributedItemSimilarity.similarity(coRatings, pair.getItemAWeight(),
            pair.getItemBWeight(), numberOfUsers);

    if (!Double.isNaN(similarity)) {
        output.collect(pair.getItemItemWritable(), new DoubleWritable(similarity));
    }
}
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerMapper.java
License:Apache License
/**
 * We need to calculate the thetaNormalization factor of each label.
 *
 * @param key   the label,feature pair
 * @param value the tfIdf of the pair
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String labelFeaturePair = key.toString();
    double alpha_i = 1.0;

    String label = labelFeaturePair.split(",")[0];
    double weight = Math.log((value.get() + alpha_i) / (labelWeightSum.get(label) + vocabCount));

    output.collect(new Text(('_' + label).trim()), new DoubleWritable(weight));
}
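For a feel of the numbers, here is a tiny sketch of the same smoothed log-ratio the mapper computes, with entirely hypothetical inputs (ThetaNormalizerSketch and all values are made up for illustration):

public class ThetaNormalizerSketch {
    public static void main(String[] args) {
        double tfIdf = 0.35;           // hypothetical tf-idf for one label,feature pair
        double alphaI = 1.0;           // Laplace smoothing parameter, as in the mapper
        double labelWeightSum = 250.0; // hypothetical total weight of the label
        double vocabCount = 10000.0;   // hypothetical vocabulary size
        // Same formula the mapper above applies per pair.
        System.out.println(Math.log((tfIdf + alphaI) / (labelWeightSum + vocabCount))); // ≈ -8.93
    }
}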
From source file:org.apache.mahout.classifier.bayes.BayesThetaNormalizerReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word
    // pair per local node. Output is the same.
    double weightSumPerLabel = 0.0;
    while (values.hasNext()) {
        weightSumPerLabel += values.next().get();
    }
    output.collect(key, new DoubleWritable(weightSumPerLabel));
}
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureMapper.java
License:Apache License
/**
 * We need to count the number of times we've seen a term with a given label, and we need to output that.
 * But this Mapper does more than just output the count. First, it does weight normalisation. Second, for
 * each unique word in a document it outputs the value 1, to be summed up as the term document frequency,
 * which is later used to calculate the IDF. Third, for each label it outputs the number of times a
 * document was seen (also used in the IDF calculation).
 *
 * @param key      the label
 * @param value    the features (all unique) associated with this label
 * @param output   the OutputCollector to write the results to
 * @param reporter not used
 */
@Override
public void map(Text key, Text value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String label = key.toString();
    int keyLen = label.length();

    Map<String, int[]> wordList = new HashMap<String, int[]>(1000);

    StringBuilder builder = new StringBuilder(label);
    builder.ensureCapacity(32); // make sure we have a reasonably sized buffer to begin with

    List<String> ngrams = Model.generateNGramsWithoutLabel(value.toString(), gramSize);
    for (String ngram : ngrams) {
        int[] count = wordList.get(ngram);
        if (count == null) {
            count = new int[1];
            count[0] = 0;
            wordList.put(ngram, count);
        }
        count[0]++;
    }

    double lengthNormalisation = 0.0;
    for (int[] D_kj : wordList.values()) { // key is label,word
        double dkjValue = (double) D_kj[0];
        lengthNormalisation += dkjValue * dkjValue;
    }
    lengthNormalisation = Math.sqrt(lengthNormalisation);

    // Output the length-normalized, TF-transformed frequency per word per class:
    // log(1 + D_ij) / sqrt(SIGMA(k, D_kj^2))
    for (Map.Entry<String, int[]> entry : wordList.entrySet()) { // key is label,word
        String token = entry.getKey();
        builder.append(',').append(token);
        labelWord.set(builder.toString());
        DoubleWritable f = new DoubleWritable(Math.log(1.0 + entry.getValue()[0]) / lengthNormalisation);
        output.collect(labelWord, f);
        builder.setLength(keyLen); // truncate back
    }

    // Output the document frequency per word per class.
    String dflabel = '-' + label;
    int dfKeyLen = dflabel.length();
    builder = new StringBuilder(dflabel);
    for (String token : wordList.keySet()) { // key is label,word
        builder.append(',').append(token);
        labelWord.set(builder.toString());
        output.collect(labelWord, one);
        output.collect(new Text(',' + token), one);
        builder.setLength(dfKeyLen); // truncate back
    }

    // Output that we have seen the label, to calculate the count of documents per class.
    output.collect(new Text('_' + label), one);
}
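The normalisation above divides the damped count log(1 + D_ij) by the L2 norm of the document's raw counts. A small standalone sketch of just that arithmetic, with made-up counts (TfNormalisationSketch is illustrative, not Mahout code):

import java.util.HashMap;
import java.util.Map;

public class TfNormalisationSketch {
    public static void main(String[] args) {
        // Hypothetical raw term counts D_kj for one document.
        Map<String, Integer> counts = new HashMap<String, Integer>();
        counts.put("hadoop", 4);
        counts.put("mahout", 2);
        counts.put("bayes", 1);

        // L2 norm of the raw counts: sqrt(4^2 + 2^2 + 1^2) = sqrt(21)
        double norm = 0.0;
        for (int dkj : counts.values()) {
            norm += (double) dkj * dkj;
        }
        norm = Math.sqrt(norm);

        // Length-normalized, TF-transformed frequency per term: log(1 + D_ij) / norm
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            System.out.printf("%s -> %.4f%n", e.getKey(), Math.log(1.0 + e.getValue()) / norm);
        }
    }
}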
From source file:org.apache.mahout.classifier.bayes.common.BayesFeatureReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word
    // pair per local node. Output is the same.
    double sum = 0.0;
    while (values.hasNext()) {
        sum += values.next().get();
    }
    output.collect(key, new DoubleWritable(sum));
}
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfMapper.java
License:Apache License
/**
 * We need to calculate the Tf-Idf of each feature in each label.
 *
 * @param key the label,feature pair (can be either the frequency count or the term document count)
 */
@Override
public void map(Text key, DoubleWritable value, OutputCollector<Text, DoubleWritable> output, Reporter reporter)
        throws IOException {
    String labelFeaturePair = key.toString();

    char firstChar = labelFeaturePair.charAt(0);
    switch (firstChar) {
    case '-': { // if it is the termDocumentCount
        labelFeaturePair = labelFeaturePair.substring(1);
        // e.g. -17th_century_mathematicians_anderson__alexander,1582
        int idx = labelFeaturePair.indexOf(',');
        if (idx != -1) {
            String label = labelFeaturePair.substring(0, idx);
            Double labelDocumentCount = labelDocumentCounts.get(label);
            if (labelDocumentCount == null) {
                throw new IOException("Invalid label: " + label);
            }
            double logIdf = Math.log(labelDocumentCount / value.get());
            output.collect(new Text(labelFeaturePair), new DoubleWritable(logIdf));
        } else {
            throw new IOException("Invalid label,feature pair: " + labelFeaturePair);
        }
        break;
    }
    case ',': {
        output.collect(new Text("*vocabCount"), new DoubleWritable(1.0));
        break;
    }
    default: {
        output.collect(key, value);
        break;
    }
    }
}
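The termDocumentCount branch above emits log(labelDocumentCount / termDocumentCount) as the IDF. A one-method sketch with hypothetical counts (IdfSketch is illustrative only):

public class IdfSketch {
    public static void main(String[] args) {
        double labelDocumentCount = 1582.0; // hypothetical documents seen for a label
        double termDocumentCount = 12.0;    // hypothetical documents containing the term
        // Rare terms get a large IDF; a term in every document gets log(1) = 0.
        System.out.println(Math.log(labelDocumentCount / termDocumentCount)); // ≈ 4.88
    }
}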
From source file:org.apache.mahout.classifier.bayes.common.BayesTfIdfReducer.java
License:Apache License
@Override
public void reduce(Text key, Iterator<DoubleWritable> values, OutputCollector<Text, DoubleWritable> output,
        Reporter reporter) throws IOException {
    // Key is label,word; value is the number of times we've seen this label,word
    // pair per local node. Output is the same.
    String token = key.toString();
    if (token.startsWith("*vocabCount")) {
        double vocabCount = 0.0;
        while (values.hasNext()) {
            vocabCount += values.next().get();
        }
        log.info("{}\t{}", token, vocabCount);
        output.collect(key, new DoubleWritable(vocabCount));
    } else {
        // Multiply the values for this key together; exactly two values
        // (the normalized TF and the IDF) are expected per label,word pair.
        double idfTimes_D_ij = 1.0;
        while (values.hasNext()) {
            idfTimes_D_ij *= values.next().get();
        }
        output.collect(key, new DoubleWritable(idfTimes_D_ij));
    }
}