Example usage for org.apache.mahout.common.distance CosineDistanceMeasure distance

List of usage examples for org.apache.mahout.common.distance CosineDistanceMeasure distance

Introduction

On this page you can find example usages of the distance method of org.apache.mahout.common.distance.CosineDistanceMeasure.

Prototype

@Override
    public double distance(Vector v1, Vector v2) 

Source Link

Usage

From source file:com.gsvic.csmr.CosineSimilarityReducer.java

License:Apache License

/**
 * Writes one {@link DoubleWritable} per incoming vector pair, under the unchanged key.
 *
 * <p>For every {@code VectorArrayWritable} in {@code value}, elements 0 and 1 are cast to
 * {@code VectorWritable} and passed to {@link CosineDistanceMeasure#distance}; the result is
 * emitted through {@code context}.
 *
 * <p>NOTE(review): the emitted number is whatever {@code CosineDistanceMeasure.distance}
 * returns, which Mahout documents as a distance rather than a similarity -- confirm that
 * downstream consumers expect a distance.
 *
 * @param key     key shared by all pairs in this group; written back unchanged
 * @param value   vector pairs; each is assumed to hold at least two {@code VectorWritable}s
 * @param context output collector
 * @throws IOException          propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
@Override
public void reduce(Text key, Iterable<VectorArrayWritable> value, Context context)
        throws IOException, InterruptedException {
    // One measure instance is enough for the whole group.
    final CosineDistanceMeasure measure = new CosineDistanceMeasure();

    for (VectorArrayWritable pair : value) {
        final VectorWritable first = (VectorWritable) pair.get()[0];
        final VectorWritable second = (VectorWritable) pair.get()[1];
        final double distance = measure.distance(first.get(), second.get());
        context.write(key, new DoubleWritable(distance));
    }
}

From source file:crawler.CETReCrawler.java

License:Apache License

/**
 * Expands the base keywords with related hashtags taken from the keys of {@code TFFreq}.
 *
 * <p>Starting from {@code Settings.baseKeywords}, every accepted keyword is used in turn as a
 * seed. Each key of {@code TFFreq} is compared with the seed through
 * {@link CosineDistanceMeasure#distance}; a hashtag is accepted (appended to the result and
 * later used as a seed itself) when it is not already in the result and its distance is
 * greater than 0.00001 and below the applicable threshold: 0.8 if the seed or the hashtag
 * matches a base keyword, 0.5 otherwise. Pairs where either vector returned by
 * {@code getSeedVect} is {@code null} are skipped. Every accepted hashtag is also printed to
 * standard output.
 *
 * @return the base keywords followed by the accepted hashtags, in order of acceptance
 */
private String[] distFilter() {
    // Hand all potential keywords to the search helper before asking it for vectors.
    DataSearch keywordsFilter = new DataSearch(TFFreq.keySet().toArray(new String[TFFreq.size()]));
    keywordsFilter.collectNewTweets();
    keywordsFilter.calculateTFIDF();

    final double baseThreshold = 0.8;   // applied when a base keyword is involved
    final double otherThreshold = 0.5;  // applied to all other pairs
    final double minDistance = 0.00001; // distances at or below this are rejected

    // Parallel lists: keywordVectors.get(i) is the vector looked up for keywords.get(i).
    ArrayList<String> keywords = new ArrayList<String>();
    ArrayList<Vector> keywordVectors = new ArrayList<Vector>();
    for (String basehash : Settings.baseKeywords) {
        keywords.add(basehash);
        keywordVectors.add(keywordsFilter.getSeedVect(basehash));
    }

    CosineDistanceMeasure cos = new CosineDistanceMeasure();

    // The lists grow while we walk them, so newly accepted hashtags become seeds later on.
    // A cursor replaces the original remove(0)-based queue; the visiting order is the same.
    for (int cursor = 0; cursor < keywords.size(); cursor++) {
        String seedName = keywords.get(cursor);
        Vector seed = keywordVectors.get(cursor);
        // Only needed when the seed has a vector; it does not depend on the hashtag.
        boolean seedInBase = seed != null && matchesBaseKeyword(seedName);

        for (String hashtag : TFFreq.keySet()) {
            Vector candidate = keywordsFilter.getSeedVect(hashtag);
            if (seed == null || candidate == null) {
                continue;
            }

            double distVal = cos.distance(seed, candidate);
            boolean inBase = seedInBase || matchesBaseKeyword(hashtag);
            double threshold = inBase ? baseThreshold : otherThreshold;

            if (distVal < threshold && distVal > minDistance && !keywords.contains(hashtag)) {
                keywords.add(hashtag);
                keywordVectors.add(candidate);
                System.out.println((inBase ? "BASEL" : "OTHER") + "***********[" + hashtag
                        + "] v.s [" + seedName + "]: " + distVal + "***********");
            }
        }
    }

    return keywords.toArray(new String[keywords.size()]);
}

/**
 * Tells whether {@code word} matches one of {@code Settings.baseKeywords}: either
 * {@code word} with every '#' removed equals the lower-cased base keyword, or {@code word}
 * equals the base keyword exactly.
 *
 * <p>NOTE(review): {@code toLowerCase()} uses the default locale -- confirm that is intended.
 */
private boolean matchesBaseKeyword(String word) {
    String stripped = word.replace("#", "");
    for (String basehash : Settings.baseKeywords) {
        if (stripped.equals(basehash.toLowerCase()) || word.equals(basehash)) {
            return true;
        }
    }
    return false;
}