Usage examples for the distance method of org.apache.mahout.common.distance.CosineDistanceMeasure
@Override
public double distance(Vector v1, Vector v2)
From source file:com.gsvic.csmr.CosineSimilarityReducer.java
License:Apache License
@Override public void reduce(Text key, Iterable<VectorArrayWritable> value, Context context) throws IOException, InterruptedException { CosineDistanceMeasure cdm = new CosineDistanceMeasure(); VectorWritable docX, docY;//from ww w . ja v a 2 s. c om double cosine; for (VectorArrayWritable v : value) { docX = (VectorWritable) v.get()[0]; docY = (VectorWritable) v.get()[1]; cosine = cdm.distance(docX.get(), docY.get()); context.write(key, new DoubleWritable(cosine)); } }
From source file:crawler.CETReCrawler.java
License:Apache License
private String[] distFilter() { DataSearch keywordsFilter = new DataSearch(TFFreq.keySet().toArray(new String[TFFreq.size()])); //pass potential keywords keywordsFilter.collectNewTweets();// w ww. j ava2 s. c o m keywordsFilter.calculateTFIDF(); ArrayList<String> keywordsFreqName = new ArrayList<String>();// ArrayList<Vector> keywordsFreq = new ArrayList<Vector>(); //list of keywords vector ArrayList<String> keywords = new ArrayList<String>(); //list of keywords for (String basehash : Settings.baseKeywords) { keywordsFreqName.add(basehash);// keywordsFreq.add(keywordsFilter.getSeedVect(basehash)); keywords.add(basehash); } int count = 0; while (!keywordsFreq.isEmpty()) { String seq1Name = keywordsFreqName.get(0);// Vector seq1 = keywordsFreq.get(0); Vector seq2; Iterator<Entry<String, Integer>> iter = TFFreq.entrySet().iterator(); while (iter.hasNext()) { Entry<String, Integer> ent = iter.next(); String hashtag = ent.getKey(); seq2 = keywordsFilter.getSeedVect(hashtag); //System.out.print("seq 1 ["+seq1Name+"] count ["+TFFreq.get(seq1Name)+"]: ");// //System.out.println(seq1.asFormatString()); //System.out.print("seq 2 ["+hashtag+"] count ["+TFFreq.get(hashtag)+"]: "); if (seq1 != null && seq2 != null) { //System.out.println(seq2.asFormatString()); CosineDistanceMeasure cos = new CosineDistanceMeasure(); double distVal = cos.distance(seq1, seq2); //System.out.println("***********["+hashtag+"] v.s ["+seq1Name+"]: "+distVal+"***********"); //check whether one of the two words is baseline criteria boolean inBase = false; for (String basehash : Settings.baseKeywords) { if (hashtag.replace("#", "").equals(basehash.toLowerCase()) || hashtag.equals(basehash) || keywords.get(count).replace("#", "").equals(basehash.toLowerCase()) || keywords.get(count).equals(basehash)) { inBase = true; break; } } //base 0.8, others 0.5 if (inBase) { if (distVal < 0.8 && distVal > 0.00001 && !keywords.contains(hashtag)) { keywordsFreqName.add(hashtag);// keywordsFreq.add(seq2); 
keywords.add(hashtag); System.out.println("BASEL***********[" + hashtag + "] v.s [" + seq1Name + "]: " + distVal + "***********"); } } else { if (distVal < 0.5 && distVal > 0.00001 && !keywords.contains(hashtag)) { keywordsFreqName.add(hashtag);// keywordsFreq.add(seq2); keywords.add(hashtag); System.out.println("OTHER***********[" + hashtag + "] v.s [" + seq1Name + "]: " + distVal + "***********"); } } } } keywordsFreqName.remove(0);// keywordsFreq.remove(0); count++; } return keywords.toArray(new String[keywords.size()]); }