Usage examples for org.apache.lucene.search.similarities.ClassicSimilarity#idf
@Override public float idf(long docFreq, long docCount)
Computes the inverse document frequency as log((docCount + 1) / (docFreq + 1)) + 1, where log is the natural logarithm.
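As a quick sanity check of the formula with made-up numbers: for a term that occurs in 9 of 99 indexed documents,

ClassicSimilarity similarity = new ClassicSimilarity();
// idf = log((99 + 1) / (9 + 1)) + 1 = ln(10) + 1 ≈ 3.3026
float idf = similarity.idf(9, 99);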
From source file: ai.castor.idf.FetchTermIDF.java
License: Apache License
public double getTermIDF(String term) throws ParseException {
    Analyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();
    String esTerm = qp.escape(term);
    double termIDF = 0.0;
    try {
        TermQuery q = (TermQuery) qp.parse(esTerm);
        Term t = q.getTerm();
        termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
        System.out.println(term + '\t' + esTerm + '\t' + q + '\t' + t + '\t' + termIDF);
    } catch (Exception e) {
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }
    return termIDF;
}
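This snippet, like most examples on this page, reads from an IndexReader field named reader that the enclosing class opens elsewhere. A minimal sketch of that setup, with a hypothetical index path, might look like:

// Hypothetical setup for the undeclared 'reader' field used above.
Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
IndexReader reader = DirectoryReader.open(dir);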
From source file: ai.castor.idf.IDFScorer.java
License: Apache License
public double calcIDF(String query, String answer, boolean analyze) throws ParseException {
    Analyzer analyzer;
    if (analyze) {
        analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    } else {
        analyzer = new WhitespaceAnalyzer();
    }
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();
    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().split("\\s+")));
    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();
    String[] terms = answer.split("\\s+");
    for (String term : terms) {
        try {
            TermQuery q = (TermQuery) qp.parse(term);
            Term t = q.getTerm();
            // Accumulate IDF only for answer terms that also occur in the question,
            // counting each distinct term once.
            if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                idf += similarity.idf(reader.docFreq(t), reader.numDocs());
                seenTerms.add(t.toString());
            }
        } catch (Exception e) {
            // Skip terms that fail to parse.
        }
    }
    return idf;
}
From source file: com.o19s.es.explore.ExplorerQuery.java
License: Apache License
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    if (!needsScores) {
        return searcher.createWeight(query, false, boost);
    }
    final Weight subWeight = searcher.createWeight(query, true, boost);
    Set<Term> terms = new HashSet<>();
    subWeight.extractTerms(terms);
    if (isCollectionScoped()) {
        ClassicSimilarity sim = new ClassicSimilarity();
        StatisticsHelper df_stats = new StatisticsHelper();
        StatisticsHelper idf_stats = new StatisticsHelper();
        StatisticsHelper ttf_stats = new StatisticsHelper();
        for (Term term : terms) {
            TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term);
            TermStatistics tStats = searcher.termStatistics(term, ctx);
            df_stats.add(tStats.docFreq());
            idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs()));
            ttf_stats.add(tStats.totalTermFreq());
        }
        // If no terms are parsed in the query we opt for returning 0 instead of
        // throwing an exception that could break various pipelines.
        float constantScore;
        if (terms.size() > 0) {
            switch (type) {
                case "sum_classic_idf": constantScore = idf_stats.getSum(); break;
                case "mean_classic_idf": constantScore = idf_stats.getMean(); break;
                case "max_classic_idf": constantScore = idf_stats.getMax(); break;
                case "min_classic_idf": constantScore = idf_stats.getMin(); break;
                case "stddev_classic_idf": constantScore = idf_stats.getStdDev(); break;
                case "sum_raw_df": constantScore = df_stats.getSum(); break;
                case "min_raw_df": constantScore = df_stats.getMin(); break;
                case "max_raw_df": constantScore = df_stats.getMax(); break;
                case "mean_raw_df": constantScore = df_stats.getMean(); break;
                case "stddev_raw_df": constantScore = df_stats.getStdDev(); break;
                case "sum_raw_ttf": constantScore = ttf_stats.getSum(); break;
                case "min_raw_ttf": constantScore = ttf_stats.getMin(); break;
                case "max_raw_ttf": constantScore = ttf_stats.getMax(); break;
                case "mean_raw_ttf": constantScore = ttf_stats.getMean(); break;
                case "stddev_raw_ttf": constantScore = ttf_stats.getStdDev(); break;
                case "unique_terms_count": constantScore = terms.size(); break;
                default: throw new RuntimeException("Invalid stat type specified.");
            }
        } else {
            constantScore = 0.0f;
        }
        return new ConstantScoreWeight(ExplorerQuery.this, constantScore) {
            @Override
            public Explanation explain(LeafReaderContext context, int doc) throws IOException {
                Scorer scorer = scorer(context);
                int newDoc = scorer.iterator().advance(doc);
                assert newDoc == doc; // this is a DocIdSetIterator.all
                return Explanation.match(scorer.score(), "Stat Score: " + type);
            }

            @Override
            public Scorer scorer(LeafReaderContext context) throws IOException {
                return new ConstantScoreScorer(this, constantScore,
                        DocIdSetIterator.all(context.reader().maxDoc()));
            }

            @Override
            public boolean isCacheable(LeafReaderContext ctx) {
                return true;
            }
        };
    } else if (type.endsWith("_raw_tf")) {
        // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery.
        BooleanQuery.Builder qb = new BooleanQuery.Builder();
        for (Term t : terms) {
            qb.add(new BooleanClause(new PostingsExplorerQuery(t, PostingsExplorerQuery.Type.TF),
                    BooleanClause.Occur.SHOULD));
        }
        // FIXME: completely refactor this class to accept a list of terms directly instead of an
        // arbitrary query. Rewriting at this point is wrong; additionally we certainly build the
        // TermContext twice for every term. The problem is that we rely on extractTerms, which
        // happens too late in the process.
        Query q = qb.build().rewrite(searcher.getIndexReader());
        return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(q, true, boost), type);
    }
    throw new IllegalArgumentException("Unknown ExplorerQuery type [" + type + "]");
}
From source file: indexer.Retriever.java
private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    Term term = new Term(documentField, word);
    int documentsFreq = reader.docFreq(term);
    // Note: uses the number of documents that have this field (getDocCount)
    // rather than reader.numDocs() as the collection size.
    int documentsCount = reader.getDocCount(documentField);
    float idf = similarity.idf(documentsFreq, documentsCount);
    return word + ": " + idf + " (in " + documentsFreq + " documents)";
}
From source file: io.anserini.qa.passage.IdfPassageScorer.java
License: Apache License
@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    // EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();
    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(
            Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));

    // Add the question terms to the term-IDF map.
    for (String questionTerm : questionTerms) {
        try {
            TermQuery q = (TermQuery) qp.parse(questionTerm);
            Term t = q.getTerm();
            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            // Skip question terms that fail to parse.
        }
    }

    // Track sentence keys already scored, to avoid duplicate passages.
    HashSet<String> seenSentences = new HashSet<>();
    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();
        String[] terms = sent.getKey().toLowerCase().split("\\s+");
        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));
                // Accumulate IDF only for terms that also occur in the question,
                // counting each distinct term once per sentence.
                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                }
            } catch (Exception e) {
                // Skip terms that fail to parse.
            }
        }
        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore())
                && !seenSentences.contains(sent.getKey())) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}
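The final passage score is weightedScore = idf + 0.0001 * sent.getValue(), so the accumulated IDF dominates and the sentence's original retrieval score acts mostly as a tie-breaker. With illustrative numbers: an IDF sum of 5.2 and a retrieval score of 3.0 give 5.2 + 0.0001 × 3.0 = 5.2003.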
From source file: io.anserini.qa.passage.IdfPassageScorer.java
License: Apache License
@Override
public JSONObject getTermIdfJSON(List<String> sentList) {
    // EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();
    for (String sent : sentList) {
        String[] thisSentence = sent.trim().split("\\s+");
        for (String term : thisSentence) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));
            } catch (Exception e) {
                // Skip terms that fail to parse.
            }
        }
    }
    return new JSONObject(termIdfMap);
}
From source file: pretraga.IsolationSimilarity.java
public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);
        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);
            TermsEnum te = terms.iterator();
            BytesRef by;
            while ((by = te.next()) != null) {
                String term = by.utf8ToString();
                // Only ClassicSimilarity exposes idf/tf here; bail out if the
                // searcher is configured with a different Similarity.
                if (!(searcher.getSimilarity(true) instanceof ClassicSimilarity)) {
                    return;
                }
                ClassicSimilarity sim = (ClassicSimilarity) searcher.getSimilarity(true);
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                // System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(),
                        terms.getDocCount(), terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
            }
        }
    } catch (Exception e) {
        // Errors are swallowed silently here.
    }
}
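Finally, a self-contained sketch that ties the pieces together: it builds a tiny in-memory index and prints the ClassicSimilarity IDF for one term. The class name, field name, and document contents are illustrative, and the APIs assume a Lucene 6.x-era version consistent with the examples above.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class ClassicIdfExample {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        // Index three tiny documents; two of them contain the term "lucene".
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (String text : new String[] { "lucene in action", "lucene for search", "plain text" }) {
                Document doc = new Document();
                doc.add(new TextField("body", text, Field.Store.NO));
                writer.addDocument(doc);
            }
        }
        try (IndexReader reader = DirectoryReader.open(dir)) {
            ClassicSimilarity similarity = new ClassicSimilarity();
            Term term = new Term("body", "lucene");
            // docFreq = 2, docCount = 3:
            // idf = ln((3 + 1) / (2 + 1)) + 1 = ln(4/3) + 1 ≈ 1.2877
            float idf = similarity.idf(reader.docFreq(term), reader.numDocs());
            System.out.println("idf(lucene) = " + idf);
        }
    }
}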