Example usage for org.apache.lucene.search.similarities ClassicSimilarity idf

Introduction

On this page you can find example usage for org.apache.lucene.search.similarities.ClassicSimilarity#idf.

Prototype

@Override
public float idf(long docFreq, long docCount) 

Document

Implemented as log((docCount+1)/(docFreq+1)) + 1.
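
A minimal sketch (the values are hypothetical; this is not one of the usage examples below) checking idf() against the documented formula:

ClassicSimilarity similarity = new ClassicSimilarity();
long docFreq = 5;    // hypothetical: number of documents containing the term
long docCount = 100; // hypothetical: number of documents with a value for the field
float idf = similarity.idf(docFreq, docCount);
// Should equal log((docCount+1)/(docFreq+1)) + 1
float expected = (float) (Math.log((docCount + 1.0) / (docFreq + 1.0)) + 1.0);
System.out.println(idf + " == " + expected);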

Usage

From source file:ai.castor.idf.FetchTermIDF.java

License:Apache License

public double getTermIDF(String term) throws ParseException {
    Analyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String esTerm = qp.escape(term);
    double termIDF = 0.0;
    try {
        TermQuery q = (TermQuery) qp.parse(esTerm);
        Term t = q.getTerm();
        termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());

        System.out.println(term + '\t' + esTerm + '\t' + q + '\t' + t + '\t' + termIDF);
    } catch (Exception e) {
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }
    return termIDF;
}

From source file:ai.castor.idf.IDFScorer.java

License:Apache License

public double calcIDF(String query, String answer, boolean analyze) throws ParseException {
    Analyzer analyzer;
    if (analyze) {
        analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    } else {
        analyzer = new WhitespaceAnalyzer();
    }

    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().split("\\s+")));

    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();

    String[] terms = answer.split("\\s+");
    for (String term : terms) {
        try {
            TermQuery q = (TermQuery) qp.parse(term);
            Term t = q.getTerm();

            if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                idf += similarity.idf(reader.docFreq(t), reader.numDocs());
                seenTerms.add(t.toString());
            } else {
                idf += 0.0;
            }
        } catch (Exception e) {
            continue;
        }
    }
    return idf;
}

From source file:com.o19s.es.explore.ExplorerQuery.java

License:Apache License

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    if (!needsScores) {
        return searcher.createWeight(query, false, boost);
    }
    final Weight subWeight = searcher.createWeight(query, true, boost);
    Set<Term> terms = new HashSet<>();
    subWeight.extractTerms(terms);
    if (isCollectionScoped()) {
        ClassicSimilarity sim = new ClassicSimilarity();
        StatisticsHelper df_stats = new StatisticsHelper();
        StatisticsHelper idf_stats = new StatisticsHelper();
        StatisticsHelper ttf_stats = new StatisticsHelper();

        for (Term term : terms) {
            TermContext ctx = TermContext.build(searcher.getTopReaderContext(), term);
            TermStatistics tStats = searcher.termStatistics(term, ctx);
            df_stats.add(tStats.docFreq());
            idf_stats.add(sim.idf(tStats.docFreq(), searcher.getIndexReader().numDocs()));
            ttf_stats.add(tStats.totalTermFreq());
        }

        /*
        If no terms are parsed in the query we opt for returning 0
        instead of throwing an exception that could break various
        pipelines.
         */
        float constantScore;

        if (terms.size() > 0) {
            switch (type) {
            case ("sum_classic_idf"):
                constantScore = idf_stats.getSum();
                break;
            case ("mean_classic_idf"):
                constantScore = idf_stats.getMean();
                break;
            case ("max_classic_idf"):
                constantScore = idf_stats.getMax();
                break;
            case ("min_classic_idf"):
                constantScore = idf_stats.getMin();
                break;
            case ("stddev_classic_idf"):
                constantScore = idf_stats.getStdDev();
                break;
            case "sum_raw_df":
                constantScore = df_stats.getSum();
                break;
            case "min_raw_df":
                constantScore = df_stats.getMin();
                break;
            case "max_raw_df":
                constantScore = df_stats.getMax();
                break;
            case "mean_raw_df":
                constantScore = df_stats.getMean();
                break;
            case "stddev_raw_df":
                constantScore = df_stats.getStdDev();
                break;
            case "sum_raw_ttf":
                constantScore = ttf_stats.getSum();
                break;
            case "min_raw_ttf":
                constantScore = ttf_stats.getMin();
                break;
            case "max_raw_ttf":
                constantScore = ttf_stats.getMax();
                break;
            case "mean_raw_ttf":
                constantScore = ttf_stats.getMean();
                break;
            case "stddev_raw_ttf":
                constantScore = ttf_stats.getStdDev();
                break;
            case "unique_terms_count":
                constantScore = terms.size();
                break;

            default:
                throw new RuntimeException("Invalid stat type specified.");
            }
        } else {
            constantScore = 0.0f;
        }

        return new ConstantScoreWeight(ExplorerQuery.this, constantScore) {

            @Override
            public Explanation explain(LeafReaderContext context, int doc) throws IOException {
                Scorer scorer = scorer(context);
                int newDoc = scorer.iterator().advance(doc);
                assert newDoc == doc; // this is a DocIdSetIterator.all
                return Explanation.match(scorer.score(), "Stat Score: " + type);
            }

            @Override
            public Scorer scorer(LeafReaderContext context) throws IOException {
                return new ConstantScoreScorer(this, constantScore,
                        DocIdSetIterator.all(context.reader().maxDoc()));
            }

            @Override
            public boolean isCacheable(LeafReaderContext ctx) {
                return true;
            }

        };
    } else if (type.endsWith("_raw_tf")) {
        // Rewrite this into a boolean query where we can inject our PostingsExplorerQuery
        BooleanQuery.Builder qb = new BooleanQuery.Builder();
        for (Term t : terms) {
            qb.add(new BooleanClause(new PostingsExplorerQuery(t, PostingsExplorerQuery.Type.TF),
                    BooleanClause.Occur.SHOULD));
        }
        // FIXME: completely refactor this class to accept a list of terms directly instead of an arbitrary query.
        // Rewriting at this point is wrong; additionally, we almost certainly build the TermContext twice for every term.
        // The problem is that we rely on extractTerms, which happens too late in the process.
        Query q = qb.build().rewrite(searcher.getIndexReader());
        return new ExplorerQuery.ExplorerWeight(this, searcher.createWeight(q, true, boost), type);
    }
    throw new IllegalArgumentException("Unknown ExplorerQuery type [" + type + "]");
}

From source file:indexer.Retriever.java

private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int documentsFreq = 0;
    float idf = 0;

    Term term = new Term(documentField, word);
    int _documentsFreq = reader.docFreq(term);
    int documentsCount = reader.getDocCount(documentField);
    idf += similarity.idf(_documentsFreq, documentsCount);
    documentsFreq += _documentsFreq;

    String printString = word + ": " + idf + " (in " + documentsFreq + " documents)";
    return printString;
}
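
Note that the examples differ in the second argument passed to idf(): most pass reader.numDocs(), the index-wide document count, while the Retriever example above passes reader.getDocCount(field), which counts only documents that have a value for the field and thus matches the docCount parameter's per-field semantics. A minimal sketch contrasting the two (the field name "body" and the reader variable are hypothetical):

ClassicSimilarity sim = new ClassicSimilarity();
Term t = new Term("body", "lucene");
// index-wide count: every live document in the index
float idfIndexWide = sim.idf(reader.docFreq(t), reader.numDocs());
// field-scoped count: only documents with a value for "body"
float idfFieldScoped = sim.idf(reader.docFreq(t), reader.getDocCount("body"));
// The two agree whenever every document has a value for the field.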

From source file:io.anserini.qa.passage.IdfPassageScorer.java

License:Apache License

@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    //    EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(
            Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));

    // add the question terms to the termIDF Map
    for (String questionTerm : questionTerms) {
        try {
            TermQuery q = (TermQuery) qp.parse(questionTerm);
            Term t = q.getTerm();

            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            continue;
        }
    }

    // avoid duplicate passages
    HashSet<String> seenSentences = new HashSet<>();

    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();

        String[] terms = sent.getKey().toLowerCase().split("\\s+");
        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));

                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                } else {
                    idf += 0.0;
                }
            } catch (Exception e) {
                continue;
            }
        }

        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore())
                && !seenSentences.contains(sent)) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}

From source file:io.anserini.qa.passage.IdfPassageScorer.java

License:Apache License

@Override
public JSONObject getTermIdfJSON(List<String> sentList) {
    //    EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();

    for (String sent : sentList) {
        String[] thisSentence = sent.trim().split("\\s+");

        for (String term : thisSentence) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();

                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));
            } catch (Exception e) {
                continue;
            }
        }
    }
    return new JSONObject(termIdfMap);
}

From source file:pretraga.IsolationSimilarity.java

public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);

        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);

            TermsEnum te = terms.iterator();
            BytesRef by = te.next();
            while (by != null) {
                String term = by.utf8ToString();

                ClassicSimilarity sim = null;
                if (searcher.getSimilarity(true) instanceof ClassicSimilarity) {
                    sim = (ClassicSimilarity) searcher.getSimilarity(true);
                }
                if (sim == null) {
                    break; // the idf/tf calls below require ClassicSimilarity
                }
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                //System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(),
                        terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
                by = te.next();
            }

            /*Iterator<String> ss = ff.iterator();
            while (ss.hasNext()) {
            String fieldString = ss.next();
            System.out.println(fieldString);
            }*/
        }
    } catch (Exception e) {
        e.printStackTrace(); // surface unexpected index-access errors instead of swallowing them
    }
}