List of usage examples for org.apache.lucene.index.IndexReader#getSumTotalTermFreq(String)
public abstract long getSumTotalTermFreq(String field) throws IOException;
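getSumTotalTermFreq returns the sum of totalTermFreq (i.e. the total number of token occurrences) over every term in the given field, or -1 if the statistic is unavailable (for example, when term frequencies were not indexed). A common use, visible in several of the examples below, is computing an average field length. A minimal sketch, assuming an already-open IndexReader; the field name "text" and the helper name are illustrative assumptions, not from any of the sources below:

    // Hypothetical helper: average number of tokens per document for a field.
    // Returns 0 if the statistic is unavailable (-1) or the field is empty.
    static double averageFieldLength(IndexReader reader, String field) throws IOException {
        long totalTokens = reader.getSumTotalTermFreq(field); // total token occurrences in the field
        int docCount = reader.getDocCount(field);             // number of documents containing the field
        if (totalTokens <= 0 || docCount <= 0) {
            return 0;
        }
        return (double) totalTokens / docCount;
    }

The BM25Scorer example below computes essentially this ratio (using numDocs() rather than getDocCount) to obtain its avgdl parameter.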
From source file:edu.umd.umiacs.clip.tools.scor.BM25Scorer.java
License:Apache License
public BM25Scorer(IndexReader ir, String field) {
    super(ir, field);
    k1 = 1.2f;
    b = 0.75f;
    try {
        // Average document length: total tokens in the field / number of documents.
        avgdl = ir.getSumTotalTermFreq(field) / (float) ir.numDocs();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    // Precompute the BM25 length-normalisation factor k1 * (1 - b + b * dl / avgdl)
    // for document lengths up to ten times the average.
    cache = new float[(int) (avgdl * 10)];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * (1 - b + b * (i / avgdl));
    }
}
From source file:feedback.RelevanceModelIId.java
public float getQueryClarity(IndexReader reader) throws Exception {
    float klDiv = 0;
    float p_w_C;
    // Collection size (total tokens in the field) is constant across terms.
    double sumCf = (double) reader.getSumTotalTermFreq(TrecDocIndexer.FIELD_ANALYZED_CONTENT);
    // For each v in V (vocabulary of the top-ranked documents)
    for (Map.Entry<String, RetrievedDocTermInfo> e : retrievedDocsTermStats.termStats.entrySet()) {
        RetrievedDocTermInfo w = e.getValue();
        double cf = reader.totalTermFreq(new Term(TrecDocIndexer.FIELD_ANALYZED_CONTENT, w.wvec.getWord()));
        p_w_C = (float) (cf / sumCf); // collection language model P(w|C)
        klDiv += w.wt * Math.log(w.wt / p_w_C);
    }
    return klDiv;
}
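For reference, this loop computes the standard query-clarity score: the KL divergence between an estimated query/relevance language model (assuming the weight w.wt holds P(w | theta_Q)) and the collection language model, whose denominator is exactly what getSumTotalTermFreq supplies. Sketched in LaTeX:

    \mathrm{clarity} = \sum_{w \in V} P(w \mid \theta_Q)\,\log \frac{P(w \mid \theta_Q)}{P(w \mid C)},
    \qquad
    P(w \mid C) = \frac{\mathrm{cf}(w)}{\sum_{w'} \mathrm{cf}(w')}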
From source file:io.datalayer.lucene.frequency.AosFrequencyTerms.java
License:Apache License
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Note: despite the name and the termtext parameter (which is never read),
    // this returns the field-wide sum of term frequencies.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}
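As the comment notes, this helper ignores termtext and returns the field-level total. If the intent was the collection frequency of a single term, a minimal sketch of that variant would use IndexReader.totalTermFreq(Term) instead (the helper name is hypothetical, not from the original source):

    // Hypothetical corrected variant: total occurrences of one term across the collection.
    public static long getTermCollectionFreq(IndexReader reader, String field, BytesRef termtext) {
        try {
            return reader.totalTermFreq(new Term(field, termtext));
        } catch (IOException e) {
            return 0;
        }
    }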
From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java
public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    System.out.println("queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
            + "nbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\t"
            + "freqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2
                || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery())
                .getClauses()[0].getQuery();
        // Leave-one-out: remove each clause in turn and compute features for the removed term.
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);

            // Compute features for the removed term.
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost(); // term frequency (stored as boost)
            double ln_tf = Math.log(1 + tf); // log of the term frequency
            int totalTF = ir.docFreq(term.getTerm()); // document frequency, despite the name
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF)); // inverse document frequency
            }
            double tfidf = ln_tf * idf; // TF-IDF
            int tLength = term.getTerm().text().length(); // term length
            int qSize = 0; // query size for the term's field
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize();
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text()); // computed but not printed
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            // Collection probability of the term within its field.
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll); // Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq)); // inverse collection term frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs; // query scope
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t"
                    + idf + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms
                    + "\t" + qSize + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity
                    + "\t" + freqInTitle + "\t" + ratioInTitle + "\t" + freqDescription + "\t"
                    + ratioInDescription + "\t" + freqClaims + "\t" + ratioInClaims);
        }
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
From source file:org.getopt.luke.HighFreqTerms.java
License:Apache License
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Same caveat as the AosFrequencyTerms helper above: termtext is unused and
    // the field-wide sum of term frequencies is returned.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}
From source file:searching.QueryExpansion.java
/**
 * @return the selected expansion terms, normalised to sum to 1.0
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {
    Map<String, Double> expansion_terms = new TreeMap<>();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    Double score_norm = 0.0;
    if (QueryExpansion.method == QueryExpansion.PDCM) {
        // not implemented here
    } else if (QueryExpansion.method == QueryExpansion.SMM) {
        // get SMM estimates
        expansion_terms = this.SMM(reader, 20);
    } else {
        for (int i = 0; i < pseudo_rel_docs; i++) {
            map = this.pdocs[i];
            if (map != null) {
                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);
                for (String term : map.keySet()) {
                    double tf = (double) map.get(term);
                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            // RM3 with mu = 0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            // Dirichlet topic model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            // SPUD topic model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            // Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            // Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            // default: RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }
                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }
                }
                score_norm += Math.exp(this.pdoc_scores[i]);
            }
        }
    }
    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;
    // Sort terms by score.
    ArrayList list = sortValue(expansion_terms);
    // Create the query-topic model for the QTM probability.
    TreeMap<String, Double> query_topic_model = new TreeMap<>();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {
        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;
        norm += tsv;
        t++;
        query_topic_model.put(term, topic_prob);
    }
    // Grab the selected terms and normalise them to sum to 1.0.
    TreeMap<String, Double> selected_terms = new TreeMap<>();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {
        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }
    return selected_terms;
}
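The RM3 branch above matches the usual relevance-model estimate over the feedback documents F, weighting each document's term distribution by its exponentiated retrieval score (normalisation by score_norm happens when the query-topic model is built). Sketched in LaTeX:

    P(w \mid R) \propto \sum_{D \in F} P(w \mid D)\, e^{\mathrm{score}(D)},
    \qquad
    P(w \mid D) = \frac{\mathrm{tf}(w, D)}{|D|}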
From source file:searching.QueryExpansion.java
/**
 * Estimate the SMM using Expectation-Maximization for a multinomial.
 *
 * @return the estimated term distribution
 */
private Map<String, Double> SMM(IndexReader reader, int iterations) throws IOException {
    double mass = 0.0;
    for (int i = 0; i < this.actual_pdocs; i++) {
        mass += (double) pdoc_lengths[i];
    }
    // Initial estimates: aggregate term counts over the pseudo-relevant documents.
    Map<String, Double> counts = new TreeMap<>();
    Double f, est;
    for (int i = 0; i < this.actual_pdocs; i++) {
        if (pdocs[i] != null) {
            for (String term : this.pdocs[i].keySet()) {
                f = this.pdocs[i].get(term);
                est = counts.get(term);
                if (est == null) {
                    counts.put(term, f);
                } else {
                    counts.put(term, f + est);
                }
            }
        }
    }
    // We now have initial maximum-likelihood multinomial estimates. Use EM to
    // find the likelihood given the background model and a fixed mixture parameter.
    TreeMap<String, Double> rel_likelihoods = new TreeMap<>();
    Double cf, ptF, ptC, rl, co;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    for (int i = 0; i < iterations; i++) {
        // E-step: update relative likelihoods.
        for (String w : counts.keySet()) {
            cf = (double) reader.totalTermFreq(new Term("text", w));
            ptF = (1 - ssm_lambda) * counts.get(w) / mass;
            ptC = (ssm_lambda) * cf / sum_cf;
            rl = ptF / (ptF + ptC);
            rel_likelihoods.put(w, rl);
        }
        // M-step: recalculate maximum-likelihood estimates given the relative likelihoods.
        mass = 0.0;
        for (String w : counts.keySet()) {
            co = counts.get(w);
            rl = rel_likelihoods.get(w);
            mass += co * rl;
            counts.put(w, co * rl);
        }
    }
    // Normalise the partial-count vector by the updated mass and return.
    for (String w : counts.keySet()) {
        counts.put(w, counts.get(w) / mass);
    }
    return counts;
}
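The EM loop fits a simple mixture model: feedback text is modelled as drawn from a topic model theta_F mixed with the collection model under a fixed mixing weight lambda (ssm_lambda in the code), where P(w | C) = cf(w) / sum_cf and sum_cf again comes from getSumTotalTermFreq. The two steps, sketched in LaTeX:

    \text{E-step:}\quad r_w = \frac{(1-\lambda)\,P(w \mid \theta_F)}{(1-\lambda)\,P(w \mid \theta_F) + \lambda\,P(w \mid C)}
    \qquad
    \text{M-step:}\quad P(w \mid \theta_F) \propto c(w, F)\, r_w

where c(w, F) are the aggregated feedback term counts.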
From source file:utils.HighFreqTerms.java
License:Apache License
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Identical to the two helpers above: termtext is unused; the field-wide
    // sum of term frequencies is returned.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}