Example usage for org.apache.lucene.index IndexReader getSumTotalTermFreq

List of usage examples for org.apache.lucene.index IndexReader getSumTotalTermFreq

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader getSumTotalTermFreq.

Prototype

public abstract long getSumTotalTermFreq(String field) throws IOException;

Document

Returns the sum of TermsEnum#totalTermFreq for all terms in this field.
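Before the collected examples, here is a minimal, self-contained sketch of a typical call: computing the average field length from this statistic. The index path and field name are hypothetical arguments, and the snippet assumes a recent Lucene release (older versions may return -1 when the statistic is not stored by the codec).

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public static double averageFieldLength(String indexPath, String field) throws Exception {
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)))) {
        long sumTotalTermFreq = reader.getSumTotalTermFreq(field); // total tokens indexed for the field
        int docCount = reader.getDocCount(field); // number of documents containing the field
        return (sumTotalTermFreq > 0 && docCount > 0) ? (double) sumTotalTermFreq / docCount : 0.0;
    }
}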

Usage

From source file:edu.umd.umiacs.clip.tools.scor.BM25Scorer.java

License:Apache License

public BM25Scorer(IndexReader ir, String field) {
    super(ir, field);
    k1 = 1.2f;
    b = 0.75f;
    try {
        avgdl = ir.getSumTotalTermFreq(field) / (float) ir.numDocs();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    cache = new float[(int) (avgdl * 10)];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * (1 - b + b * (i / avgdl));
    }
}

From source file:feedback.RelevanceModelIId.java

public float getQueryClarity(IndexReader reader) throws Exception {
    float klDiv = 0;
    float p_w_C;
    // For each v \in V (vocab of top ranked documents)
    for (Map.Entry<String, RetrievedDocTermInfo> e : retrievedDocsTermStats.termStats.entrySet()) {
        RetrievedDocTermInfo w = e.getValue();
        double sumCf = (double) reader.getSumTotalTermFreq(TrecDocIndexer.FIELD_ANALYZED_CONTENT);
        double cf = reader.totalTermFreq(new Term(TrecDocIndexer.FIELD_ANALYZED_CONTENT, w.wvec.getWord()));
        p_w_C = (float) (cf / sumCf);
        klDiv += w.wt * Math.log(w.wt / p_w_C);
    }
    return klDiv;
}

From source file:io.datalayer.lucene.frequency.AosFrequencyTerms.java

License:Apache License

public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Note: the termtext parameter is unused; this method returns the
    // field-level sum of total term frequencies, not a per-term count.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}

From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java

public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    //        System.out.println("queryid\tterm\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
    //                + "nbrUniqTerms\tqSize\tscq\tisInTitle\tisInAbstract\tisInDescription\tisInClaims");

    System.out.println(
            "queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\tnbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\tfreqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        //            System.err.print(l + "- " + queryid + " -> " + pt.getUcid() + ": ");
        long start2 = System.currentTimeMillis();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2 || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                .getQuery();
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);
            //***************************
            // Get features
            //*************************** 
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost();// Term frequency
            double ln_tf = Math.log(1 + tf);// Get log of the term frequency
            int totalTF = ir.docFreq(term.getTerm());
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF));// Inverse document frequency
            }
            double tfidf = ln_tf * idf;// Compute the TFIDF
            int tLength = term.getTerm().text().length();// Term length
            int qSize = 0;
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize(); // Query size
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text());
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll);// Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq));// Inverse Collection Term Frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs;// QueryScope

            //***************************
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t" + idf
                    + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms + "\t" + qSize
                    + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity + "\t" + freqInTitle
                    + "\t" + ratioInTitle + "\t" + freqDescription + "\t" + ratioInDescription + "\t"
                    + freqClaims + "\t" + ratioInClaims);
        }
        long end2 = System.currentTimeMillis();
        //            System.err.println(bQuery2.clauses().size() + " terms processed in " + Functions.getTimer(end2 - start2) + ".");
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}

From source file:org.getopt.luke.HighFreqTerms.java

License:Apache License

public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Note: the termtext parameter is unused; this method returns the
    // field-level sum of total term frequencies, not a per-term count.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}

From source file:searching.QueryExpansion.java

/**
 * Computes expansion term weights from the pseudo-relevant documents.
 *
 * @return the selected expansion terms, normalised to sum to 1.0
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {

    Map<String, Double> expansion_terms = new TreeMap<>();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");

    Double score_norm = 0.0;
    if (QueryExpansion.method == QueryExpansion.PDCM) {

        //logger.info(actual_pdocs + " docs" + this.pdocs.length);
        //expansion_terms = this.DCM().estimateDCM();
        //not implemented here

    } else if (QueryExpansion.method == QueryExpansion.SMM) {

        //get SMM estimates

        expansion_terms = this.SMM(reader, 20);

    } else {

        for (int i = 0; i < pseudo_rel_docs; i++) {

            map = this.pdocs[i];
            if (map != null) {

                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);

                for (String term : map.keySet()) {

                    double tf = (double) map.get(term);

                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        //logger.info(new Term(term) + "\t" + df + "\t" + sum_df);
                        //RM3
                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            //RM3 with u=0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);

                            //e = ((1-spud_pi)*((double) tf / this.pdoc_lengths[i]) +  spud_pi*(df / sum_df)) * Math.exp(this.pdoc_scores[i]);

                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            //Dir Topic Model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            //SPUD Topic Model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            //Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            //Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            //default RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }

                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }

                }

                score_norm += Math.exp(this.pdoc_scores[i]);
                //logger.info(i + "\t" + Math.exp(this.pdoc_scores[i]));

            }

        }

    }

    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;
    //sort
    ArrayList list = sortValue(expansion_terms);

    //create query-topic_model for QTM probability
    TreeMap<String, Double> query_topic_model = new TreeMap<>();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {

        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;

        norm += tsv;
        t++;

        query_topic_model.put(term, topic_prob);
        //System.out.println(term + "\t" + topic_prob + "\t" +  (double)((Map.Entry)list.get(i)).getValue());
    }

    /*
    if (QueryExpansion.method == QueryExpansion.SPUDQTM2){
    Double gen = this.QueryModelLikelihood(reader, query, query_topic_model);
    logger.info("Topic score " + gen + "\t" + query.mass());
    QueryExpansion.interpolation =  gen;
    }
    */

    //now just grab the selected terms and normalise them to sum to 1.0
    TreeMap<String, Double> selected_terms = new TreeMap<>();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {

        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }

    return selected_terms;
}

From source file:searching.QueryExpansion.java

/**
 * Estimates the SMM using Expectation Maximization for a multinomial
 * distribution.
 *
 * @return the normalised term distribution of the estimated model
 */
private Map<String, Double> SMM(IndexReader reader, int iterations) throws IOException {

    double mass = 0.0;
    for (int i = 0; i < this.actual_pdocs; i++) {

        mass += (double) pdoc_lengths[i];
    }

    //double lambda = 0.0 ;
    //get initial estimate counts

    Map<String, Double> counts = new TreeMap<>();
    Double f, est;
    for (int i = 0; i < this.actual_pdocs; i++) {

        if (pdocs[i] != null) {
            for (String term : this.pdocs[i].keySet()) {
                f = this.pdocs[i].get(term);

                est = counts.get(term);

                if (est == null) {
                    counts.put(term, f);
                } else {
                    counts.put(term, f + est);
                }
            }
        }
    }

    //now we have initial estimates of the maximum likelihood multinomial
    //use EM to find likelihood given the background model and fixed mixture parameter

    TreeMap<String, Double> rel_likelihoods = new TreeMap<>();
    Double cf, ptF, ptC, rl, co;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");

    for (int i = 0; i < iterations; i++) {

        //E-step (update relative likelihoods)
        for (String w : counts.keySet()) {
            cf = (double) reader.totalTermFreq(new Term("text", w));
            ptF = (1 - ssm_lambda) * counts.get(w) / mass;
            ptC = (ssm_lambda) * cf / sum_cf;
            rl = ptF / (ptF + ptC);
            rel_likelihoods.put(w, rl);
        }

        //M-step (recalculate max-likelihood of estimates given relative likelihoods)
        mass = 0.0;
        for (String w : counts.keySet()) {
            co = counts.get(w);
            rl = rel_likelihoods.get(w);
            mass += co * rl;
            counts.put(w, co * rl);
        }

        //logger.info("iter " + i + "\t"  + mass + " ");

    }

    //normalise partial count vector by updated mass and return

    for (String w : counts.keySet()) {
        counts.put(w, counts.get(w) / mass);
    }

    return counts;

}

From source file:utils.HighFreqTerms.java

License:Apache License

public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Note: the termtext parameter is unused; this method returns the
    // field-level sum of total term frequencies, not a per-term count.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}