List of usage examples for org.apache.lucene.index IndexReader getSumDocFreq
public abstract long getSumDocFreq(String field) throws IOException;
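A minimal, self-contained sketch of calling this method (assuming a recent Lucene release where FSDirectory.open takes a java.nio.file.Path; the index path and the field name "text" are placeholders): open a reader, read the field-level sum of document frequencies, and derive the average number of postings per document.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class SumDocFreqExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location and field name; replace with your own.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Sum of docFreq(term) over every term in the field "text".
            long sumDocFreq = reader.getSumDocFreq("text");
            // Number of documents that have at least one term in the field.
            int docCount = reader.getDocCount("text");
            System.out.println("sumDocFreq(\"text\") = " + sumDocFreq);
            if (docCount > 0) {
                // Average number of field postings per document.
                System.out.println("avg postings per doc = " + (double) sumDocFreq / docCount);
            }
        }
    }
}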
From source file:game.TermFreq.java
void loadTfVec() throws Exception {
    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);
    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);
            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by document length
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }
    Collections.sort(tfvec);
}
From source file:searching.QueryExpansion.java
/**
 * Calculate positional relevance weights.
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap();
        Integer length = 0;
        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);

        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }
                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        // All positions collected;
        // now iterate over the document again to get weights.
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");

        TreeMap<String, Double> map = new TreeMap();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega) + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);
                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM) {
                            //    w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0) + spud_pi * (df / sum_df))));
                            //} else {
                            w += f0;
                            //}
                        }
                        //System.out.println("weight " + w);
                        prob += Math.log(w / w_norm);
                    }
                    //System.out.print(pos + "\t" + term + "\t" + Math.exp(prob) + "\n");

                    // sum of the probabilities over the positional terms in the document
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word) / pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;
        actual_pdocs++;
    }
}
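For context on the second token loop above: each query term contributes an unnormalised Gaussian kernel over the distance between the current position and that query term's positions, and the sqrt(2 * pi * sigma^2) normaliser is applied once per query term when the log is taken. A minimal restatement of that kernel (the method name is invented for this sketch; sigma corresponds to prm_sigma):

// Unnormalised Gaussian proximity kernel used in the example above:
// exp(-(pos - p)^2 / (2 * sigma^2)).
static double proximityKernel(long pos, long p, double sigma) {
    double dist = ((pos - p) * (pos - p)) / (2 * sigma * sigma);
    return Math.exp(-dist);
}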
From source file:searching.QueryExpansion.java
/**
 * @return
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {

    Map<String, Double> expansion_terms = new TreeMap();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    Double score_norm = 0.0;

    if (QueryExpansion.method == QueryExpansion.PDCM) {
        //logger.info(actual_pdocs + " docs" + this.pdocs.length);
        //expansion_terms = this.DCM().estimateDCM();
        // not implemented here
    } else if (QueryExpansion.method == QueryExpansion.SMM) {
        // get SMM estimates
        expansion_terms = this.SMM(reader, 20);
    } else {

        for (int i = 0; i < pseudo_rel_docs; i++) {

            map = this.pdocs[i];
            if (map != null) {

                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);

                for (String term : map.keySet()) {

                    double tf = (double) map.get(term);
                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        //logger.info(new Term(term) + "\t" + df + "\t" + sum_df);

                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            // RM3 with u = 0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                            //e = ((1 - spud_pi) * ((double) tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df)) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            // Dirichlet topic model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            // SPUD topic model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            // Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            // Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            // default RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }

                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }
                }
                score_norm += Math.exp(this.pdoc_scores[i]);
                //logger.info(i + "\t" + Math.exp(this.pdoc_scores[i]));
            }
        }
    }

    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;

    // sort
    ArrayList list = sortValue(expansion_terms);

    // create query topic model for the QTM probability
    TreeMap<String, Double> query_topic_model = new TreeMap();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {
        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;
        norm += tsv;
        t++;
        query_topic_model.put(term, topic_prob);
        //System.out.println(term + "\t" + topic_prob + "\t" + (double) ((Map.Entry) list.get(i)).getValue());
    }

    /*
    if (QueryExpansion.method == QueryExpansion.SPUDQTM2) {
        Double gen = this.QueryModelLikelihood(reader, query, query_topic_model);
        logger.info("Topic score " + gen + "\t" + query.mass());
        QueryExpansion.interpolation = gen;
    }
    */

    // now just grab the selected terms and normalise so they sum to 1.0
    TreeMap<String, Double> selected_terms = new TreeMap();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {
        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }
    return selected_terms;
}
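Both QueryExpansion examples use getSumDocFreq("text") as the denominator of a document-frequency background model, i.e. docFreq(term) / sumDocFreq(field), which the SPUD-style estimators mix with the in-document term frequency. A small helper showing that pattern in isolation (the class and method names are invented for this sketch):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

final class BackgroundDfModel {

    private BackgroundDfModel() {
    }

    // Background probability of a term under a document-frequency model:
    // docFreq(field, term) divided by the sum of docFreq over all terms in the field.
    static double dfProbability(IndexReader reader, String field, String termText) throws IOException {
        double sumDf = (double) reader.getSumDocFreq(field);
        double df = (double) reader.docFreq(new Term(field, termText));
        return sumDf > 0 ? df / sumDf : 0.0;
    }
}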