Example usage for org.apache.lucene.index IndexReader getSumDocFreq

List of usage examples for org.apache.lucene.index IndexReader getSumDocFreq

Introduction

On this page you can find example usages of org.apache.lucene.index IndexReader getSumDocFreq.

Prototype

public abstract long getSumDocFreq(String field) throws IOException;

Document

Returns the sum of TermsEnum#docFreq() for all terms in this field.
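
A minimal, self-contained sketch of the call is shown below. It assumes the Lucene 4.x API used by the examples that follow; the index path and the "contents" field name are placeholders.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SumDocFreqExample {
    public static void main(String[] args) throws IOException {
        // Open an existing index; replace the path with a real index directory.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // Sum of TermsEnum#docFreq() over all terms in the "contents" field;
            // in Lucene 4.x this may be -1 if the codec does not store the statistic.
            long sumDocFreq = reader.getSumDocFreq("contents");
            System.out.println("sumDocFreq(contents) = " + sumDocFreq);
        } finally {
            reader.close();
            dir.close();
        }
    }
}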

Usage

From source file:game.TermFreq.java

void loadTfVec() throws Exception {

    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);

    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            //get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);

            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by len
        float idf = sumDf / (float) reader.docFreq(tf.term); // use float division to avoid truncating long division
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }

    Collections.sort(tfvec);
}

From source file:searching.QueryExpansion.java

/**
 * calculate positional relevance weights
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException 
 */

public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap<>();
        Integer length = 0;

        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));

        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }

                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");
        TreeMap<String, Double> map = new TreeMap<>();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);

                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM){
                            //w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0 ) + spud_pi * (df / sum_df))));
                            //    w += f0;
                            //}else{
                            w += f0;
                            //}

                        }
                        //System.out.println("weight " + w );
                        prob += Math.log(w / w_norm);
                    }

                    //System.out.print(pos + "\t" + term + "\t" +  Math.exp(prob) + "\n");

                    /** sum of the probabilities over the positional terms in the documents*/
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word)/pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;

        actual_pdocs++;
    }
}

From source file:searching.QueryExpansion.java

/**
 * Builds the expansion term distribution from the stored pseudo-relevant documents.
 * @return the selected expansion terms, normalised to sum to 1.0
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {

    Map<String, Double> expansion_terms = new TreeMap<>();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");

    Double score_norm = 0.0;
    if (QueryExpansion.method == QueryExpansion.PDCM) {

        //logger.info(actual_pdocs + " docs" + this.pdocs.length);
        //expansion_terms = this.DCM().estimateDCM();
        //not implemented here

    } else if (QueryExpansion.method == QueryExpansion.SMM) {

        //get SMM estimates

        expansion_terms = this.SMM(reader, 20);

    } else {

        for (int i = 0; i < pseudo_rel_docs; i++) {

            map = this.pdocs[i];
            if (map != null) {

                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);

                for (String term : map.keySet()) {

                    double tf = (double) map.get(term);

                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        //logger.info(new Term(term) + "\t" + df + "\t" + sum_df);
                        //RM3
                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            //RM3 with u=0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);

                            //e = ((1-spud_pi)*((double) tf / this.pdoc_lengths[i]) +  spud_pi*(df / sum_df)) * Math.exp(this.pdoc_scores[i]);

                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            //Dir Topic Model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            //SPUD Topic Model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            //Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            //Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            //default RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }

                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }

                }

                score_norm += Math.exp(this.pdoc_scores[i]);
                //logger.info(i + "\t" + Math.exp(this.pdoc_scores[i]));

            }

        }

    }

    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;
    //sort
    ArrayList list = sortValue(expansion_terms);

    //create query-topic_model for QTM probability
    TreeMap<String, Double> query_topic_model = new TreeMap<>();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {

        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;

        norm += tsv;
        t++;

        query_topic_model.put(term, topic_prob);
        //System.out.println(term + "\t" + topic_prob + "\t" +  (double)((Map.Entry)list.get(i)).getValue());
    }

    /*
    if (QueryExpansion.method == QueryExpansion.SPUDQTM2){
    Double gen = this.QueryModelLikelihood(reader, query, query_topic_model);
    logger.info("Topic score " + gen + "\t" + query.mass());
    QueryExpansion.interpolation =  gen;
    }
    */

    //now just grab the selected terms and normalised to sum to 1.0
    TreeMap<String, Double> selected_terms = new TreeMap<>();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {

        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }

    return selected_terms;
}