Example usage for org.apache.lucene.index IndexReader docFreq

List of usage examples for org.apache.lucene.index IndexReader docFreq

Introduction

On this page you can find example usages of org.apache.lucene.index IndexReader docFreq.

Prototype

public abstract int docFreq(Term term) throws IOException;

Document

Returns the number of documents containing the term.
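
Before the project examples below, here is a minimal, self-contained sketch of the call, assuming a recent Lucene version (5.x or later); the index path and the "contents" field name are illustrative assumptions, not taken from the examples:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqDemo {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing on-disk index (path is a placeholder).
        try (IndexReader reader = DirectoryReader.open(
                FSDirectory.open(Paths.get("/path/to/index")))) {
            Term term = new Term("contents", "lucene");
            int df = reader.docFreq(term); // number of documents containing the term
            int numDocs = reader.numDocs(); // number of documents in the index
            // A typical use of docFreq: an inverse document frequency weight.
            double idf = Math.log(1.0 + (double) numDocs / Math.max(1, df));
            System.out.println("df=" + df + ", idf=" + idf);
        }
    }
}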

Usage

From source file: searcher.CollStat.java

public HashMap<Integer, Integer> chooseIndexHitOrder(HttpSession session, String query) throws Exception {
    HashMap<Integer, Integer> hitMap;

    String key = SESSION_ATTR_HITORDER + query;
    hitMap = (HashMap<Integer, Integer>) session.getAttribute(key);
    if (hitMap != null)
        return hitMap;

    hitMap = new HashMap<>();
    String[] queryWords = analyze(query).split("\\s+");

    CollStat[] sumDFs = new CollStat[readers.length];
    for (int i = 0; i < readers.length; i++) {
        sumDFs[i] = new CollStat(i);
        IndexReader reader = readers[i];

        for (String queryWord : queryWords) {
            sumDFs[i].sumDF += reader.docFreq(new Term(TrecDocIndexer.FIELD_ANALYZED_CONTENT, queryWord));
        }
    }
    Arrays.sort(sumDFs);

    for (int j = sumDFs.length - 1; j >= 0; j--) {
        hitMap.put(sumDFs.length - j - 1, sumDFs[j].indexNum);
    }
    session.setAttribute(key, hitMap);
    return hitMap;
}
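
Design note: the method sums the per-index document frequency of each query word, sorts the sub-indexes by that sum, and caches the resulting rank-to-index order in the HTTP session, so the ordering is computed only once per query.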

From source file: searching.QueryExpansion.java

/**
 * Calculates positional relevance weights for a pseudo-relevant document.
 *
 * @param query the query
 * @param text the text of the document
 * @param doc_score the retrieval score of the document
 * @param analyzer the analyzer used to tokenize the text
 * @param reader the index reader used for collection statistics
 * @throws IOException 
 */

public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap<>();
        Integer length = 0;

        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));

        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }

                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");
        TreeMap<String, Double> map = new TreeMap<>();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {

                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);

                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM){
                            //w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0 ) + spud_pi * (df / sum_df))));
                            //    w += f0;
                            //}else{
                            w += f0;
                            //}

                        }
                        //System.out.println("weight " + w );
                        prob += Math.log(w / w_norm);
                    }

                    //System.out.print(pos + "\t" + term + "\t" +  Math.exp(prob) + "\n");

                    /** sum of the probabilities over the positional terms in the documents*/
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word)/pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;

        actual_pdocs++;
    }
}
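
Note that the second pass weights each token position with a Gaussian kernel exp(-((pos - p)^2) / (2 * prm_sigma^2)) centred on every query-term occurrence p, normalised by sqrt(2 * PI * prm_sigma^2). The document frequency obtained via docFreq is only consumed by the collection-smoothing variant that is commented out, so df is currently computed but unused.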

From source file: searching.QueryExpansion.java

/**
 * Computes expansion terms from the accumulated pseudo-relevant documents.
 *
 * @return the selected expansion terms, normalised to sum to 1.0
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {

    Map<String, Double> expansion_terms = new TreeMap<>();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");

    Double score_norm = 0.0;
    if (QueryExpansion.method == QueryExpansion.PDCM) {

        //logger.info(actual_pdocs + " docs" + this.pdocs.length);
        //expansion_terms = this.DCM().estimateDCM();
        //not implemented here

    } else if (QueryExpansion.method == QueryExpansion.SMM) {

        //get SMM estimates

        expansion_terms = this.SMM(reader, 20);

    } else {

        for (int i = 0; i < pseudo_rel_docs; i++) {

            map = this.pdocs[i];
            if (map != null) {

                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);

                for (String term : map.keySet()) {

                    double tf = (double) map.get(term);

                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        //logger.info(new Term(term) + "\t" + df + "\t" + sum_df);
                        //RM3
                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            //RM3 with u=0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);

                            //e = ((1-spud_pi)*((double) tf / this.pdoc_lengths[i]) +  spud_pi*(df / sum_df)) * Math.exp(this.pdoc_scores[i]);

                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            //Dir Topic Model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            //SPUD Topic Model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            //Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            //Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            //default RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }

                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }

                }

                score_norm += Math.exp(this.pdoc_scores[i]);
                //logger.info(i + "\t" + Math.exp(this.pdoc_scores[i]));

            }

        }

    }

    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;
    //sort
    ArrayList list = sortValue(expansion_terms);

    //create query-topic_model for QTM probability
    TreeMap<String, Double> query_topic_model = new TreeMap<>();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {

        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;

        norm += tsv;
        t++;

        query_topic_model.put(term, topic_prob);
        //System.out.println(term + "\t" + topic_prob + "\t" +  (double)((Map.Entry)list.get(i)).getValue());
    }

    /*
    if (QueryExpansion.method == QueryExpansion.SPUDQTM2){
    Double gen = this.QueryModelLikelihood(reader, query, query_topic_model);
    logger.info("Topic score " + gen + "\t" + query.mass());
    QueryExpansion.interpolation =  gen;
    }
    */

    //now just grab the selected terms and normalised to sum to 1.0
    TreeMap<String, Double> selected_terms = new TreeMap<>();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {

        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }

    return selected_terms;
}
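
Here docFreq supplies the df statistic for the SPUD topic-model branch. Each model variant (RM3, Dirichlet topic model, SPUD, PRM1/PRM2) accumulates a weight per term across the pseudo-relevant documents, and the top num_expansion_terms weights are then renormalised to sum to 1.0.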

From source file: servlets.TermStatsComparator.java

/**
 * Processes requests for both HTTP <code>GET</code> and <code>POST</code>
 * methods.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
protected void processRequest(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    response.setContentType("text/html;charset=UTF-8");
    try (PrintWriter out = response.getWriter()) {
        IndexReader reader = retriever.getReader();
        String term = request.getParameter("term");

        if (isNumber(term)) {
            TermStats[] termStats = null;
            try {
                termStats = HighFreqTerms.getHighFreqTerms(reader, Integer.parseInt(term),
                        TrecDocRetriever.FIELD_ANALYZED_CONTENT, new TermStatsComparator());
            } catch (Exception ex) {
                termStats = null;
            }
            if (termStats == null) {
                out.println("Error in obtaining term stats");
                return; // don't fall through and dereference a null array
            }

            StringBuffer responseBuff = new StringBuffer("<table><tbody>");
            responseBuff.append("<tr>").append("<th>").append("Term").append("</th>").append("<th>")
                    .append("Doc Freq").append("</th>").append("<th>").append("Coll Freq").append("</th>")
                    .append("</tr>");

            for (TermStats ts : termStats) {
                responseBuff.append("<tr>").append("<td>").append(ts.termtext.utf8ToString()).append("</td>")
                        .append("<td>").append(ts.docFreq).append("</td>").append("<td>")
                        .append(ts.totalTermFreq).append("</td>").append("</tr>");
            }
            responseBuff.append("</tbody></table>");
            out.println(responseBuff.toString());
        } else {
            String analyzedTerm = analyze(term);
            Term t = new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, analyzedTerm);
            int docFreq = reader.docFreq(t);
            long collFreq = reader.totalTermFreq(t);
            out.println("Doc freq: " + docFreq + "&nbsp;&nbsp;" + "Coll Freq: " + collFreq);
        }
    }
}
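
The servlet branches on its input: a numeric parameter is treated as a count N and answered with a table of the top-N high-frequency terms from HighFreqTerms, while any other input is analyzed and answered with the document frequency (docFreq) and collection frequency (totalTermFreq) of the resulting term.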

From source file: spell.SpellIndex.java

License: Apache License

/**
 * Suggest similar words (optionally restricted to a field of a user index).
 *
 * @param word String the word you want a spell check done on
 * @param num_sug int the number of suggested words
 * @param ir the IndexReader of the user index (can be null; see
 * parameter "field")
 * @param field String the field of the user index: if field is
 * not null, the suggested words are restricted to the words
 * present in this field
 * @param morePopular boolean return only suggested words that
 * are more frequent than the searched word (only in restricted
 * mode, i.e. indexReader != null and field != null)
 * @throws IOException
 * @return the list of suggested words sorted by two criteria:
 * 1) the edit distance, 2) (only in restricted mode) the
 * popularity of the suggested word in the field of the user index
 */
public ArrayList suggestSimilar(String word, int num_sug, IndexReader ir, String field, boolean morePopular)
        throws IOException {
    final TRStringDistance sd = new TRStringDistance(word);
    final int wordlen = word.length();

    final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;

    // Return the word if it exists in the index and caller
    // doesn't want a more popular word.
    if (!morePopular && goalFreq > 0) {
        ArrayList result = new ArrayList();
        SuggestWord sugg = new SuggestWord();
        sugg.string = word;
        sugg.score = 1.0f;
        result.add(sugg);
        return result;
    }

    // Don't query index if word is too short
    if (wordlen < MINWORDLEN) {
        return new ArrayList();
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;

    for (int ng = getMin(wordlen); ng <= getMax(wordlen); ng++) {
        key = "gram" + ng; // form key

        // form word into ngrams (allow dups too)
        grams = formGrams(word, ng);

        if (grams.length == 0) {
            continue; // no n-grams of this length; skip
        }

        // should we boost prefixes?
        if (bStart > 0) {
            // matches start of word
            add(query, "start" + ng, grams[0], bStart);
        }

        // should we boost suffixes?
        if (bEnd > 0) {
            // matches end of word
            add(query, "end" + ng, grams[grams.length - 1], bEnd);
        }

        for (int i = 0; i < grams.length; i++) {
            add(query, key, grams[i]);
        }
    }

    IndexSearcher searcher = new IndexSearcher(this.spellindex);
    Hits hits = searcher.search(query);
    SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

    // examine more matches than requested in case the distance filter rejects some
    int stop = Math.min(hits.length(), 10 * num_sug);
    SuggestWord sugword = new SuggestWord();
    for (int i = 0; i < stop; i++) {
        sugword.string = hits.doc(i).get(F_WORD); // get orig word

        if (sugword.string.equals(word)) {
            // don't suggest a word for itself, that would be silly
            continue;
        }

        //edit distance/normalize with the min word length
        sugword.score = 1.0f
                - ((float) sd.getDistance(sugword.string) / Math.min(sugword.string.length(), wordlen));

        if (sugword.score < min) {
            continue;
        }

        // use the user index
        if (ir != null) {
            // freq in the index
            sugword.freq = ir.docFreq(new Term(field, sugword.string));

            // don't suggest a word that is not present in the field
            if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) {
                continue;
            }
        }

        sugqueue.insert(sugword);

        if (sugqueue.size() == num_sug) {
            //if queue full, maintain the min score
            min = ((SuggestWord) sugqueue.top()).score;
        }

        sugword = new SuggestWord();
    }

    // convert to ArrayList
    ArrayList result = new ArrayList(sugqueue.size());

    for (int i = sugqueue.size() - 1; i >= 0; i--) {
        result.add(sugqueue.pop());
    }

    searcher.close();

    return result;
}
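
docFreq plays two roles in this method: goalFreq records the frequency of the input word itself (to enforce the morePopular constraint), and sugword.freq is used to drop candidate suggestions that never occur in the given field of the user index.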

From source file: spell.SpellIndex.java

License: Apache License

/**
 * Tests whether the word exists in the index.
 */
public boolean exist(String word) throws IOException {
    IndexReader reader = getReader();
    boolean result = reader.docFreq(new Term(F_WORD, word)) > 0;
    reader.close();

    return result;
}
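
A document frequency greater than zero doubles as a cheap existence test: a term occurs in the index if and only if at least one document contains it.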

From source file: vectorizer.TermInfo.java

private DocVector buildTerms(IndexReader reader, int docId, int numDocs, Dictionary dict) throws Exception {
    DocVector wmap = new DocVector(reader.document(docId).get(ID_FIELD_NAME));
    Terms tfvector;
    TermsEnum termsEnum;
    String termText;
    BytesRef term;
    int tf;
    float idf;

    tfvector = reader.getTermVector(docId, CONTENT_FIELD_NAME);

    if (tfvector == null)
        return null;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field

    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();

        float df = reader.docFreq(new Term(CONTENT_FIELD_NAME, termText));
        idf = (float) Math.log(1 + numDocs / df);

        TermInfo termInfo = new TermInfo(termText, tf, getTermId(termText), idf);
        if (dict != null) {
            Translations translations = dict.getTranslationTerms(termText);
            for (TranslationInfo tinfo : translations.getTranslationInfo()) {
                termInfo.tf *= tinfo.weight;
            }
        }

        // Update global stats
        TermInfo seenTermInfo = collFreq.get(termText);
        if (seenTermInfo == null) {
            seenTermInfo = new TermInfo(termInfo.term, termInfo.tf, termInfo.id, termInfo.idf);
            collFreq.put(termText, seenTermInfo);
        } else {
            seenTermInfo.tf += termInfo.tf; // coll freq
        }

        wmap.addTermInfo(termInfo);
    }

    return wmap;
}
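
In this tf-idf vectorizer, docFreq provides the df in idf = log(1 + numDocs / df); term frequencies come from the per-document term vector, and the collFreq map accumulates collection-level frequencies across documents.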