List of usage examples for org.apache.lucene.index IndexReader docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term – the term whose document frequency is requested. The method returns the number of documents containing the term (0 if the term or field does not exist).
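Before the full examples below, a minimal self-contained sketch of calling docFreq directly; the index path, field name, and term here are hypothetical placeholders, not taken from the examples:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location and field/term names.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // docFreq returns the number of documents containing the term (0 if the term or field is absent).
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("df(contents:lucene) = " + df);
        }
    }
}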
From source file:searcher.CollStat.java
public HashMap<Integer, Integer> chooseIndexHitOrder(HttpSession session, String query) throws Exception {
    HashMap<Integer, Integer> hitMap;
    String key = SESSION_ATTR_HITORDER + query;
    hitMap = (HashMap<Integer, Integer>) session.getAttribute(key);
    if (hitMap != null)
        return hitMap;

    hitMap = new HashMap<>();
    String[] queryWords = analyze(query).split("\\s+");
    CollStat[] sumDFs = new CollStat[readers.length];

    // Sum the document frequency of every query term in each index.
    for (int i = 0; i < readers.length; i++) {
        sumDFs[i] = new CollStat(i);
        IndexReader reader = readers[i];
        for (String queryWord : queryWords) {
            sumDFs[i].sumDF += reader.docFreq(new Term(TrecDocIndexer.FIELD_ANALYZED_CONTENT, queryWord));
        }
    }

    // Order the indexes by their summed document frequency and record the hit order.
    Arrays.sort(sumDFs);
    for (int j = sumDFs.length - 1; j >= 0; j--) {
        hitMap.put(sumDFs.length - j - 1, sumDFs[j].indexNum);
    }
    session.setAttribute(key, hitMap);
    return hitMap;
}
From source file:searching.QueryExpansion.java
/**
 * Calculate positional relevance weights.
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap();
        Integer length = 0;
        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        ArrayList<Long> qpos;

        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }
                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        // All positions collected;
        // now iterate over the document again to get weights.
        TreeMap<String, Double> map = new TreeMap();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega) + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    prob = 0.0;
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);
                            w += f0;
                        }
                        prob += Math.log(w / w_norm);
                    }

                    /** sum of the probabilities over the positional terms in the documents */
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }

                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            sum += map.get(word) / pos_length;
        }

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;
        actual_pdocs++;
    }
}
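The inner loop above amounts to a Gaussian proximity kernel: each document position pos receives, from every occurrence of a query term at position p, a contribution exp(-(pos - p)^2 / (2 * prm_sigma^2)), normalized by sqrt(2 * pi * prm_sigma^2). A stand-alone restatement of just that kernel, with illustrative method and parameter names that are not part of the source class:

/**
 * Illustrative restatement of the positional weight computed above: a Gaussian
 * kernel over the distance between a document position and each query-term
 * occurrence, log-normalized by sqrt(2 * pi * sigma^2).
 */
static double positionalLogWeight(long pos, long[] queryTermPositions, double sigma) {
    double wNorm = Math.sqrt(2 * Math.PI * sigma * sigma);
    double w = 1.0; // same additive smoothing as in addPositionalExpansionDoc
    for (long p : queryTermPositions) {
        double dist = ((pos - p) * (pos - p)) / (2 * sigma * sigma);
        w += Math.exp(-dist);
    }
    return Math.log(w / wNorm);
}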
From source file:searching.QueryExpansion.java
/**
 * @return
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {

    Map<String, Double> expansion_terms = new TreeMap();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    Double score_norm = 0.0;

    if (QueryExpansion.method == QueryExpansion.PDCM) {
        //expansion_terms = this.DCM().estimateDCM();
        //not implemented here
    } else if (QueryExpansion.method == QueryExpansion.SMM) {
        // get SMM estimates
        expansion_terms = this.SMM(reader, 20);
    } else {
        for (int i = 0; i < pseudo_rel_docs; i++) {
            map = this.pdocs[i];
            if (map != null) {
                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega) + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);

                for (String term : map.keySet()) {
                    double tf = (double) map.get(term);
                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));

                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            // RM3 with u=0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            // Dirichlet Topic Model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            // SPUD Topic Model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            // Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            // Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            // default: RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }

                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }
                }
                score_norm += Math.exp(this.pdoc_scores[i]);
            }
        }
    }

    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;

    // sort
    ArrayList list = sortValue(expansion_terms);

    // create query-topic model for QTM probability
    TreeMap<String, Double> query_topic_model = new TreeMap();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {
        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;
        norm += tsv;
        t++;
        query_topic_model.put(term, topic_prob);
    }

    /*
    if (QueryExpansion.method == QueryExpansion.SPUDQTM2) {
        Double gen = this.QueryModelLikelihood(reader, query, query_topic_model);
        logger.info("Topic score " + gen + "\t" + query.mass());
        QueryExpansion.interpolation = gen;
    }
    */

    // now just grab the selected terms and normalise them to sum to 1.0
    TreeMap<String, Double> selected_terms = new TreeMap();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {
        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }
    return selected_terms;
}
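For the default RM3 branch above, the accumulation can be summarized in isolation: each candidate term's weight is tf/|D| scaled by exp(score(D)), summed over the pseudo-relevant documents and finally normalized to sum to one. A compact sketch under those assumptions; the method, parameters, and java.util collections used here are illustrative, not the source's fields:

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/** Illustrative RM3-style accumulation over pseudo-relevant documents (u = 0). */
static Map<String, Double> rm3Weights(List<Map<String, Double>> pdocs, double[] docLengths, double[] docScores) {
    Map<String, Double> weights = new TreeMap<>();
    for (int i = 0; i < pdocs.size(); i++) {
        for (Map.Entry<String, Double> entry : pdocs.get(i).entrySet()) {
            // tf / |D| scaled by exp(score(D)), summed over the feedback documents
            double e = (entry.getValue() / docLengths[i]) * Math.exp(docScores[i]);
            weights.merge(entry.getKey(), e, Double::sum);
        }
    }
    // Normalize the selected weights to sum to 1.0, as the method above does.
    double norm = weights.values().stream().mapToDouble(Double::doubleValue).sum();
    weights.replaceAll((term, w) -> w / norm);
    return weights;
}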
From source file:servlets.TermStatsComparator.java
/**
 * Processes requests for both HTTP <code>GET</code> and <code>POST</code> methods.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
protected void processRequest(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    response.setContentType("text/html;charset=UTF-8");

    try (PrintWriter out = response.getWriter()) {
        IndexReader reader = retriever.getReader();
        String term = request.getParameter("term");

        if (isNumber(term)) {
            // A numeric parameter is interpreted as "show the top-k most frequent terms".
            TermStats[] termStats = null;
            try {
                termStats = HighFreqTerms.getHighFreqTerms(reader, Integer.parseInt(term),
                        TrecDocRetriever.FIELD_ANALYZED_CONTENT, new TermStatsComparator());
            } catch (Exception ex) {
                out.println("Error in obtaining term stats");
            }
            if (termStats == null) {
                // Guard against a failed lookup so the loop below does not dereference null.
                out.println("Error in obtaining term stats");
                return;
            }

            StringBuffer responseBuff = new StringBuffer("<table><tbody>");
            responseBuff.append("<tr>").append("<th>").append("Term").append("</th>").append("<th>")
                    .append("Doc Freq").append("</th>").append("<th>").append("Coll Freq").append("</th>")
                    .append("</tr>");
            for (TermStats ts : termStats) {
                responseBuff.append("<tr>").append("<td>").append(ts.termtext.utf8ToString()).append("</td>")
                        .append("<td>").append(ts.docFreq).append("</td>").append("<td>")
                        .append(ts.totalTermFreq).append("</td>").append("</tr>");
            }
            responseBuff.append("</tbody></table>");
            out.println(responseBuff.toString());
        } else {
            // Otherwise report the document and collection frequency of the given term.
            String analyzedTerm = analyze(term);
            Term t = new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, analyzedTerm);
            int docFreq = reader.docFreq(t);
            long collFreq = reader.totalTermFreq(t);
            out.println("Doc freq: " + docFreq + " " + "Coll Freq: " + collFreq);
        }
    }
}
From source file:spell.SpellIndex.java
License:Apache License
/**
 * Suggest similar words (restricted to a field of a user index or not).
 *
 * @param word String the word you want a spell check done on
 * @param num_sug int the number of suggested words
 * @param ir the IndexReader of the user index (can be null, see parameter "field")
 * @param field String the field of the user index: if field is not null, the
 *        suggested words are restricted to the words present in this field.
 * @param morePopular boolean return only the suggested words that are more
 *        frequent than the searched word (only in restricted mode,
 *        i.e. indexReader != null and field != null)
 * @throws IOException
 * @return ArrayList the list of suggested words, sorted by two criteria:
 *         1) the edit distance, 2) (only in restricted mode) the popularity
 *         of the suggested words in the field of the user index
 */
public ArrayList suggestSimilar(String word, int num_sug, IndexReader ir, String field, boolean morePopular)
        throws IOException {

    final TRStringDistance sd = new TRStringDistance(word);
    final int wordlen = word.length();
    final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(field, word)) : 0;

    // Return the word if it exists in the index and the caller
    // doesn't want a more popular word.
    if (!morePopular && goalFreq > 0) {
        ArrayList result = new ArrayList();
        SuggestWord sugg = new SuggestWord();
        sugg.string = word;
        sugg.score = 1.0f;
        result.add(sugg);
        return result;
    }

    // Don't query the index if the word is too short
    if (wordlen < MINWORDLEN) {
        return new ArrayList();
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;

    for (int ng = getMin(wordlen); ng <= getMax(wordlen); ng++) {
        key = "gram" + ng; // form key

        // form word into ngrams (allow dups too)
        grams = formGrams(word, ng);
        if (grams.length == 0) {
            continue; // hmm
        }

        // should we boost prefixes?
        if (bStart > 0) { // matches start of word
            add(query, "start" + ng, grams[0], bStart);
        }

        // should we boost suffixes?
        if (bEnd > 0) { // matches end of word
            add(query, "end" + ng, grams[grams.length - 1], bEnd);
        }

        for (int i = 0; i < grams.length; i++) {
            add(query, key, grams[i]);
        }
    }

    IndexSearcher searcher = new IndexSearcher(this.spellindex);
    Hits hits = searcher.search(query);
    SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

    // go through more than 'maxr' matches in case the distance filter triggers
    int stop = Math.min(hits.length(), 10 * num_sug);
    SuggestWord sugword = new SuggestWord();
    for (int i = 0; i < stop; i++) {
        sugword.string = hits.doc(i).get(F_WORD); // get orig word

        if (sugword.string.equals(word)) {
            // don't suggest a word for itself, that would be silly
            continue;
        }

        // edit distance, normalized by the min word length
        sugword.score = 1.0f - ((float) sd.getDistance(sugword.string)
                / Math.min(sugword.string.length(), wordlen));
        if (sugword.score < min) {
            continue;
        }

        // use the user index
        if (ir != null) {
            // freq in the index
            sugword.freq = ir.docFreq(new Term(field, sugword.string));

            // don't suggest a word that is not present in the field
            if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) {
                continue;
            }
        }

        sugqueue.insert(sugword);
        if (sugqueue.size() == num_sug) {
            // if the queue is full, maintain the min score
            min = ((SuggestWord) sugqueue.top()).score;
        }
        sugword = new SuggestWord();
    }

    // convert to ArrayList
    ArrayList result = new ArrayList(sugqueue.size());
    for (int i = sugqueue.size() - 1; i >= 0; i--) {
        result.add(sugqueue.pop());
    }
    searcher.close();
    return result;
}
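A hedged usage sketch of suggestSimilar; the spellIndex instance, userIndexReader, field name, and query word are hypothetical, and the surrounding SpellIndex setup is not shown in the source:

// Hypothetical usage: suggest up to 5 corrections for "recieve", restricted to words
// that occur in the "contents" field of the user's index and that are more popular
// than the misspelled word itself.
ArrayList suggestions = spellIndex.suggestSimilar("recieve", 5, userIndexReader, "contents", true);
for (Object o : suggestions) {
    SuggestWord sw = (SuggestWord) o;
    System.out.println(sw.string + " (score=" + sw.score + ", freq=" + sw.freq + ")");
}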
From source file:spell.SpellIndex.java
License:Apache License
/**
 * Tests whether the word exists in the index.
 */
public boolean exist(String word) throws IOException {
    IndexReader reader = getReader();
    boolean result = reader.docFreq(new Term(F_WORD, word)) > 0;
    reader.close();
    return result;
}
From source file:vectorizer.TermInfo.java
private DocVector buildTerms(IndexReader reader, int docId, int numDocs, Dictionary dict) throws Exception {
    DocVector wmap = new DocVector(reader.document(docId).get(ID_FIELD_NAME));
    Terms tfvector;
    TermsEnum termsEnum;
    String termText;
    BytesRef term;
    int tf;
    float idf;

    tfvector = reader.getTermVector(docId, CONTENT_FIELD_NAME);
    if (tfvector == null)
        return null;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field

    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();
        float df = reader.docFreq(new Term(CONTENT_FIELD_NAME, termText));
        idf = (float) Math.log(1 + numDocs / df);

        TermInfo termInfo = new TermInfo(termText, tf, getTermId(termText), idf);

        if (dict != null) {
            Translations translations = dict.getTranslationTerms(termText);
            for (TranslationInfo tinfo : translations.getTranslationInfo()) {
                termInfo.tf *= tinfo.weight;
            }
        }

        // Update global stats
        TermInfo seenTermInfo = collFreq.get(termText);
        if (seenTermInfo == null) {
            seenTermInfo = new TermInfo(termInfo.term, termInfo.tf, termInfo.id, termInfo.idf);
            collFreq.put(termText, seenTermInfo);
        } else {
            seenTermInfo.tf += termInfo.tf; // collection frequency
        }

        wmap.addTermInfo(termInfo);
    }
    return wmap;
}