List of usage examples for org.apache.lucene.index IndexReader getSumDocFreq
public abstract long getSumDocFreq(String field) throws IOException;
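A minimal, self-contained sketch of calling this method (assuming a recent Lucene release where FSDirectory.open takes a java.nio.file.Path; the index path and the field name "text" are placeholders): open a reader, read the field-level sum of document frequencies, and derive the average number of postings per document.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class SumDocFreqExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location and field name; replace with your own.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Sum of docFreq(term) over every term in the field "text".
            long sumDocFreq = reader.getSumDocFreq("text");
            // Number of documents that have at least one term in the field.
            int docCount = reader.getDocCount("text");
            System.out.println("sumDocFreq(\"text\") = " + sumDocFreq);
            if (docCount > 0) {
                // Average number of field postings per document.
                System.out.println("avg postings per doc = " + (double) sumDocFreq / docCount);
            }
        }
    }
}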
From source file:game.TermFreq.java
void loadTfVec() throws Exception {
    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);
    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);
            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by document length
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }
    Collections.sort(tfvec);
}
From source file:searching.QueryExpansion.java
/**
 * Calculate positional relevance weights.
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {

    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {

        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap();
        Integer length = 0;
        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);

        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }
                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        // All positions collected;
        // now iterate over the document again to get weights.
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");

        TreeMap<String, Double> map = new TreeMap();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega) + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);
                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM) {
                            //    w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0) + spud_pi * (df / sum_df))));
                            //} else {
                            w += f0;
                            //}
                        }
                        //System.out.println("weight " + w);
                        prob += Math.log(w / w_norm);
                    }
                    //System.out.print(pos + "\t" + term + "\t" + Math.exp(prob) + "\n");

                    // sum of the probabilities over the positional terms in the document
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word) / pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;
        actual_pdocs++;
    }
}
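For context on the second token loop above: each query term contributes an unnormalised Gaussian kernel over the distance between the current position and that query term's positions, and the sqrt(2 * pi * sigma^2) normaliser is applied once per query term when the log is taken. A minimal restatement of that kernel (the method name is invented for this sketch; sigma corresponds to prm_sigma):

// Unnormalised Gaussian proximity kernel used in the example above:
// exp(-(pos - p)^2 / (2 * sigma^2)).
static double proximityKernel(long pos, long p, double sigma) {
    double dist = ((pos - p) * (pos - p)) / (2 * sigma * sigma);
    return Math.exp(-dist);
}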
From source file:searching.QueryExpansion.java
/**
 * @return
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {

    Map<String, Double> expansion_terms = new TreeMap();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    Double score_norm = 0.0;

    if (QueryExpansion.method == QueryExpansion.PDCM) {
        //logger.info(actual_pdocs + " docs" + this.pdocs.length);
        //expansion_terms = this.DCM().estimateDCM();
        // not implemented here
    } else if (QueryExpansion.method == QueryExpansion.SMM) {
        // get SMM estimates
        expansion_terms = this.SMM(reader, 20);
    } else {

        for (int i = 0; i < pseudo_rel_docs; i++) {

            map = this.pdocs[i];
            if (map != null) {

                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);

                for (String term : map.keySet()) {

                    double tf = (double) map.get(term);
                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        //logger.info(new Term(term) + "\t" + df + "\t" + sum_df);

                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            // RM3 with u = 0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                            //e = ((1 - spud_pi) * ((double) tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df)) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            // Dirichlet topic model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            // SPUD topic model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            // Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            // Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            // default RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }

                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }
                }
                score_norm += Math.exp(this.pdoc_scores[i]);
                //logger.info(i + "\t" + Math.exp(this.pdoc_scores[i]));
            }
        }
    }

    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;

    // sort
    ArrayList list = sortValue(expansion_terms);

    // create query topic model for the QTM probability
    TreeMap<String, Double> query_topic_model = new TreeMap();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {
        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;
        norm += tsv;
        t++;
        query_topic_model.put(term, topic_prob);
        //System.out.println(term + "\t" + topic_prob + "\t" + (double) ((Map.Entry) list.get(i)).getValue());
    }

    /*
    if (QueryExpansion.method == QueryExpansion.SPUDQTM2) {
        Double gen = this.QueryModelLikelihood(reader, query, query_topic_model);
        logger.info("Topic score " + gen + "\t" + query.mass());
        QueryExpansion.interpolation = gen;
    }
    */

    // now just grab the selected terms and normalise so they sum to 1.0
    TreeMap<String, Double> selected_terms = new TreeMap();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {
        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }
    return selected_terms;
}
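Both QueryExpansion examples use getSumDocFreq("text") as the denominator of a document-frequency background model, i.e. docFreq(term) / sumDocFreq(field), which the SPUD-style estimators mix with the in-document term frequency. A small helper showing that pattern in isolation (the class and method names are invented for this sketch):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

final class BackgroundDfModel {

    private BackgroundDfModel() {
    }

    // Background probability of a term under a document-frequency model:
    // docFreq(field, term) divided by the sum of docFreq over all terms in the field.
    static double dfProbability(IndexReader reader, String field, String termText) throws IOException {
        double sumDf = (double) reader.getSumDocFreq(field);
        double df = (double) reader.docFreq(new Term(field, termText));
        return sumDf > 0 ? df / sumDf : 0.0;
    }
}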