List of usage examples for org.apache.lucene.index.IndexReader#getSumTotalTermFreq(String)
public abstract long getSumTotalTermFreq(String field) throws IOException;
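getSumTotalTermFreq returns the sum of totalTermFreq (i.e. the total number of token occurrences) over every term in the given field, or -1 if the statistic is unavailable (for example, when term frequencies were not indexed). A common use, visible in several of the examples below, is computing an average field length. A minimal sketch, assuming an already-open IndexReader; the field name "text" and the helper name are illustrative assumptions, not from any of the sources below:

    // Hypothetical helper: average number of tokens per document for a field.
    // Returns 0 if the statistic is unavailable (-1) or the field is empty.
    static double averageFieldLength(IndexReader reader, String field) throws IOException {
        long totalTokens = reader.getSumTotalTermFreq(field); // total token occurrences in the field
        int docCount = reader.getDocCount(field);             // number of documents containing the field
        if (totalTokens <= 0 || docCount <= 0) {
            return 0;
        }
        return (double) totalTokens / docCount;
    }

The BM25Scorer example below computes essentially this ratio (using numDocs() rather than getDocCount) to obtain its avgdl parameter.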
From source file:edu.umd.umiacs.clip.tools.scor.BM25Scorer.java
License:Apache License
public BM25Scorer(IndexReader ir, String field) {
    super(ir, field);
    k1 = 1.2f;
    b = 0.75f;
    try {
        // Average document length: total tokens in the field / number of documents.
        avgdl = ir.getSumTotalTermFreq(field) / (float) ir.numDocs();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    // Precompute the BM25 length-normalisation factor k1 * (1 - b + b * dl / avgdl)
    // for document lengths up to ten times the average.
    cache = new float[(int) (avgdl * 10)];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * (1 - b + b * (i / avgdl));
    }
}
From source file:feedback.RelevanceModelIId.java
public float getQueryClarity(IndexReader reader) throws Exception {
    float klDiv = 0;
    float p_w_C;
    // Collection size (total tokens in the field) is constant across terms.
    double sumCf = (double) reader.getSumTotalTermFreq(TrecDocIndexer.FIELD_ANALYZED_CONTENT);
    // For each v in V (vocabulary of the top-ranked documents)
    for (Map.Entry<String, RetrievedDocTermInfo> e : retrievedDocsTermStats.termStats.entrySet()) {
        RetrievedDocTermInfo w = e.getValue();
        double cf = reader.totalTermFreq(new Term(TrecDocIndexer.FIELD_ANALYZED_CONTENT, w.wvec.getWord()));
        p_w_C = (float) (cf / sumCf); // collection language model P(w|C)
        klDiv += w.wt * Math.log(w.wt / p_w_C);
    }
    return klDiv;
}
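For reference, this loop computes the standard query-clarity score: the KL divergence between an estimated query/relevance language model (assuming the weight w.wt holds P(w | theta_Q)) and the collection language model, whose denominator is exactly what getSumTotalTermFreq supplies. Sketched in LaTeX:

    \mathrm{clarity} = \sum_{w \in V} P(w \mid \theta_Q)\,\log \frac{P(w \mid \theta_Q)}{P(w \mid C)},
    \qquad
    P(w \mid C) = \frac{\mathrm{cf}(w)}{\sum_{w'} \mathrm{cf}(w')}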
From source file:io.datalayer.lucene.frequency.AosFrequencyTerms.java
License:Apache License
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Note: despite the name and the termtext parameter (which is never read),
    // this returns the field-wide sum of term frequencies.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}
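As the comment notes, this helper ignores termtext and returns the field-level total. If the intent was the collection frequency of a single term, a minimal sketch of that variant would use IndexReader.totalTermFreq(Term) instead (the helper name is hypothetical, not from the original source):

    // Hypothetical corrected variant: total occurrences of one term across the collection.
    public static long getTermCollectionFreq(IndexReader reader, String field, BytesRef termtext) {
        try {
            return reader.totalTermFreq(new Term(field, termtext));
        } catch (IOException e) {
            return 0;
        }
    }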
From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java
public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    System.out.println("queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
            + "nbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\t"
            + "freqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2
                || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery())
                .getClauses()[0].getQuery();
        // Leave-one-out: remove each clause in turn and compute features for the removed term.
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);

            // Compute features for the removed term.
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost(); // term frequency (stored as boost)
            double ln_tf = Math.log(1 + tf); // log of the term frequency
            int totalTF = ir.docFreq(term.getTerm()); // document frequency, despite the name
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF)); // inverse document frequency
            }
            double tfidf = ln_tf * idf; // TF-IDF
            int tLength = term.getTerm().text().length(); // term length
            int qSize = 0; // query size for the term's field
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize();
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize();
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text()); // computed but not printed
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            // Collection probability of the term within its field.
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll); // Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq)); // inverse collection term frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs; // query scope
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t"
                    + idf + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms
                    + "\t" + qSize + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity
                    + "\t" + freqInTitle + "\t" + ratioInTitle + "\t" + freqDescription + "\t"
                    + ratioInDescription + "\t" + freqClaims + "\t" + ratioInClaims);
        }
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
From source file:org.getopt.luke.HighFreqTerms.java
License:Apache License
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Same caveat as the AosFrequencyTerms helper above: termtext is unused and
    // the field-wide sum of term frequencies is returned.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}
From source file:searching.QueryExpansion.java
/**
 * @return the selected expansion terms, normalised to sum to 1.0
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {
    Map<String, Double> expansion_terms = new TreeMap<>();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    Double score_norm = 0.0;
    if (QueryExpansion.method == QueryExpansion.PDCM) {
        // not implemented here
    } else if (QueryExpansion.method == QueryExpansion.SMM) {
        // get SMM estimates
        expansion_terms = this.SMM(reader, 20);
    } else {
        for (int i = 0; i < pseudo_rel_docs; i++) {
            map = this.pdocs[i];
            if (map != null) {
                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);
                for (String term : map.keySet()) {
                    double tf = (double) map.get(term);
                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            // RM3 with mu = 0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            // Dirichlet topic model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            // SPUD topic model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            // Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            // Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            // default: RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }
                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }
                }
                score_norm += Math.exp(this.pdoc_scores[i]);
            }
        }
    }
    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;
    // Sort terms by score.
    ArrayList list = sortValue(expansion_terms);
    // Create the query-topic model for the QTM probability.
    TreeMap<String, Double> query_topic_model = new TreeMap<>();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {
        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;
        norm += tsv;
        t++;
        query_topic_model.put(term, topic_prob);
    }
    // Grab the selected terms and normalise them to sum to 1.0.
    TreeMap<String, Double> selected_terms = new TreeMap<>();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {
        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }
    return selected_terms;
}
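The RM3 branch above matches the usual relevance-model estimate over the feedback documents F, weighting each document's term distribution by its exponentiated retrieval score (normalisation by score_norm happens when the query-topic model is built). Sketched in LaTeX:

    P(w \mid R) \propto \sum_{D \in F} P(w \mid D)\, e^{\mathrm{score}(D)},
    \qquad
    P(w \mid D) = \frac{\mathrm{tf}(w, D)}{|D|}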
From source file:searching.QueryExpansion.java
/**
 * Estimate the SMM using Expectation-Maximization for a multinomial.
 *
 * @return the estimated term distribution
 */
private Map<String, Double> SMM(IndexReader reader, int iterations) throws IOException {
    double mass = 0.0;
    for (int i = 0; i < this.actual_pdocs; i++) {
        mass += (double) pdoc_lengths[i];
    }
    // Initial estimates: aggregate term counts over the pseudo-relevant documents.
    Map<String, Double> counts = new TreeMap<>();
    Double f, est;
    for (int i = 0; i < this.actual_pdocs; i++) {
        if (pdocs[i] != null) {
            for (String term : this.pdocs[i].keySet()) {
                f = this.pdocs[i].get(term);
                est = counts.get(term);
                if (est == null) {
                    counts.put(term, f);
                } else {
                    counts.put(term, f + est);
                }
            }
        }
    }
    // We now have initial maximum-likelihood multinomial estimates. Use EM to
    // find the likelihood given the background model and a fixed mixture parameter.
    TreeMap<String, Double> rel_likelihoods = new TreeMap<>();
    Double cf, ptF, ptC, rl, co;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    for (int i = 0; i < iterations; i++) {
        // E-step: update relative likelihoods.
        for (String w : counts.keySet()) {
            cf = (double) reader.totalTermFreq(new Term("text", w));
            ptF = (1 - ssm_lambda) * counts.get(w) / mass;
            ptC = (ssm_lambda) * cf / sum_cf;
            rl = ptF / (ptF + ptC);
            rel_likelihoods.put(w, rl);
        }
        // M-step: recalculate maximum-likelihood estimates given the relative likelihoods.
        mass = 0.0;
        for (String w : counts.keySet()) {
            co = counts.get(w);
            rl = rel_likelihoods.get(w);
            mass += co * rl;
            counts.put(w, co * rl);
        }
    }
    // Normalise the partial-count vector by the updated mass and return.
    for (String w : counts.keySet()) {
        counts.put(w, counts.get(w) / mass);
    }
    return counts;
}
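The EM loop fits a simple mixture model: feedback text is modelled as drawn from a topic model theta_F mixed with the collection model under a fixed mixing weight lambda (ssm_lambda in the code), where P(w | C) = cf(w) / sum_cf and sum_cf again comes from getSumTotalTermFreq. The two steps, sketched in LaTeX:

    \text{E-step:}\quad r_w = \frac{(1-\lambda)\,P(w \mid \theta_F)}{(1-\lambda)\,P(w \mid \theta_F) + \lambda\,P(w \mid C)}
    \qquad
    \text{M-step:}\quad P(w \mid \theta_F) \propto c(w, F)\, r_w

where c(w, F) are the aggregated feedback term counts.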
From source file:utils.HighFreqTerms.java
License:Apache License
public static long getTotalTermFreq(IndexReader reader, String field, BytesRef termtext) throws Exception {
    // Identical to the two helpers above: termtext is unused; the field-wide
    // sum of term frequencies is returned.
    try {
        return reader.getSumTotalTermFreq(field);
    } catch (Exception e) {
        return 0;
    }
}