Usage examples for org.apache.lucene.index.IndexReader.totalTermFreq(Term)
public abstract long totalTermFreq(Term term) throws IOException;
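totalTermFreq returns the total number of occurrences of the term across the whole index (its collection frequency), while docFreq returns the number of documents that contain it. A minimal sketch of a lookup, in the Lucene 4.x style used by the examples below (the index path and the field name "contents" are placeholders, not taken from any of the examples):

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
    Term term = new Term("contents", "lucene");
    long collectionFreq = reader.totalTermFreq(term); // total occurrences over all documents
    long docFreq = reader.docFreq(term);              // number of documents containing the term
    System.out.println("cf=" + collectionFreq + ", df=" + docFreq);
    reader.close();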
From source file:feedback.RelevanceModelIId.java
public float getQueryClarity(IndexReader reader) throws Exception {
    float klDiv = 0;
    float p_w_C;
    // For each v \in V (vocab of top ranked documents)
    for (Map.Entry<String, RetrievedDocTermInfo> e : retrievedDocsTermStats.termStats.entrySet()) {
        RetrievedDocTermInfo w = e.getValue();
        // Collection statistics: total tokens in the field, and this term's collection frequency.
        double sumCf = (double) reader.getSumTotalTermFreq(TrecDocIndexer.FIELD_ANALYZED_CONTENT);
        double cf = reader.totalTermFreq(new Term(TrecDocIndexer.FIELD_ANALYZED_CONTENT, w.wvec.getWord()));
        p_w_C = (float) (cf / sumCf);
        klDiv += w.wt * Math.log(w.wt / p_w_C);
    }
    return klDiv;
}
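In this snippet p_w_C is the collection language-model probability cf / sumCf, so the returned value is the KL divergence sum over w of p(w|Q) * log(p(w|Q) / p(w|C)) between the feedback-term distribution (w.wt) and the collection model, i.e. a query-clarity score.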
From source file:IR.LuceneModel.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneModel indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneModel(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector; // = TopScoreDocCollector.create(100, true);
    s = "";
    ScoreDoc[] hits;
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            File queryFile = new File(s);
            BufferedReader r = new BufferedReader(new FileReader(queryFile));
            String query; // = r.readLine();
            int count = 0;
            String q1 = "LuceneResults.txt";
            File luceneFile = new File(q1);
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);
            while ((query = r.readLine()) != null) {
                try {
                    count++;
                    collector = TopScoreDocCollector.create(100, true);
                    QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", analyzer);
                    Query q = parser.parse(query.replace('/', ' '));
                    searcher.search(q, collector);
                    hits = collector.topDocs().scoreDocs;
                    int query_id;
                    query_id = count; // change this for new query
                    System.out.println("Found " + hits.length + " hits.");
                    for (int i = 0; i < hits.length; ++i) {
                        int docId = hits[i].doc;
                        Document d = searcher.doc(docId);
                        System.out.println(query_id + ". " + d.get("path").replaceAll(".html", "") + " "
                                + (i + 1) + " " + hits[i].score + " LuceneModel");
                        writer.write(String.format(query_id + " " + "Q0" + " "
                                + d.get("path").replaceAll(".html", "") + " " + (i + 1) + " "
                                + hits[i].score + " LuceneModel\n"));
                        writer.flush();
                        // System.out.println(fmt.format(""+query_id,"Q0",""+d.get("path"),""+(i + 1),""+hits[i].score));
                    }
                } catch (Exception e) {
                    // System.out.println(e.printStackTrace());
                    e.printStackTrace();
                    continue;
                }

                // 5. term stats --> watch out for which "version" of the term
                // must be checked here instead!
                Term termInstance = new Term("contents", s);
                long termFreq = reader.totalTermFreq(termInstance);
                long docCount = reader.docFreq(termInstance);
                System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
                // r.close();
            }
            r.close();
            writer.close();
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:ir.project.TFIDFMatrix.java
private void createMatrix() {
    try {
        this.matrix = new TFIDFBookVector[numDocs];
        IndexReader reader = DirectoryReader.open(this.index);

        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");

            // get title
            IndexableField titleField = reader.document(i).getField("title");
            String title = titleField.stringValue();

            // get isbn
            IndexableField isbnField = reader.document(i).getField("isbn");
            String isbn = isbnField.stringValue();

            // get author
            IndexableField authorField = reader.document(i).getField("author");
            String author = authorField.stringValue();

            this.matrix[i] = new TFIDFBookVector(numTerms, title, isbn, author);

            if (vector == null) {
                System.err.println("Vector is null");
                continue;
            }

            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());

                // totalTermFreq on the term-vector TermsEnum is the term's frequency in this document.
                Long tf = it.totalTermFreq();
                // IndexReader.totalTermFreq(t) is the term's total frequency over the whole collection.
                double idf = (double) 1 / (double) reader.totalTermFreq(t);
                double tfIdfWeight = tf * idf;

                // put TF-IDF weight in matrix
                int termIndex = this.termMap.get(it.term().utf8ToString());
                this.matrix[i].editValue(termIndex, tfIdfWeight);
            }
        }
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}
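The example above relies on two different totalTermFreq methods: the one on the per-document term-vector TermsEnum (frequency within that one document) and the one on IndexReader (frequency over the whole collection). A small sketch separating the two, assuming term vectors were stored for the "text" field and a Lucene version where Terms.iterator() takes no argument, as in the snippet above:

    Terms vector = reader.getTermVector(docId, "text");               // per-document term vector
    TermsEnum it = vector.iterator();
    while (it.next() != null) {
        String word = it.term().utf8ToString();
        long tfInDoc = it.totalTermFreq();                             // occurrences in this document
        long cfInIndex = reader.totalTermFreq(new Term("text", word)); // occurrences in the whole index
        System.out.println(word + ": tf=" + tfInDoc + ", cf=" + cfInIndex);
    }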
From source file:lucene.searchengine.LuceneSearchEngine.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneSearchEngine indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneSearchEngine(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));

    //===========================================================
    // GET Term frequency
    //===========================================================
    // Creating a output file to store the term,term_frequency pairs.
    PrintWriter tfwriter = new PrintWriter("..\\term-frequency.csv");
    Fields fields = MultiFields.getFields(reader);
    HashMap<String, Long> tfmap = new HashMap<String, Long>();
    Terms terms = fields.terms("contents");
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef bref = null;
    while ((bref = termsEnum.next()) != null) {
        String term_name = bref.utf8ToString(); // decode the term bytes as UTF-8
        Term term_instance = new Term("contents", term_name);
        long termFrequency = reader.totalTermFreq(term_instance);
        tfmap.put(term_name, termFrequency);
    }
    System.out.println(tfmap.size());
    for (String key : tfmap.keySet()) {
        tfwriter.write(key + "," + tfmap.get(key));
        tfwriter.write("\n");
    }
    tfwriter.close();
    //====================================================================
    // Code END to fetch term frequency
    //====================================================================

    IndexSearcher searcher = new IndexSearcher(reader);
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("filename") + " score=" + hits[i].score);
            }

            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:lucenesearche.HW3.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    HW3 indexer = null;
    try {
        indexLocation = s;
        indexer = new HW3(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Formatter f = new Formatter();
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // create a fresh collector for each query (reusing one collector across searches accumulates hits)
            TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            String query1, query2, query3, query4;
            query1 = "Lucene_Q1_top100.txt";
            query2 = "Lucene_Q2_top100.txt";
            query3 = "Lucene_Q3_top100.txt";
            query4 = "Lucene_Q4_top100.txt";
            File luceneFile = new File(query4); // change filename for each query
            int query_id;
            query_id = 4; // change this for new query
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);
            writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0",
                    "Document Name", "Rank", "Cosine Similarity Score", "System Name\n"));
            System.out.println("Found " + hits.length + " hits.");
            System.out.println(f.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0",
                    "Document Name", "Rank", "Cosine Similarity Score", "System Name"));
            for (int i = 0; i < hits.length; ++i) {
                Formatter fmt = new Formatter();
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                //System.out.println((i+1) +". " + d.get("path")+" "+ hits[i].score);
                writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001\n"));
                writer.flush();
                System.out.println(fmt.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001"));
            }
            writer.close();

            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:nicta.com.au.patent.pac.terms.impact.FeaturesSelection.java
public void iterateOverQueryTerms() throws ParseException, Exception {
    long start = System.currentTimeMillis();
    int l = 0;
    // System.out.println("queryid\tterm\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\t"
    //         + "nbrUniqTerms\tqSize\tscq\tisInTitle\tisInAbstract\tisInDescription\tisInClaims");
    System.out.println(
            "queryid\tremovedBooleanClause\ttf\tln_tf\tidf\ttfidf\ttLength\tratioTerm\tnbrUniqTerms\tqSize\tscq\tSCS\tictf\tQC\tclarity\tfreqInTitle\tratioInTitle\tfreqDescription\tratioInDescription\tfreqClaims\tratioInClaims");
    for (Map.Entry<String, PatentDocument> e : topics.getTopics().entrySet()) {
        l++;
        String queryid = e.getKey();
        PatentDocument pt = e.getValue();
        // System.err.print(l + "- " + queryid + " -> " + pt.getUcid() + ": ");
        long start2 = System.currentTimeMillis();
        PatentQuery query = new PatentQuery(pt, boosts, filter, stopWords);
        BooleanQuery bQuery = (BooleanQuery) query.parse();
        if (bQuery.getClauses().length != 2 || !(bQuery.getClauses()[1].getQuery() instanceof BooleanQuery)
                || ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses().length == 0
                || !(((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                        .getQuery() instanceof BooleanQuery)) {
            continue;
        }
        BooleanQuery bQuery2 = (BooleanQuery) ((BooleanQuery) bQuery.getClauses()[1].getQuery()).getClauses()[0]
                .getQuery();
        for (int i = 0; i < bQuery2.clauses().size(); i++) {
            BooleanQuery bQueryFinal = new BooleanQuery();
            BooleanQuery bQuery3 = bQuery2.clone();
            BooleanClause removedBooleanClause = bQuery3.clauses().remove(i);
            bQueryFinal.add((Query) bQuery.getClauses()[0].getQuery(), BooleanClause.Occur.MUST);
            bQueryFinal.add(bQuery3, BooleanClause.Occur.MUST);
            //***************************
            // Get features
            //***************************
            IndexReader ir = searcher.getIndexSearch().getIndexReader();
            TermQuery term = (TermQuery) removedBooleanClause.getQuery();
            double tf = removedBooleanClause.getQuery().getBoost(); // Term frequency
            double ln_tf = Math.log(1 + tf); // Get log of the term frequency
            int totalTF = ir.docFreq(term.getTerm());
            int docs = ir.getDocCount(term.getTerm().field());
            double idf = 0;
            if (totalTF != 0) {
                idf = Math.log10((double) docs / (totalTF)); // Inverse document frequency
            }
            double tfidf = ln_tf * idf; // Compute the TFIDF
            int tLength = term.getTerm().text().length(); // Term length
            int qSize = 0;
            if (term.getTerm().field().endsWith(PatentDocument.Title)) {
                qSize = query.getTitleSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Abstract)) {
                qSize = query.getAbstractSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Description)) {
                qSize = query.getDescriptionSize(); // Query size
            } else if (term.getTerm().field().endsWith(PatentDocument.Claims)) {
                qSize = query.getClaimsSize(); // Query size
            }
            double ratioTerm = (double) tf / qSize;
            int nbrUniqTerms = bQuery2.getClauses().length;
            long totalTermFreq = ir.totalTermFreq(term.getTerm());
            double ln_totalTermFreq = Math.log(1 + totalTermFreq);
            double scq = ln_totalTermFreq * idf;
            double freqInTitle = query.getFreqInTitle(term.getTerm().text());
            double ratioInTitle = (double) freqInTitle / query.getTitleSize();
            double freqAbstract = query.getFreqInAbstract(term.getTerm().text());
            double ratioInAbstract = (double) freqAbstract / query.getAbstractSize();
            double freqDescription = query.getFreqInDescription(term.getTerm().text());
            double ratioInDescription = (double) freqDescription / query.getDescriptionSize();
            double freqClaims = query.getFreqInClaims(term.getTerm().text());
            double ratioInClaims = (double) freqClaims / query.getClaimsSize();
            double Pcoll = (double) totalTermFreq / ir.getSumTotalTermFreq(term.getTerm().field());
            double SCS = 0;
            double ictf = 0;
            List<TermFreqVector> docsTermVector = getDocsTerms(searcher.search(term), term.getTerm().field());
            double a1 = 0;
            for (TermFreqVector vec : docsTermVector) {
                a1 += Math.sqrt((double) vec.getFreq(term.getTerm().text()) / vec.numberOfTerms());
            }
            double clarity = 0;
            if (totalTermFreq != 0) {
                SCS = ratioTerm * Log2(ratioTerm / Pcoll); // Simplified Clarity Score
                ictf = Math.log10((double) docs / (totalTermFreq)); // Inverse Collection Term Frequency
                clarity = a1 * Log2(a1 / Pcoll);
            }
            double QC = totalTF / (double) docs; // QueryScope
            //***************************
            System.out.println(queryid + "\t" + removedBooleanClause + "\t" + tf + "\t" + ln_tf + "\t" + idf
                    + "\t" + tfidf + "\t" + tLength + "\t" + ratioTerm + "\t" + nbrUniqTerms + "\t" + qSize
                    + "\t" + scq + "\t" + SCS + "\t" + ictf + "\t" + QC + "\t" + clarity + "\t" + freqInTitle
                    + "\t" + ratioInTitle + "\t" + freqDescription + "\t" + ratioInDescription + "\t"
                    + freqClaims + "\t" + ratioInClaims);
        }
        long end2 = System.currentTimeMillis();
        // System.err.println(bQuery2.clauses().size() + " terms processed in " + Functions.getTimer(end2 - start2) + ".");
    }
    long end = System.currentTimeMillis();
    long millis = (end - start);
    System.err.println("#Global Execution time: " + Functions.getTimer(millis) + ".");
}
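For reference, the collection statistics this example pulls from the reader can be sketched in isolation as follows (the field name and term text are hypothetical placeholders, not values from the example):

    IndexReader ir = searcher.getIndexSearch().getIndexReader();
    Term t = new Term("claims", "battery");           // hypothetical field and term
    long cf  = ir.totalTermFreq(t);                   // collection frequency of the term
    int  df  = ir.docFreq(t);                         // number of documents containing it
    int  n   = ir.getDocCount(t.field());             // documents with a value for the field
    long sum = ir.getSumTotalTermFreq(t.field());     // total tokens indexed for the field
    double idf   = (df == 0) ? 0 : Math.log10((double) n / df);
    double pColl = (sum == 0) ? 0 : (double) cf / sum; // collection language-model probability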
From source file:org.dkpro.tc.features.ngram.LuceneNgramDocumentTest.java
License:Apache License
private int getTermFreq(File luceneFolder, String string) throws Exception {
    @SuppressWarnings("deprecation")
    IndexReader idxReader = IndexReader.open(FSDirectory.open(luceneFolder));
    Term term = new Term("ngram" + EXTRACTOR_ID, string);
    return (int) idxReader.totalTermFreq(term);
}
From source file:org.dkpro.tc.features.ngram.LuceneNgramUnitTest.java
License:Apache License
private int getTermFreq(File luceneFolder, String string) throws Exception {
    @SuppressWarnings("deprecation")
    IndexReader idxReader = IndexReader.open(FSDirectory.open(luceneFolder));
    Term term = new Term("ngram" + EXTRACTOR_NAME, string);
    return (int) idxReader.totalTermFreq(term);
}
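These two test helpers use the static IndexReader.open, which was deprecated and removed in later Lucene versions. With a current Lucene the same lookup would look roughly like this (a sketch, assuming Lucene 5+ where FSDirectory.open takes a java.nio.file.Path):

    IndexReader idxReader = DirectoryReader.open(FSDirectory.open(luceneFolder.toPath()));
    Term term = new Term("ngram" + EXTRACTOR_NAME, string);
    long freq = idxReader.totalTermFreq(term);
    idxReader.close();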
From source file:searching.QueryExpansion.java
/**
 *
 * @return
 */
public Map<String, Double> expansionTerms(IndexReader reader, CustomQuery query, int query_length)
        throws IOException {

    Map<String, Double> expansion_terms = new TreeMap();
    Map<String, Double> map;
    Double f;
    Double e, prob;
    Double df;
    Double sum_df = (double) reader.getSumDocFreq("text");
    Double cf;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");

    Double score_norm = 0.0;
    if (QueryExpansion.method == QueryExpansion.PDCM) {
        //logger.info(actual_pdocs + " docs" + this.pdocs.length);
        //expansion_terms = this.DCM().estimateDCM();
        //not implemented here
    } else if (QueryExpansion.method == QueryExpansion.SMM) {
        //get SMM estimates
        expansion_terms = this.SMM(reader, 20);
    } else {
        for (int i = 0; i < pseudo_rel_docs; i++) {
            map = this.pdocs[i];
            if (map != null) {
                double spud_pi = SPUDLMSimilarity.b0 * QueryExpansion.spud_omega
                        / (map.size() * (1 - QueryExpansion.spud_omega)
                                + SPUDLMSimilarity.b0 * QueryExpansion.spud_omega);
                double dir_pi = SPUDLMSimilarity.dir_mu / (this.pdoc_lengths[i] + SPUDLMSimilarity.dir_mu);

                for (String term : map.keySet()) {
                    double tf = (double) map.get(term);

                    if (!term.contains(":")) {
                        df = (double) reader.docFreq(new Term("text", term));
                        cf = (double) reader.totalTermFreq(new Term("text", term));
                        //logger.info(new Term(term) + "\t" + df + "\t" + sum_df);

                        //RM3
                        if (QueryExpansion.method == QueryExpansion.RM3) {
                            //RM3 with u=0
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                            //e = ((1-spud_pi)*((double) tf / this.pdoc_lengths[i]) + spud_pi*(df / sum_df)) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.DIRQTM) {
                            //Dir Topic Model
                            e = (((double) ((1 - dir_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - dir_pi) * tf / this.pdoc_lengths[i]) + dir_pi * (cf / sum_cf))))
                                    * Math.exp(this.pdoc_scores[i]);
                        } else if ((QueryExpansion.method == QueryExpansion.SPUDQTM)
                                || (QueryExpansion.method == QueryExpansion.SPUDQTM2)) {
                            //SPUD Topic Model
                            prob = (((double) ((1 - spud_pi) * tf / this.pdoc_lengths[i])
                                    / (((1 - spud_pi) * tf / this.pdoc_lengths[i]) + spud_pi * (df / sum_df))));
                            e = prob * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM1) {
                            //Positional Relevance Model 1
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else if (QueryExpansion.method == QueryExpansion.PRM2) {
                            //Positional Relevance Model 2
                            e = ((double) tf / this.pdoc_positional_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        } else {
                            //default RM3
                            e = ((double) tf / this.pdoc_lengths[i]) * Math.exp(this.pdoc_scores[i]);
                        }

                        f = expansion_terms.get(term);
                        if (f == null) {
                            expansion_terms.put(term, e);
                        } else {
                            expansion_terms.put(term, e + f);
                        }
                    }
                }
                score_norm += Math.exp(this.pdoc_scores[i]);
                //logger.info(i + "\t" + Math.exp(this.pdoc_scores[i]));
            }
        }
    }

    Double norm = 0.0, topic_prob;
    Double topical_mass = 0.0;
    int t = 0;
    //sort
    ArrayList list = sortValue(expansion_terms);

    //create query-topic_model for QTM probability
    TreeMap<String, Double> query_topic_model = new TreeMap();
    for (int i = 0; (i < num_expansion_terms) && (i < list.size()); i++) {
        Double tsv = (double) ((Map.Entry) list.get(i)).getValue();
        String term = ((Map.Entry) list.get(i)).getKey().toString();
        topic_prob = tsv / score_norm;
        topical_mass += topic_prob;
        norm += tsv;
        t++;
        query_topic_model.put(term, topic_prob);
        //System.out.println(term + "\t" + topic_prob + "\t" + (double)((Map.Entry)list.get(i)).getValue());
    }

    /*
    if (QueryExpansion.method == QueryExpansion.SPUDQTM2) {
        Double gen = this.QueryModelLikelihood(reader, query, query_topic_model);
        logger.info("Topic score " + gen + "\t" + query.mass());
        QueryExpansion.interpolation = gen;
    }
    */

    //now just grab the selected terms and normalised to sum to 1.0
    TreeMap<String, Double> selected_terms = new TreeMap();
    double sum = 0;
    for (int i = 0; (i < t) && (i < list.size()); i++) {
        f = (double) ((Map.Entry) list.get(i)).getValue();
        ((Map.Entry) list.get(i)).setValue(f / norm);
        selected_terms.put(((Map.Entry) list.get(i)).getKey().toString(), f / norm);
        sum += f / norm;
    }

    return selected_terms;
}
From source file:searching.QueryExpansion.java
/**
 *
 * estimate the SMM using Expectation Maximization for Multinomial
 *
 * @return
 */
private Map<String, Double> SMM(IndexReader reader, int iterations) throws IOException {

    double avg_dl = 0.0;
    double mass = 0.0;
    for (int i = 0; i < this.actual_pdocs; i++) {
        mass += (double) pdoc_lengths[i];
    }

    //double lambda = 0.0;
    //get initial estimate counts
    Map<String, Double> counts = new TreeMap();
    Double f, est;
    for (int i = 0; i < this.actual_pdocs; i++) {
        if (pdocs[i] != null) {
            for (String term : this.pdocs[i].keySet()) {
                f = this.pdocs[i].get(term);
                est = counts.get(term);
                if (est == null) {
                    counts.put(term, f);
                } else {
                    counts.put(term, f + est);
                }
            }
        }
    }

    //now we have initial estimates of the maximum likelihood multinomial
    //use EM to find likelihood given the background model and fixed mixture parameter
    TreeMap<String, Double> rel_likelihoods = new TreeMap();
    Double cf, ptF, ptC, rl, co;
    Double sum_cf = (double) reader.getSumTotalTermFreq("text");
    for (int i = 0; i < iterations; i++) {

        //E-step (update relative likelihoods)
        for (String w : counts.keySet()) {
            cf = (double) reader.totalTermFreq(new Term("text", w));
            ptF = (1 - ssm_lambda) * counts.get(w) / mass;
            ptC = (ssm_lambda) * cf / sum_cf;
            rl = ptF / (ptF + ptC);
            rel_likelihoods.put(w, rl);
        }

        //M-step (recalculate max-likelihood of estimates given relative likelihoods)
        mass = 0.0;
        for (String w : counts.keySet()) {
            co = counts.get(w);
            rl = rel_likelihoods.get(w);
            mass += co * rl;
            counts.put(w, co * rl);
        }

        //logger.info("iter " + i + "\t" + mass + " ");
    }

    //normalise partial count vector by updated mass and return
    for (String w : counts.keySet()) {
        counts.put(w, counts.get(w) / mass);
    }

    return counts;
}
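Reading off the code, the estimator treats observed term counts as coming from a two-component mixture (1 - ssm_lambda) * p(w|feedback) + ssm_lambda * p(w|collection), where the collection component is totalTermFreq / getSumTotalTermFreq("text"). Each E-step computes, per term, the probability that its occurrences were generated by the feedback component (rl), and each M-step re-weights the counts by that probability before the final normalisation to a distribution.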