List of usage examples for org.apache.lucene.index IndexReader docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term. The field/text pair to look up; docFreq returns the number of documents containing this term.
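Before the project examples below, here is a minimal, self-contained sketch of calling docFreq; the index path and the "contents" field name are placeholders, not taken from any of the sources that follow:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Open an existing index ("/path/to/index" is a placeholder).
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // docFreq returns the number of documents that contain the term
            // in the given field (a hypothetical "contents" field here).
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("df(contents:lucene) = " + df);
        }
    }
}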
From source file:game.TermFreq.java
void loadTfVec() throws Exception {
    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);
    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null);
        // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(
                    new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);
            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by doc length
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }
    Collections.sort(tfvec);
}
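Design note: the loop accumulates doclen from the term-vector frequencies and normalizes each tf by it; the final weight log(1 + LAMBDA/(1 - LAMBDA) * tf * idf) is a Jelinek-Mercer-style language-model term weight, with sumDf / docFreq standing in for the idf-like collection statistic. Note that sumDf / reader.docFreq(...) is integral division here, so the quotient is truncated before being assigned to float.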
From source file:indexer.Cell.java
boolean toSplit(IndexReader reader) throws Exception {
    Cell parentCell = getCellIdOfParentCell();
    int df = 0;
    int numDocs = 0;
    Term parentCellTerm = new Term(DocVector.FIELD_CELL_ID, parentCell.toString());
    Term thisCellTerm = new Term(DocVector.FIELD_CELL_ID, this.toString());

    // Find the number of cells in this strip, e.g.
    // a. if the current cell is 5_2,
    numDocs = parentCell.validCell() ? reader.docFreq(parentCellTerm) : reader.numDocs();
    df = reader.docFreq(thisCellTerm);
    int uniformCount = numDocs / DocVector.numIntervals;
    return df > uniformCount;
}
From source file:indexer.Retriever.java
private String getIDF(IndexReader reader, String word) throws IOException {
    ClassicSimilarity similarity = new ClassicSimilarity();
    int documentsFreq = 0;
    float idf = 0;

    Term term = new Term(documentField, word);
    int _documentsFreq = reader.docFreq(term);
    int documentsCount = reader.getDocCount(documentField);
    idf += similarity.idf(_documentsFreq, documentsCount);
    documentsFreq += _documentsFreq;

    String printString = word + ": " + idf + " (in " + documentsFreq + " documents)";
    return printString;
}
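For reference, ClassicSimilarity.idf(docFreq, docCount) in Lucene is computed as 1 + log((docCount + 1) / (docFreq + 1)), so the rarer the term (the smaller the value returned by reader.docFreq), the higher the idf.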
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
private void dumpPostings(IndexReader reader) throws IOException {
    // This is how you iterate through terms in the postings list.
    LeafReader leafReader = reader.leaves().get(0).reader();
    TermsEnum termsEnum = leafReader.terms("text").iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        // This is the current term in the dictionary.
        String token = bytesRef.utf8ToString();
        Term term = new Term("text", token);
        System.out.print(token + " (df = " + reader.docFreq(term) + "):");

        PostingsEnum postingsEnum = leafReader.postings(term);
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(String.format(" (%s, %s)", postingsEnum.docID(), postingsEnum.freq()));
        }
        System.out.println("");

        bytesRef = termsEnum.next();
    }
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testReadingPostings() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 *    is calculated as
 * <pre>
 * P(both occur)*log{P(both occur)/P(t1 occurs)/P(t2 occurs)}
 *  + P(both not occur)*log{P(both not occur)/P(t1 not occurs)/P(t2 not occurs)}
 *  + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 *  + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term, the scores of every other term in the reranking pool are stored in a
 *    PriorityQueue; only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList a map from term to the set of docIds in which the term occurs
 * @param context an instance of RerankerContext
 * @return a map from the top terms to their weight scores
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // Get collection statistics so that we can compute idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    // Calculate the mutual information between each pool term and each query term.
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }
    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the p-queue itself stores all terms
    // in the reranking pool and their reranking scores to the query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) {
                    // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }

    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }
    return resultTermScores;
}
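The computeMutualInformation helper called above is not part of this snippet. Below is a hypothetical sketch, inferred from the call sites and the four-part formula in the Javadoc; the probabilities are estimated from the two terms' posting sets over the reranking pool (assumes java.util.Set and java.util.HashSet):

private double computeMutualInformation(Set<Integer> docs1, Set<Integer> docs2, int totalDocs) {
    // Intersection: documents in which both terms occur.
    Set<Integer> both = new HashSet<>(docs1);
    both.retainAll(docs2);

    double n = totalDocs;
    double p1 = docs1.size() / n;        // P(t1 occurs)
    double p2 = docs2.size() / n;        // P(t2 occurs)
    double p11 = both.size() / n;        // P(both occur)
    double p10 = p1 - p11;               // P(t1 occurs, t2 does not)
    double p01 = p2 - p11;               // P(t1 does not, t2 occurs)
    double p00 = 1.0 - p11 - p10 - p01;  // P(neither occurs)

    // Sum only the terms whose joint probability is positive (0 * log 0 := 0).
    double mi = 0.0;
    if (p11 > 0) mi += p11 * Math.log(p11 / (p1 * p2));
    if (p10 > 0 && p2 < 1) mi += p10 * Math.log(p10 / (p1 * (1 - p2)));
    if (p01 > 0 && p1 < 1) mi += p01 * Math.log(p01 / ((1 - p1) * p2));
    if (p00 > 0 && p1 < 1 && p2 < 1) mi += p00 * Math.log(p00 / ((1 - p1) * (1 - p2)));
    return mi;
}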
From source file:io.anserini.rerank.lib.Rm3Reranker.java
License:Apache License
private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();
    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See following PR for details:
            //   https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: If a term appears in more than n percent
            // of the documents, then it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection by df: For the Tweets2011
            // collection, I found a threshold close to a nice round number that approximated the
            // length of the current stopwords list, by eyeballing the df values. This turned out
            // to be 1%. I did this again for the Tweets2013 collection, using the same approach,
            // and obtained a value of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // Probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f)
                continue;

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return f;
    }

    return f;
}
From source file:io.anserini.util.ExtractTopDfTerms.java
License:Apache License
public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else {
                return (p1.value < p2.value) ? -1 : 1;
            }
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}
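Design note: the comparator orders pairs by ascending df (ties broken by term), so the PriorityQueue acts as a bounded min-heap. Once it holds topK entries, a newly seen term evicts the current minimum only if its df is larger, which keeps memory at O(topK) while the loop scans the entire term dictionary once.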
From source file:IR.LuceneModel.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneModel indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneModel(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector; // = TopScoreDocCollector.create(100, true);

    s = "";
    ScoreDoc[] hits;
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            File queryFile = new File(s);
            BufferedReader r = new BufferedReader(new FileReader(queryFile));
            String query; // = r.readLine();
            int count = 0;
            String q1 = "LuceneResults.txt";
            File luceneFile = new File(q1);
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);

            while ((query = r.readLine()) != null) {
                try {
                    count++;
                    collector = TopScoreDocCollector.create(100, true);
                    QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", analyzer);
                    Query q = parser.parse(query.replace('/', ' '));
                    searcher.search(q, collector);
                    hits = collector.topDocs().scoreDocs;

                    int query_id;
                    query_id = count; // change this for new query
                    System.out.println("Found " + hits.length + " hits.");
                    for (int i = 0; i < hits.length; ++i) {
                        int docId = hits[i].doc;
                        Document d = searcher.doc(docId);
                        System.out.println(query_id + ". " + d.get("path").replaceAll(".html", "") + " "
                                + (i + 1) + " " + hits[i].score + " LuceneModel");
                        writer.write(String.format(query_id + " " + "Q0" + " "
                                + d.get("path").replaceAll(".html", "") + " " + (i + 1) + " "
                                + hits[i].score + " LuceneModel\n"));
                        writer.flush();
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                    continue;
                }

                // 5. term stats --> watch out for which "version" of the term
                // must be checked here instead!
                Term termInstance = new Term("contents", s);
                long termFreq = reader.totalTermFreq(termInstance);
                long docCount = reader.docFreq(termInstance);
                System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
            }
            r.close();
            writer.close();
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
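Note the distinction in the term-stats step above: reader.totalTermFreq(term) counts every occurrence of the term across all documents, while reader.docFreq(term) counts only the number of documents that contain it at least once.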