List of usage examples for org.apache.lucene.index IndexReader numDocs
public abstract int numDocs();
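numDocs() returns the number of live (non-deleted) documents, whereas maxDoc() also counts documents that were deleted but not yet merged away. A minimal sketch of the distinction, assuming an existing index at a placeholder path:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder for any existing Lucene index directory.
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Live (non-deleted) documents only.
            System.out.println("numDocs = " + reader.numDocs());
            // One greater than the largest document number, including deleted docs not yet merged away.
            System.out.println("maxDoc  = " + reader.maxDoc());
            // Deleted documents still physically present in the index.
            System.out.println("deleted = " + reader.numDeletedDocs());
        }
    }
}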
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test public void testCloneIndex() throws Exception { System.out.println("Cloning index:"); Directory dir1 = FSDirectory.open(tempDir1); IndexReader reader = DirectoryReader.open(dir1); Directory dir2 = FSDirectory.open(tempDir2); IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer()); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir2, config); LeafReader leafReader = reader.leaves().get(0).reader(); CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader); writer.addIndexes(new MyFilterCodecReader(codecReader)); writer.commit();//from ww w. ja v a 2s.c o m writer.forceMerge(1); writer.close(); reader.close(); // Open up the cloned index and verify it. reader = DirectoryReader.open(dir2); assertEquals(3, reader.numDocs()); assertEquals(1, reader.leaves().size()); System.out.println("Dumping out postings..."); dumpPostings(reader); assertEquals(2, reader.docFreq(new Term("text", "here"))); assertEquals(2, reader.docFreq(new Term("text", "more"))); assertEquals(1, reader.docFreq(new Term("text", "some"))); assertEquals(1, reader.docFreq(new Term("text", "test"))); assertEquals(2, reader.docFreq(new Term("text", "text"))); reader.close(); }
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 *    is calculated as
 * <pre>
 *   P(both occurs)*log{P(both occurs)/P(t1 occurs)/P(t2 occurs)}
 *   + P(both not occurs)*log{P(both not occurs)/P(t1 not occurs)/P(t2 not occurs)}
 *   + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 *   + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 *    PriorityQueue; only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A Map of <term -> Set<docId>> where the Set of docIds is where the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // Get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    // Calculate the mutual information between each term and each query term.
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }
    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the p-queue itself stores all terms
    // in the reranking pool and their reranking scores to the query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) {
                    // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }
    return resultTermScores;
}
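The computeMutualInformation helper is not shown on this page. Below is a minimal, hypothetical sketch of the summation quoted in the javadoc above (each summand is P(x,y)*log{P(x,y)/P(x)/P(y)}), where the two sets hold the docids in the reranking pool that contain each term; Anserini's actual implementation may differ in smoothing and edge-case handling.

// Hypothetical sketch, not Anserini's code. Assumes java.util.Set/HashSet.
private double computeMutualInformation(Set<Integer> docsWithT1, Set<Integer> docsWithT2, int poolSize) {
    Set<Integer> both = new HashSet<>(docsWithT1);
    both.retainAll(docsWithT2);

    double n = poolSize;
    double p1 = docsWithT1.size() / n;   // P(t1 occurs)
    double p2 = docsWithT2.size() / n;   // P(t2 occurs)
    double p11 = both.size() / n;        // P(both occur)
    double p10 = p1 - p11;               // P(t1 occurs, t2 does not)
    double p01 = p2 - p11;               // P(t2 occurs, t1 does not)
    double p00 = 1.0 - p1 - p2 + p11;    // P(neither occurs)

    return miComponent(p11, p1, p2)
            + miComponent(p00, 1 - p1, 1 - p2)
            + miComponent(p10, p1, 1 - p2)
            + miComponent(p01, 1 - p1, p2);
}

// One summand P(x,y) * log(P(x,y) / (P(x) * P(y))), guarding against log(0).
private double miComponent(double pxy, double px, double py) {
    if (pxy <= 0 || px <= 0 || py <= 0) {
        return 0.0;
    }
    return pxy * Math.log(pxy / (px * py));
}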
From source file:io.anserini.rerank.lib.Rm3Reranker.java
License:Apache License
private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();
    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See following PR for details:
            // https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: If a term appears in more than n percent
            // of the documents, then it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection by df: For the Tweets2011 collection,
            // I found a threshold close to a nice round number that approximated the length of the
            // current stopwords list, by eyeballing the df values. This turned out to be 1%. I did
            // this again for the Tweets2013 collection, using the same approach, and obtained a value
            // of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // Probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f)
                continue;

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return f;
    }
    return f;
}
From source file:io.anserini.util.ExtractTopDfTerms.java
License:Apache License
public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else
                return (p1.value < p2.value) ? -1 : 1;
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}
From source file:io.datalayer.lucene.index.LuceneLifecycleTest.java
License:Apache License
@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc());
    assertEquals(ids.length, reader.numDocs());
    reader.close();
}
From source file:io.datalayer.lucene.read.LuceneReaderTest.java
License:Apache License
@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(keywords.length, reader.maxDoc());
    assertEquals(keywords.length, reader.numDocs());
    reader.close();
}
From source file:irlucene.CFCRetrieval.java
public ScoreDoc[] query(QueryData queryData, float titleBoost) {
    HashMap<String, Float> boosts;
    MultiFieldQueryParser queryParser;
    Query q;
    IndexReader indexReader;
    IndexSearcher indexSearcher;
    TopDocs docs;
    ScoreDoc[] hits = null;
    try {
        boosts = new HashMap<>();
        if (titleBoost != 0) {
            boosts.put("title", titleBoost);
        }
        queryParser = new MultiFieldQueryParser(
                new String[] { "paperNumber", "recordNumber", "acessionNumber", "authors", "title",
                        "source", "majorSubjects", "minorSubjects", "abstractExtract", "references",
                        "citations" },
                analyzer, boosts);
        q = queryParser.parse(queryData.getQuery());
        indexReader = DirectoryReader.open(index);
        indexSearcher = new IndexSearcher(indexReader);
        docs = indexSearcher.search(q, indexReader.numDocs());
        hits = docs.scoreDocs;
        indexReader.close();
    } catch (ParseException | IOException ex) {
        Logger.getLogger(CFCRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return hits;
}
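Note the design choice of passing indexReader.numDocs() as the result count to IndexSearcher.search: since an index can never return more hits than it has live documents, this effectively asks Lucene for every matching document rather than a fixed top-k page of results.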
From source file:irlucene.CFCRetrieval.java
public int numDocs() {
    int numDocs = 0;
    try {
        IndexReader indexReader = DirectoryReader.open(index);
        numDocs = indexReader.numDocs();
    } catch (IOException ex) {
        Logger.getLogger(CFCRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return numDocs;
}
From source file:irlucene.MEDRetrieval.java
public ScoreDoc[] query(QueryData queryData) {
    HashMap<String, Float> boosts;
    MultiFieldQueryParser queryParser;
    Query q;
    IndexReader indexReader;
    IndexSearcher indexSearcher;
    TopDocs docs;
    ScoreDoc[] hits = null;
    try {
        boosts = new HashMap<>();
        queryParser = new MultiFieldQueryParser(new String[] { "id", "content" }, analyzer, boosts);
        q = queryParser.parse(queryData.getQuery());
        indexReader = DirectoryReader.open(index);
        indexSearcher = new IndexSearcher(indexReader);
        docs = indexSearcher.search(q, indexReader.numDocs());
        hits = docs.scoreDocs;
        indexReader.close();
    } catch (ParseException | IOException ex) {
        Logger.getLogger(CFCRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return hits;
}
From source file:it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java
License:Apache License
/**
 * Loads the map containing the conversion from the Wikipedia ids to the
 * Lucene ids.
 */
protected void parseWikiIdToLuceneId() {
    logger.warn("no index wikiID -> lucene found - I'll generate");
    IndexReader reader = getReader();
    wikiIdToLuceneId = new HashMap<Integer, Integer>(reader.numDocs());
    ProgressLogger pl = new ProgressLogger("creating wiki2lucene, readed {} docs", 100000);
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        pl.up();
        try {
            Document doc = reader.document(i);
            IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
            Integer wikiId = new Integer(f.stringValue());
            wikiIdToLuceneId.put(wikiId, i);
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
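One caveat about the loop above: it uses numDocs() as the upper bound for document numbers, but when an index contains deletions the document numbers run up to maxDoc() and some of them refer to deleted documents. A deletion-aware variant of the loop body might look like the following sketch (not part of the original LuceneHelper; IOException handling omitted, and in newer Lucene versions MultiBits.getLiveDocs replaces MultiFields.getLiveDocs):

// Sketch only: iterate all doc numbers up to maxDoc() and skip deleted ones.
Bits liveDocs = MultiFields.getLiveDocs(reader);
for (int i = 0; i < reader.maxDoc(); i++) {
    if (liveDocs != null && !liveDocs.get(i)) {
        continue; // skip deleted documents
    }
    Document doc = reader.document(i);
    IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
    wikiIdToLuceneId.put(Integer.valueOf(f.stringValue()), i);
}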