Example usage for org.apache.lucene.index IndexReader numDocs

Introduction

This page shows example usages of org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
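As a quick orientation before the longer examples below, here is a minimal sketch contrasting numDocs() with maxDoc() and numDeletedDocs(); the index path is a placeholder, not taken from any example on this page.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder; point it at any existing Lucene index.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));

        // numDocs() counts only live (non-deleted) documents; maxDoc() also covers
        // document ids that belong to deleted documents.
        System.out.println("numDocs:        " + reader.numDocs());
        System.out.println("maxDoc:         " + reader.maxDoc());
        System.out.println("numDeletedDocs: " + reader.numDeletedDocs());

        reader.close();
    }
}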

Usage

From source file: io.anserini.integration.IndexerTest.java

License: Apache License

@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
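The dumpPostings helper called above is defined elsewhere in IndexerTest and is not shown here. A minimal sketch of what such a helper might look like, assuming the test indexes a single "text" field and reusing the Lucene classes already imported by the test; the body is an illustration, not the Anserini implementation.

// Hypothetical postings dump: prints every term of the "text" field with its (docId, tf) postings.
private void dumpPostings(IndexReader reader) throws IOException {
    Terms terms = MultiFields.getFields(reader).terms("text");
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        System.out.print(text.utf8ToString() + ":");
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
        int docId;
        while ((docId = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(" (" + docId + ", tf=" + postings.freq() + ")");
        }
        System.out.println();
    }
}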

From source file: io.anserini.rerank.lib.AxiomReranker.java

License: Apache License

/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 * is calculated as
 * <pre>
 * P(both occurs)*log{P(both occurs)/P(t1 occurs)/P(t2 occurs)}
 * + P(both not occurs)*log{P(both not occurs)/P(t1 not occurs)/P(t2 not occurs)}
 * + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 * + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 * PriorityQueue, only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A map from each term to the set of docIds in which the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist, is not a directory, or is not readable.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    // Use numDocs() when available; the -1 guard falls back to maxDoc().
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    //calculate the Mutual Information between term with each query term
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }

    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the queue stores every term
    // in the reranking pool together with its score with respect to that query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) { // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }

    return resultTermScores;
}
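The computeMutualInformation helper used above is not shown in this excerpt. Going only by the formula in the Javadoc, a hedged sketch of such a helper might look like the following; the method name matches the call sites above, but the body is an illustration rather than the Anserini implementation.

// Sketch only: estimates the mutual information between two terms from their document-id
// sets over a pool of docIdsCount documents, following the formula in the Javadoc.
private double computeMutualInformation(Set<Integer> docIdsX, Set<Integer> docIdsY, int docIdsCount) {
    Set<Integer> both = new HashSet<>(docIdsX);
    both.retainAll(docIdsY);

    double n = docIdsCount;
    double pX = docIdsX.size() / n;          // P(t1 occurs)
    double pY = docIdsY.size() / n;          // P(t2 occurs)
    double pXY = both.size() / n;            // P(both occur)
    double pXnotY = pX - pXY;                // P(t1 occurs, t2 does not)
    double pNotXY = pY - pXY;                // P(t1 does not occur, t2 does)
    double pNotXNotY = 1.0 - pX - pY + pXY;  // P(neither occurs)

    return miTerm(pXY, pX, pY)
         + miTerm(pXnotY, pX, 1.0 - pY)
         + miTerm(pNotXY, 1.0 - pX, pY)
         + miTerm(pNotXNotY, 1.0 - pX, 1.0 - pY);
}

// One summand of the MI formula; contributes 0 when any probability is 0.
private double miTerm(double pJoint, double pA, double pB) {
    if (pJoint <= 0.0 || pA <= 0.0 || pB <= 0.0) {
        return 0.0;
    }
    return pJoint * Math.log(pJoint / (pA * pB));
}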

From source file: io.anserini.rerank.lib.Rm3Reranker.java

License: Apache License

private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();

    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();

        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See following PR for details:
            //   https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: If a term appears in more than n percent
            // of the documents, then it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection sorted by df: For the Tweets2011 collection,
            // I found a threshold close to a nice round number that approximated the length of the
            // current stopwords list, by eyeballing the df values. This turned out to be 1%. I did
            // this again for the Tweets2013 collection, using the same approach, and obtained a value
            // of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // Probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f)
                continue;

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return f;
    }

    return f;
}

From source file: io.anserini.util.ExtractTopDfTerms.java

License: Apache License

public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else
                return (p1.value < p2.value) ? -1 : 1;
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}
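The Pair and Args types used above are defined elsewhere in ExtractTopDfTerms and are not part of this excerpt. A minimal reconstruction, going only by how they are used here; the field names come from the code above, while the args4j option names and defaults are assumptions.

// Hypothetical reconstruction of the helper types; requires org.kohsuke.args4j.Option,
// which the main method's CmdLineParser already implies.
private static class Pair {
    public String key;
    public int value;

    public Pair(String key, int value) {
        this.key = key;
        this.value = value;
    }
}

private static class Args {
    @Option(name = "-index", usage = "index path", required = true)
    public String index;

    @Option(name = "-field", usage = "field to scan", required = true)
    public String field;

    @Option(name = "-k", usage = "number of top-df terms to keep")
    public int topK = 100;

    @Option(name = "-output", usage = "output file", required = true)
    public String output;
}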

From source file: io.datalayer.lucene.index.LuceneLifecycleTest.java

License: Apache License

@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc());
    assertEquals(ids.length, reader.numDocs());
    reader.close();
}
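The test above passes because maxDoc() and numDocs() agree as long as nothing has been deleted from the index. A minimal sketch of how the two counts diverge after a deletion, reusing the test's directory and ids fields; the analyzer choice and the "id" field/value are assumptions.

// Sketch: delete one document, reopen the reader, and compare the two counts.
IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
IndexWriter writer = new IndexWriter(directory, config);
writer.deleteDocuments(new Term("id", "1")); // assumes an "id" field containing "1"
writer.commit();
writer.close();

IndexReader reader = DirectoryReader.open(directory);
// Until a merge reclaims the deletion, maxDoc() still includes the deleted slot.
assertEquals(ids.length, reader.maxDoc());
assertEquals(ids.length - 1, reader.numDocs());
reader.close();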

From source file: io.datalayer.lucene.read.LuceneReaderTest.java

License: Apache License

@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(keywords.length, reader.maxDoc());
    assertEquals(keywords.length, reader.numDocs());
    reader.close();
}

From source file: irlucene.CFCRetrieval.java

public ScoreDoc[] query(QueryData queryData, float titleBoost) {
    HashMap<String, Float> boosts;
    MultiFieldQueryParser queryParser;
    Query q;
    IndexReader indexReader;
    IndexSearcher indexSearcher;
    TopDocs docs;
    ScoreDoc[] hits = null;
    try {
        boosts = new HashMap<>();
        if (titleBoost != 0) {
            boosts.put("title", titleBoost);
        }
        queryParser = new MultiFieldQueryParser(
                new String[] { "paperNumber", "recordNumber", "acessionNumber", "authors", "title", "source",
                        "majorSubjects", "minorSubjects", "abstractExtract", "references", "citations" },
                analyzer, boosts);
        q = queryParser.parse(queryData.getQuery());
        indexReader = DirectoryReader.open(index);
        indexSearcher = new IndexSearcher(indexReader);
        docs = indexSearcher.search(q, indexReader.numDocs());

        hits = docs.scoreDocs;
        indexReader.close();
    } catch (ParseException | IOException ex) {
        Logger.getLogger(CFCRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return hits;
}
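Passing indexReader.numDocs() as the second argument to search() simply requests as many hits as there are documents, i.e., effectively all matches; that is fine for a small test collection like CFC but wasteful on large indexes. A minimal alternative sketch that first counts the matches with org.apache.lucene.search.TotalHitCountCollector and then fetches exactly that many, assuming the same q and indexSearcher as above:

// Count the matches, then run a bounded search for exactly that many hits.
TotalHitCountCollector counter = new TotalHitCountCollector();
indexSearcher.search(q, counter);
int totalHits = counter.getTotalHits();
TopDocs topDocs = indexSearcher.search(q, Math.max(1, totalHits)); // search() requires n >= 1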

From source file: irlucene.CFCRetrieval.java

public int numDocs() {
    int numDocs = 0;
    try {
        IndexReader indexReader = DirectoryReader.open(index);
        numDocs = indexReader.numDocs();
        indexReader.close(); // close the reader to avoid leaking file handles
    } catch (IOException ex) {
        Logger.getLogger(CFCRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return numDocs;
}

From source file: irlucene.MEDRetrieval.java

public ScoreDoc[] query(QueryData queryData) {
    HashMap<String, Float> boosts;
    MultiFieldQueryParser queryParser;
    Query q;
    IndexReader indexReader;
    IndexSearcher indexSearcher;
    TopDocs docs;
    ScoreDoc[] hits = null;
    try {
        boosts = new HashMap<>();
        queryParser = new MultiFieldQueryParser(new String[] { "id", "content" }, analyzer, boosts);
        q = queryParser.parse(queryData.getQuery());
        indexReader = DirectoryReader.open(index);
        indexSearcher = new IndexSearcher(indexReader);
        docs = indexSearcher.search(q, indexReader.numDocs());

        hits = docs.scoreDocs;
        indexReader.close();
    } catch (ParseException | IOException ex) {
        Logger.getLogger(MEDRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return hits;
}

From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java

License: Apache License

/**
 * Loads the map containing the conversion from the Wikipedia ids to the
 * Lucene Ids.
 */
protected void parseWikiIdToLuceneId() {
    logger.warn("no wikiId -> luceneId index found - generating it");
    IndexReader reader = getReader();
    wikiIdToLuceneId = new HashMap<Integer, Integer>(reader.numDocs());
    ProgressLogger pl = new ProgressLogger("creating wiki2lucene, read {} docs", 100000);
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        pl.up();
        try {
            Document doc = reader.document(i);
            IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
            Integer wikiId = Integer.valueOf(f.stringValue());
            wikiIdToLuceneId.put(wikiId, i);
        } catch (CorruptIndexException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

    }

}
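One caveat about the loop above: it walks document ids from 0 to numDocs() - 1, which only visits every document when the index contains no deletions. A hedged sketch of a deletion-aware variant of the same loop, using the MultiFields API that already appears elsewhere on this page (org.apache.lucene.util.Bits import, progress logging, and exception handling omitted for brevity):

// Deletion-aware variant: iterate up to maxDoc() and skip documents flagged as deleted.
Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index contains no deletions
for (int i = 0; i < reader.maxDoc(); i++) {
    if (liveDocs != null && !liveDocs.get(i)) {
        continue; // skip deleted document
    }
    Document doc = reader.document(i);
    IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
    wikiIdToLuceneId.put(Integer.valueOf(f.stringValue()), i);
}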