Example usage for org.apache.lucene.index IndexReader numDocs

Introduction

This page shows example usages of org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
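As a quick orientation before the longer examples below, here is a minimal sketch contrasting numDocs() with maxDoc() and numDeletedDocs(); the index path is a placeholder, not taken from any example on this page.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder; point it at any existing Lucene index.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));

        // numDocs() counts only live (non-deleted) documents; maxDoc() also covers
        // document ids that belong to deleted documents.
        System.out.println("numDocs:        " + reader.numDocs());
        System.out.println("maxDoc:         " + reader.maxDoc());
        System.out.println("numDeletedDocs: " + reader.numDeletedDocs());

        reader.close();
    }
}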

Usage

From source file: io.anserini.integration.IndexerTest.java

License: Apache License

@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}
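The dumpPostings helper called above is defined elsewhere in IndexerTest and is not shown here. A minimal sketch of what such a helper might look like, assuming the test indexes a single "text" field and reusing the Lucene classes already imported by the test; the body is an illustration, not the Anserini implementation.

// Hypothetical postings dump: prints every term of the "text" field with its (docId, tf) postings.
private void dumpPostings(IndexReader reader) throws IOException {
    Terms terms = MultiFields.getFields(reader).terms("text");
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        System.out.print(text.utf8ToString() + ":");
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
        int docId;
        while ((docId = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(" (" + docId + ", tf=" + postings.freq() + ")");
        }
        System.out.println();
    }
}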

From source file: io.anserini.rerank.lib.AxiomReranker.java

License: Apache License

/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 * is calculated as
 * <pre>
 * P(both occurs)*log{P(both occurs)/P(t1 occurs)/P(t2 occurs)}
 * + P(both not occurs)*log{P(both not occurs)/P(t1 not occurs)/P(t2 not occurs)}
 * + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 * + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 * PriorityQueue, only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A map from each term to the set of docIds in which the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist, is not a directory, or is not readable.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    // Use numDocs() when available; the -1 guard falls back to maxDoc().
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    //calculate the Mutual Information between term with each query term
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }

    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the queue stores every term
    // in the reranking pool together with its score with respect to that query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) { // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }

    return resultTermScores;
}
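The computeMutualInformation helper used above is not shown in this excerpt. Going only by the formula in the Javadoc, a hedged sketch of such a helper might look like the following; the method name matches the call sites above, but the body is an illustration rather than the Anserini implementation.

// Sketch only: estimates the mutual information between two terms from their document-id
// sets over a pool of docIdsCount documents, following the formula in the Javadoc.
private double computeMutualInformation(Set<Integer> docIdsX, Set<Integer> docIdsY, int docIdsCount) {
    Set<Integer> both = new HashSet<>(docIdsX);
    both.retainAll(docIdsY);

    double n = docIdsCount;
    double pX = docIdsX.size() / n;          // P(t1 occurs)
    double pY = docIdsY.size() / n;          // P(t2 occurs)
    double pXY = both.size() / n;            // P(both occur)
    double pXnotY = pX - pXY;                // P(t1 occurs, t2 does not)
    double pNotXY = pY - pXY;                // P(t1 does not occur, t2 does)
    double pNotXNotY = 1.0 - pX - pY + pXY;  // P(neither occurs)

    return miTerm(pXY, pX, pY)
         + miTerm(pXnotY, pX, 1.0 - pY)
         + miTerm(pNotXY, 1.0 - pX, pY)
         + miTerm(pNotXNotY, 1.0 - pX, 1.0 - pY);
}

// One summand of the MI formula; contributes 0 when any probability is 0.
private double miTerm(double pJoint, double pA, double pB) {
    if (pJoint <= 0.0 || pA <= 0.0 || pB <= 0.0) {
        return 0.0;
    }
    return pJoint * Math.log(pJoint / (pA * pB));
}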

From source file: io.anserini.rerank.lib.Rm3Reranker.java

License: Apache License

private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();

    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();

        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See following PR for details:
            //   https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: If a term appears in more than n percent
            // of the documents, then it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection sorted by df: For the Tweets2011 collection,
            // I found a threshold close to a nice round number that approximated the length of the
            // current stopwords list, by eyeballing the df values. This turned out to be 1%. I did
            // this again for the Tweets2013 collection, using the same approach, and obtained a value
            // of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // Probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f)
                continue;

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return f;
    }

    return f;
}

From source file: io.anserini.util.ExtractTopDfTerms.java

License: Apache License

public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else
                return (p1.value < p2.value) ? -1 : 1;
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}
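The Pair and Args types used above are defined elsewhere in ExtractTopDfTerms and are not part of this excerpt. A minimal reconstruction, going only by how they are used here; the field names come from the code above, while the args4j option names and defaults are assumptions.

// Hypothetical reconstruction of the helper types; requires org.kohsuke.args4j.Option,
// which the main method's CmdLineParser already implies.
private static class Pair {
    public String key;
    public int value;

    public Pair(String key, int value) {
        this.key = key;
        this.value = value;
    }
}

private static class Args {
    @Option(name = "-index", usage = "index path", required = true)
    public String index;

    @Option(name = "-field", usage = "field to scan", required = true)
    public String field;

    @Option(name = "-k", usage = "number of top-df terms to keep")
    public int topK = 100;

    @Option(name = "-output", usage = "output file", required = true)
    public String output;
}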

From source file: io.datalayer.lucene.index.LuceneLifecycleTest.java

License: Apache License

@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc());
    assertEquals(ids.length, reader.numDocs());
    reader.close();
}
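The test above passes because maxDoc() and numDocs() agree as long as nothing has been deleted from the index. A minimal sketch of how the two counts diverge after a deletion, reusing the test's directory and ids fields; the analyzer choice and the "id" field/value are assumptions.

// Sketch: delete one document, reopen the reader, and compare the two counts.
IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
IndexWriter writer = new IndexWriter(directory, config);
writer.deleteDocuments(new Term("id", "1")); // assumes an "id" field containing "1"
writer.commit();
writer.close();

IndexReader reader = DirectoryReader.open(directory);
// Until a merge reclaims the deletion, maxDoc() still includes the deleted slot.
assertEquals(ids.length, reader.maxDoc());
assertEquals(ids.length - 1, reader.numDocs());
reader.close();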

From source file: io.datalayer.lucene.read.LuceneReaderTest.java

License: Apache License

@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(keywords.length, reader.maxDoc());
    assertEquals(keywords.length, reader.numDocs());
    reader.close();
}

From source file: irlucene.CFCRetrieval.java

public ScoreDoc[] query(QueryData queryData, float titleBoost) {
    HashMap<String, Float> boosts;
    MultiFieldQueryParser queryParser;
    Query q;
    IndexReader indexReader;
    IndexSearcher indexSearcher;
    TopDocs docs;
    ScoreDoc[] hits = null;
    try {
        boosts = new HashMap<>();
        if (titleBoost != 0) {
            boosts.put("title", titleBoost);
        }
        queryParser = new MultiFieldQueryParser(
                new String[] { "paperNumber", "recordNumber", "acessionNumber", "authors", "title", "source",
                        "majorSubjects", "minorSubjects", "abstractExtract", "references", "citations" },
                analyzer, boosts);
        q = queryParser.parse(queryData.getQuery());
        indexReader = DirectoryReader.open(index);
        indexSearcher = new IndexSearcher(indexReader);
        docs = indexSearcher.search(q, indexReader.numDocs());

        hits = docs.scoreDocs;
        indexReader.close();
    } catch (ParseException | IOException ex) {
        Logger.getLogger(CFCRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return hits;
}
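Passing indexReader.numDocs() as the second argument to search() simply requests as many hits as there are documents, i.e., effectively all matches; that is fine for a small test collection like CFC but wasteful on large indexes. A minimal alternative sketch that first counts the matches with org.apache.lucene.search.TotalHitCountCollector and then fetches exactly that many, assuming the same q and indexSearcher as above:

// Count the matches, then run a bounded search for exactly that many hits.
TotalHitCountCollector counter = new TotalHitCountCollector();
indexSearcher.search(q, counter);
int totalHits = counter.getTotalHits();
TopDocs topDocs = indexSearcher.search(q, Math.max(1, totalHits)); // search() requires n >= 1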

From source file: irlucene.CFCRetrieval.java

public int numDocs() {
    int numDocs = 0;
    try {
        IndexReader indexReader = DirectoryReader.open(index);
        numDocs = indexReader.numDocs();
        indexReader.close(); // close the reader to avoid leaking file handles
    } catch (IOException ex) {
        Logger.getLogger(CFCRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return numDocs;
}

From source file: irlucene.MEDRetrieval.java

public ScoreDoc[] query(QueryData queryData) {
    HashMap<String, Float> boosts;
    MultiFieldQueryParser queryParser;
    Query q;
    IndexReader indexReader;
    IndexSearcher indexSearcher;
    TopDocs docs;
    ScoreDoc[] hits = null;
    try {
        boosts = new HashMap<>();
        queryParser = new MultiFieldQueryParser(new String[] { "id", "content" }, analyzer, boosts);
        q = queryParser.parse(queryData.getQuery());
        indexReader = DirectoryReader.open(index);
        indexSearcher = new IndexSearcher(indexReader);
        docs = indexSearcher.search(q, indexReader.numDocs());

        hits = docs.scoreDocs;
        indexReader.close();
    } catch (ParseException | IOException ex) {
        Logger.getLogger(MEDRetrieval.class.getName()).log(Level.SEVERE, null, ex);
    }
    return hits;
}

From source file: it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java

License: Apache License

/**
 * Loads the map containing the conversion from the Wikipedia ids to the
 * Lucene Ids.
 */
protected void parseWikiIdToLuceneId() {
    logger.warn("no wikiId -> luceneId index found - generating it");
    IndexReader reader = getReader();
    wikiIdToLuceneId = new HashMap<Integer, Integer>(reader.numDocs());
    ProgressLogger pl = new ProgressLogger("creating wiki2lucene, read {} docs", 100000);
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        pl.up();
        try {
            Document doc = reader.document(i);
            IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
            Integer wikiId = Integer.valueOf(f.stringValue());
            wikiIdToLuceneId.put(wikiId, i);
        } catch (CorruptIndexException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

    }

}
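One caveat about the loop above: it walks document ids from 0 to numDocs() - 1, which only visits every document when the index contains no deletions. A hedged sketch of a deletion-aware variant of the same loop, using the MultiFields API that already appears elsewhere on this page (org.apache.lucene.util.Bits import, progress logging, and exception handling omitted for brevity):

// Deletion-aware variant: iterate up to maxDoc() and skip documents flagged as deleted.
Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index contains no deletions
for (int i = 0; i < reader.maxDoc(); i++) {
    if (liveDocs != null && !liveDocs.get(i)) {
        continue; // skip deleted document
    }
    Document doc = reader.document(i);
    IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
    wikiIdToLuceneId.put(Integer.valueOf(f.stringValue()), i);
}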