Example usage for org.apache.lucene.index IndexReader maxDoc

List of usage examples for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usage of org.apache.lucene.index IndexReader maxDoc.

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
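
The return value counts every assigned document number, including deleted ones, so it is normally used as an exclusive upper bound when iterating over document IDs, with a live-docs check to skip deletions. A minimal sketch of that pattern, assuming a Lucene 4.x-style API like the examples below (the RAMDirectory setup and the "id" field are only illustrative):

    public static void maxDocExample() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46));
        IndexWriter writer = new IndexWriter(dir, config);
        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();

        DirectoryReader reader = DirectoryReader.open(dir);
        // maxDoc() is one greater than the largest document number and includes deleted slots;
        // numDocs() counts only live documents.
        Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (liveDocs == null || liveDocs.get(i)) {
                String id = reader.document(i).get("id"); // safe: document i is live
            }
        }
        reader.close();
    }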

Usage

From source file:org.apache.jena.larq.TestLuceneNRT.java

License:Apache License

@Test
public void indexReaderOpenDirectoryNewInstance() throws Exception {
    IndexReader reader = IndexReader.open(directory);
    writer.addDocument(doc1);
    writer.commit();

    assertFalse(reader.isCurrent());
    assertEquals(0, reader.maxDoc());
    reader = IndexReader.open(directory);
    assertTrue(reader.isCurrent());
    assertEquals(1, reader.maxDoc());
}

From source file:org.apache.jena.larq.TestLuceneNRT.java

License:Apache License

@Test
public void indexReaderOpenWriter() throws Exception {
    IndexReader reader = IndexReader.open(writer, true);
    assertNotNull(reader);
    assertSame(directory, reader.directory());
    assertEquals(1, reader.getRefCount());
    assertTrue(reader.isCurrent());
    assertEquals(0, reader.maxDoc());
    close(reader);
    assertEquals(0, reader.getRefCount());
}

From source file:org.apache.jena.larq.TestLuceneNRT.java

License:Apache License

@Test
public void indexReaderOpenWriterAddDocument() throws Exception {
    IndexReader reader = IndexReader.open(writer, true);
    writer.addDocument(doc1);
    assertFalse(reader.isCurrent());
    assertEquals(1, writer.maxDoc());
    assertEquals(0, reader.maxDoc());
}

From source file:org.apache.jena.larq.TestLuceneNRT.java

License:Apache License

@Test
public void indexReaderOpenWriterAddDocumentAndCommit() throws Exception {
    IndexReader reader = IndexReader.open(writer, true);
    writer.addDocument(doc1);
    writer.commit();
    assertFalse(reader.isCurrent());
    assertEquals(1, writer.maxDoc());
    assertEquals(0, reader.maxDoc());
}

From source file:org.apache.jena.larq.TestLuceneNRT.java

License:Apache License

@Test
public void indexReaderOpenWriterReopen() throws Exception {
    IndexReader reader = IndexReader.open(writer, true);
    writer.addDocument(doc1);
    writer.commit();

    assertFalse(reader.isCurrent());
    assertEquals(0, reader.maxDoc());
    reader = TestLARQUtils.openIfChanged(reader);
    assertTrue(reader.isCurrent());
    assertEquals(1, reader.maxDoc());
}

From source file:org.apache.jena.larq.TestLuceneNRT.java

License:Apache License

@Test
public void indexReaderOpenWriterNewInstance() throws Exception {
    IndexReader reader = IndexReader.open(writer, true);
    writer.addDocument(doc1);
    writer.commit();

    assertFalse(reader.isCurrent());
    assertEquals(0, reader.maxDoc());
    reader = IndexReader.open(writer, true);
    assertTrue(reader.isCurrent());
    assertEquals(1, reader.maxDoc());
}

From source file:org.apache.jena.larq.TestLuceneNRT.java

License:Apache License

@Test
public void indexReaderOpenWriterDeleteDocumentAndCommit() throws Exception {
    Term term = new Term("foo", "bar1");
    IndexReader reader = IndexReader.open(writer, false);
    writer.addDocument(doc1);
    writer.addDocument(doc2);
    writer.commit();

    assertEquals(2, writer.maxDoc());
    assertFalse(reader.isCurrent());
    assertEquals(0, reader.maxDoc());
    assertEquals(0, count(reader, term));

    reader = TestLARQUtils.openIfChanged(reader);
    assertTrue(reader.isCurrent());
    assertEquals(1, count(reader, term));

    writer.deleteDocuments(term);
    assertEquals(2, writer.maxDoc());
    assertEquals(1, count(reader, term));

    reader = TestLARQUtils.openIfChanged(reader);
    assertEquals(1, count(reader, term));

    writer.commit();
    reader = TestLARQUtils.openIfChanged(reader);
    assertEquals(2, writer.maxDoc());
    assertEquals(0, count(reader, term));

    reader = TestLARQUtils.openIfChanged(reader);
    assertEquals(0, count(reader, term));

    writer.forceMergeDeletes();
    assertEquals(1, writer.maxDoc());
    assertEquals(0, count(reader, term));
}

From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java

License:Apache License

/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {

    if (wpvws.size() < minNumIds) {
        log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
        return null;
    }

    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
    Directory dir = FSDirectory.open(new File(this.indexDir));
    IndexReader reader = DirectoryReader.open(dir);

    log.info("# of documents in the index {}", reader.numDocs());

    Collection<String> idSet = Sets.newHashSet();
    for (WeightedPropertyVectorWritable wpvw : wpvws) {
        Vector vector = wpvw.getVector();
        if (vector instanceof NamedVector) {
            idSet.add(((NamedVector) vector).getName());
        }
    }

    int numDocs = reader.numDocs();

    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);

    log.info("Populating term infos from the index");

    /**
     * This code is similar to that of CachedTermInfo, with one major change: how the document frequency is obtained.
     *
     * Since we have excluded the documents outside the cluster, the document frequency for a term should only
     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
     * in the entire index. To get the in-cluster frequency, we need to query the index for the term's postings
     * in each document; the number of matching in-cluster documents is the in-cluster document frequency.
     */
    Terms t = MultiFields.getTerms(reader, contentField);
    TermsEnum te = t.iterator(null);
    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
    Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions

    int count = 0;
    BytesRef term;
    while ((term = te.next()) != null) {
        OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // check that there are no deletions (liveDocs == null) or that this document is live
            if (liveDocs == null || liveDocs.get(docID)) {
                // document is live, so count it for this term
                termBitset.set(docID);
            }
            }
        }
        // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
        // This modifies the termBitset, but that's fine as we are not using it anywhere else.
        termBitset.and(clusterDocBitset);
        int inclusterDF = (int) termBitset.cardinality();

        TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
        termEntryMap.put(entry.getTerm(), entry);

    }

    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();

    int clusterSize = wpvws.size();

    for (TermEntry termEntry : termEntryMap.values()) {

        int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
        int outDF = corpusDF - termEntry.getDocFreq();
        int inDF = termEntry.getDocFreq();
        double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
        TermInfoClusterInOut termInfoCluster = new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF,
                logLikelihoodRatio);
        clusteredTermInfo.add(termInfoCluster);
    }

    Collections.sort(clusteredTermInfo);
    // Cleanup
    Closeables.close(reader, true);
    termEntryMap.clear();

    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
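
The comment in the method above explains the key idea: maxDoc() gives the size of the document-ID space, so it is the right capacity for a per-term bitset that is later intersected with a cluster bitset to obtain the in-cluster document frequency. A minimal sketch of that pattern, assuming the same Lucene 4.x classes used above (inClusterDocFreq is a hypothetical helper, not part of ClusterLabels):

    static int inClusterDocFreq(IndexReader reader, String field, BytesRef term, OpenBitSet clusterDocs)
            throws IOException {
        // One bit per possible document number, deleted slots included.
        OpenBitSet termDocs = new OpenBitSet(reader.maxDoc());
        Bits liveDocs = MultiFields.getLiveDocs(reader); // null when there are no deletions
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, liveDocs, field, term);
        if (docsEnum == null) {
            return 0; // term does not occur in this field
        }
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            termDocs.set(docID); // live document containing the term
        }
        termDocs.and(clusterDocs); // keep only the documents inside the cluster
        return (int) termDocs.cardinality();
    }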

From source file:org.apache.maven.index.AbstractRepoNexusIndexerTest.java

License:Apache License

public void testPackaging() throws Exception {
    IndexReader reader = context.acquireIndexSearcher().getIndexReader();

    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (liveDocs == null || liveDocs.get(i)) {
            Document document = reader.document(i);

            String uinfo = document.get(ArtifactInfo.UINFO);

            if (uinfo != null) {
                String info = document.get(ArtifactInfo.INFO);
                assertFalse("Bad:" + info, info.startsWith("null"));
            }
        }
    }

    // {
    // Query query = new TermQuery( new Term( MAVEN.PACKAGING, "jar" ) );
    // FlatSearchResponse response = nexusIndexer.searchFlat(new FlatSearchRequest(query));
    // assertEquals(response.getResults().toString(), 22, response.getTotalHits());
    // }
    {
        Query query = nexusIndexer.constructQuery(MAVEN.PACKAGING, "tar.gz", SearchType.EXACT);
        FlatSearchResponse response = nexusIndexer.searchFlat(new FlatSearchRequest(query));
        assertEquals(response.getResults().toString(), 1, response.getTotalHits());

        ArtifactInfo ai = response.getResults().iterator().next();
        assertEquals("tar.gz", ai.getPackaging());
        assertEquals("tar.gz", ai.getFileExtension());
    }
    {
        Query query = nexusIndexer.constructQuery(MAVEN.PACKAGING, "zip", SearchType.EXACT);
        FlatSearchResponse response = nexusIndexer.searchFlat(new FlatSearchRequest(query));
        assertEquals(response.getResults().toString(), 1, response.getTotalHits());

        ArtifactInfo ai = response.getResults().iterator().next();
        assertEquals("zip", ai.getPackaging());
        assertEquals("zip", ai.getFileExtension());
    }
}

From source file:org.apache.maven.index.context.DefaultIndexingContext.java

License:Apache License

public synchronized void merge(Directory directory, DocumentFilter filter) throws IOException {
    final IndexSearcher s = acquireIndexSearcher();
    try {
        final IndexWriter w = getIndexWriter();
        final IndexReader directoryReader = DirectoryReader.open(directory);
        TopScoreDocCollector collector = null;
        try {
            int numDocs = directoryReader.maxDoc();

            Bits liveDocs = MultiFields.getLiveDocs(directoryReader);
            for (int i = 0; i < numDocs; i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    continue;
                }

                Document d = directoryReader.document(i);
                if (filter != null && !filter.accept(d)) {
                    continue;
                }

                String uinfo = d.get(ArtifactInfo.UINFO);
                if (uinfo != null) {
                    collector = TopScoreDocCollector.create(1);
                    s.search(new TermQuery(new Term(ArtifactInfo.UINFO, uinfo)), collector);
                    if (collector.getTotalHits() == 0) {
                        w.addDocument(IndexUtils.updateDocument(d, this, false));
                    }
                } else {
                    String deleted = d.get(ArtifactInfo.DELETED);

                    if (deleted != null) {
                        // Deleting the document loses the history that it was deleted,
                        // so incremental updates won't work. Therefore, put the delete
                        // document in as well.
                        w.deleteDocuments(new Term(ArtifactInfo.UINFO, deleted));
                        w.addDocument(d);
                    }
                }
            }

        } finally {
            directoryReader.close();
            commit();
        }

        rebuildGroups();
        Date mergedTimestamp = IndexUtils.getTimestamp(directory);

        if (getTimestamp() != null && mergedTimestamp != null && mergedTimestamp.after(getTimestamp())) {
            // we have both, keep the newest
            updateTimestamp(true, mergedTimestamp);
        } else {
            updateTimestamp(true);
        }
        optimize();
    } finally {
        releaseIndexSearcher(s);
    }
}