Example usage for org.apache.lucene.index LeafReader numDocs

Introduction

This page lists usage examples for org.apache.lucene.index.LeafReader#numDocs().

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index, not counting deleted documents (unlike maxDoc(), which includes them).
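
A minimal, self-contained sketch of calling numDocs() per segment; the index path below is a placeholder, not something referenced by the examples on this page:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder; point it at an existing Lucene index
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // on the composite reader, numDocs() sums live documents across all segments
            System.out.println("live docs in index: " + reader.numDocs());
            for (LeafReaderContext ctx : reader.leaves()) {
                // each leaf (segment) reports its own live-document count
                System.out.println("segment " + ctx.ord + ": " + ctx.reader().numDocs());
            }
        }
    }
}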

Usage

From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java

License:Open Source License

@SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
@Override
public long getTermFrequency(@NotNull final BytesRef term) {
    // try get a cached value first
    @Nullable
    Long tf = this.cache_tf.get(term);
    if (tf == null) {
        tf = 0L;
        for (final LeafReaderContext lrc : this.index.reader.leaves()) {
            final LeafReader r = lrc.reader();
            long fieldTf = 0L;
            if (r.numDocs() > 0) { // skip segments that hold no live documents
                try {
                    for (final String s : r.fields()) {
                        @Nullable
                        final Terms terms = r.terms(s);
                        if (terms != null) {
                            // reuse-style iterator(TermsEnum) from older Lucene; current releases use iterator()
                            final TermsEnum termsEnum = terms.iterator(null);
                            if (termsEnum.seekExact(term)) {
                                fieldTf += termsEnum.totalTermFreq();
                            }
                        }
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
            tf += fieldTf;
        }
        this.cache_tf.put(BytesRef.deepCopyOf(term), tf);
    }

    return tf;
}
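
A follow-up note on the guard above: numDocs() only reports how many live documents a segment has, while document IDs range up to maxDoc(), and the two bounds diverge as soon as a segment carries deletions. A small sketch of that invariant, assuming a LeafReader named r and org.apache.lucene.util.Bits imported:

// liveDocs is null when the segment has no deletions
Bits liveDocs = r.getLiveDocs();
int live = 0;
for (int docId = 0; docId < r.maxDoc(); docId++) { // maxDoc() bounds doc IDs, not numDocs()
    if (liveDocs == null || liveDocs.get(docId)) {
        live++;
    }
}
assert live == r.numDocs();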

From source file:de.uni_koeln.spinfo.textengineering.tm.classification.lucene.LuceneAdapter.java

License:Open Source License

private void trainClassifier(String filterQuery) throws IOException {
    Searcher searcher = new Searcher(indexDir);
    // wrap the composite reader so the whole index behaves as a single leaf
    LeafReader reader = SlowCompositeReaderWrapper.wrap(searcher.getReader());
    Query q = new TermQuery(new Term("root", filterQuery));
    // numDocs() is used as the hit-count ceiling so every matching document is returned
    int totalHits = new IndexSearcher(reader).search(q, reader.numDocs()).totalHits;
    System.out.println("training with " + totalHits + " docs filtered by query: " + q);
    classifier.train(reader, "text", "topic", new StandardAnalyzer(), q);
}
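
One caveat with this pattern, noted here as an aside rather than something the original code handles: IndexSearcher.search(Query, int) rejects a hit count below one, so an empty index would make reader.numDocs() an invalid argument. A hedged guard might look like:

// fall back to 1 on an empty reader, since search() requires numHits > 0
int numHits = Math.max(1, reader.numDocs());
TopDocs topDocs = new IndexSearcher(reader).search(q, numHits);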

From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java

License:Apache License

public void testIndexAndAddDocValues() throws Exception {
    Random rand = random();

    for (int i = 0; i < 100; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));

        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }

    assertU(commit());

    // Assert everything has been indexed and there are no docvalues
    withNewRawReader(h, topReader -> {
        assertEquals(100, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should not have docValues yet
        assertEquals(DocValuesType.NONE, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    addDocValuesTo(h, TEST_FIELD);

    // Add some more documents with doc values turned on including updating some
    for (int i = 90; i < 110; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));

        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }

    assertU(commit());

    withNewRawReader(h, topReader -> {
        assertEquals(110, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    int optimizeSegments = 1;
    assertU(optimize("maxSegments", String.valueOf(optimizeSegments)));

    // Assert all docs have the right docvalues
    withNewRawReader(h, topReader -> {
        // Assert merged into one segment 
        assertEquals(110, topReader.numDocs());
        assertEquals(optimizeSegments, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        // Check that all segments have the right docvalues type with the correct value
        // Also check that other fields (e.g. the id field) didn't mistakenly get docvalues added
        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            // doc IDs run to maxDoc(); bounding the loop by numDocs() is safe here only
            // because the freshly merged segment contains no deletions
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
                assertEquals(v, id);

                docvalues.nextDoc();
                assertEquals(v, docvalues.binaryValue().utf8ToString());
            }
        }
    });
}

From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java

License:Apache License

public void testNonIndexedFieldDoesNonFail() throws Exception {
    // Remove Indexed from fieldType
    removeIndexFrom(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(1), TEST_FIELD, String.valueOf(1)));
    assertU(commit());

    addDocValuesTo(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(2), TEST_FIELD, String.valueOf(2)));
    assertU(commit());

    assertU(optimize("maxSegments", "1"));

    withNewRawReader(h, topReader -> {
        // Assert merged into one segment 
        assertEquals(2, topReader.numDocs());
        assertEquals(1, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());

                if (id.equals("2")) {
                    assertTrue(docvalues.advanceExact(i));
                    assertEquals(v, docvalues.binaryValue().utf8ToString());
                } else {
                    assertFalse(docvalues.advanceExact(i));
                }

            }
        }
    });
}

From source file:org.apache.solr.schema.TestHalfAndHalfDocValues.java

License:Apache License

public void testHalfAndHalfDocValues() throws Exception {
    // Insert two docs without docvalues
    String fieldname = "string_add_dv_later";
    assertU(adoc("id", "3", fieldname, "c"));
    assertU(commit());
    assertU(adoc("id", "1", fieldname, "a"));
    assertU(commit());

    try (SolrCore core = h.getCoreInc()) {
        assertFalse(core.getLatestSchema().getField(fieldname).hasDocValues());
        // Add docvalues to the field type
        IndexSchema schema = core.getLatestSchema();
        SchemaField oldField = schema.getField(fieldname);
        int newProperties = oldField.getProperties() | SchemaField.DOC_VALUES;

        SchemaField sf = new SchemaField(fieldname, oldField.getType(), newProperties, null);
        schema.getFields().put(fieldname, sf);

        // Insert a new doc with docvalues
        assertU(adoc("id", "2", fieldname, "b"));
        assertU(commit());

        // Check there are a mix of segments with and without docvalues
        final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
        final SolrIndexSearcher searcher = searcherRef.get();
        try {
            final DirectoryReader topReader = searcher.getRawReader();

            //Assert no merges

            assertEquals(3, topReader.numDocs());
            assertEquals(3, topReader.leaves().size());

            final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
            //The global field type should have docValues because a document with dvs was added
            assertEquals(DocValuesType.SORTED, infos.fieldInfo(fieldname).getDocValuesType());

            for (LeafReaderContext ctx : topReader.leaves()) {
                LeafReader r = ctx.reader();
                //Make sure there were no merges
                assertEquals(1, r.numDocs());
                Document doc = r.document(0);
                String id = doc.getField("id").stringValue();

                if (id.equals("1") || id.equals("3")) {
                    assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                } else {
                    assertEquals(DocValuesType.SORTED,
                            r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                }

            }
        } finally {
            searcherRef.decref();
        }
    }

    // Assert sort order is correct
    assertQ(req("q", "string_add_dv_later:*", "sort", "string_add_dv_later asc"), "//*[@numFound='3']",
            "//result/doc[1]/int[@name='id'][.=1]", "//result/doc[2]/int[@name='id'][.=2]",
            "//result/doc[3]/int[@name='id'][.=3]");
}

From source file:org.apache.solr.schema.TestPointFields.java

License:Apache License

private void doTestInternals(String field, String[] values) throws IOException {
    assertTrue(h.getCore().getLatestSchema().getField(field).getType() instanceof PointField);
    for (int i = 0; i < 10; i++) {
        assertU(adoc("id", String.valueOf(i), field, values[i]));
    }
    assertU(commit());
    IndexReader ir;
    RefCounted<SolrIndexSearcher> ref = null;
    SchemaField sf = h.getCore().getLatestSchema().getField(field);
    boolean ignoredField = !(sf.indexed() || sf.stored() || sf.hasDocValues());
    try {
        ref = h.getCore().getSearcher();
        SolrIndexSearcher searcher = ref.get();
        ir = searcher.getIndexReader();
        // our own SlowCompositeReader to check DocValues on disk w/o the UninvertingReader added by SolrIndexSearcher
        final LeafReader leafReaderForCheckingDVs = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());

        if (sf.indexed()) {
            assertEquals("Field " + field + " should have point values", 10, PointValues.size(ir, field));
        } else {
            assertEquals("Field " + field + " should have no point values", 0, PointValues.size(ir, field));
        }
        if (ignoredField) {
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getSortedNumeric(leafReaderForCheckingDVs, field)
                            .nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues", DocValues
                    .getNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues", DocValues
                    .getSorted(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues", DocValues
                    .getBinary(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
        } else {
            if (sf.hasDocValues()) {
                if (sf.multiValued()) {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getSortedNumeric(leafReaderForCheckingDVs, field)
                                    .nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                } else {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getNumeric(leafReaderForCheckingDVs, field)
                                    .nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                }
            } else {
                expectThrows(IllegalStateException.class,
                        () -> DocValues.getSortedNumeric(leafReaderForCheckingDVs, field));
                expectThrows(IllegalStateException.class,
                        () -> DocValues.getNumeric(leafReaderForCheckingDVs, field));
            }
            expectThrows(IllegalStateException.class,
                    () -> DocValues.getSorted(leafReaderForCheckingDVs, field));
            expectThrows(IllegalStateException.class,
                    () -> DocValues.getBinary(leafReaderForCheckingDVs, field));
        }
        for (LeafReaderContext leave : ir.leaves()) {
            LeafReader reader = leave.reader();
            for (int i = 0; i < reader.numDocs(); i++) {
                Document doc = reader.document(i);
                if (sf.stored()) {
                    assertNotNull("Field " + field + " not found. Doc: " + doc, doc.get(field));
                } else {
                    assertNull(doc.get(field));
                }
            }
        }
    } finally {
        ref.decref();
    }
    clearIndex();
    assertU(commit());
}

From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.DocumentSubsetReaderTests.java

License:Open Source License

public void testLiveDocs() throws Exception {
    int numDocs = scaledRandomIntBetween(16, 128);
    IndexWriter iw = new IndexWriter(directory,
            new IndexWriterConfig(new StandardAnalyzer()).setMergePolicy(NoMergePolicy.INSTANCE));

    for (int i = 0; i < numDocs; i++) {
        Document document = new Document();
        document.add(new StringField("field", "value" + i, Field.Store.NO));
        iw.addDocument(document);
    }

    iw.forceMerge(1);
    iw.close();

    openDirectoryReader();
    assertThat("should have one segment after force merge", directoryReader.leaves().size(), equalTo(1));

    for (int i = 0; i < numDocs; i++) {
        Query roleQuery = new TermQuery(new Term("field", "value" + i));
        DirectoryReader wrappedReader = DocumentSubsetReader.wrap(directoryReader, bitsetFilterCache,
                roleQuery);

        LeafReader leafReader = wrappedReader.leaves().get(0).reader();
        assertThat(leafReader.hasDeletions(), is(true));
        assertThat(leafReader.numDocs(), equalTo(1));
        Bits liveDocs = leafReader.getLiveDocs();
        assertThat(liveDocs.length(), equalTo(numDocs));
        for (int docId = 0; docId < numDocs; docId++) {
            if (docId == i) {
                assertThat("docId [" + docId + "] should match", liveDocs.get(docId), is(true));
            } else {
                assertThat("docId [" + docId + "] should not match", liveDocs.get(docId), is(false));
            }
        }
    }
}

From source file:org.modeshape.jcr.index.lucene.query.ConstantScoreWeightQuery.java

License:Apache License

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    Set<String> fieldSet = Collections.singleton(field);
    // return a weight which uses a constant (1.0f) scorer...
    return new RandomAccessWeight(this) {
        @Override
        protected Bits getMatchingDocs(LeafReaderContext context) throws IOException {
            LeafReader leafReader = context.reader();
            Bits liveDocs = leafReader.getLiveDocs();
            // if liveDocs is null it means there are no deleted documents...
            int docsCount = liveDocs != null ? liveDocs.length() : leafReader.numDocs();
            FixedBitSet result = new FixedBitSet(leafReader.maxDoc());
            for (int i = 0; i < docsCount; i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    continue;
                }
                Document document = leafReader.document(i, fieldSet);
                IndexableField[] fields = document.getFields(field);
                if (fields.length == 0) {
                    // the document doesn't have the field...
                    continue;
                }
                if (areValid(fields)) {
                    result.set(i);
                }
            }
            return result.cardinality() > 0 ? result : null;
        }
    };
}

From source file:org.voyanttools.trombone.lucene.CorpusMapper.java

License:Open Source License

/**
 * This should not be called, except from the private build() method.
 * @throws IOException
 */
private void buildFromTermsEnum() throws IOException {
    LeafReader reader = SlowCompositeReaderWrapper
            .wrap(storage.getLuceneManager().getDirectoryReader(corpus.getId()));

    Terms terms = reader.terms("id");
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int doc;
    String id;
    Set<String> ids = new HashSet<String>(getCorpusDocumentIds());
    // sized by numDocs(); this assumes the reader carries no deletions, since doc IDs
    // can reach maxDoc() - 1 and maxDoc() would otherwise be the safe bound
    bitSet = new SparseFixedBitSet(reader.numDocs());
    Bits liveBits = reader.getLiveDocs();
    while (bytesRef != null) {
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
        doc = postingsEnum.nextDoc();
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            id = bytesRef.utf8ToString();
            if (ids.contains(id)) {
                bitSet.set(doc);
                luceneIds.add(doc);
                documentIdToLuceneIdMap.put(id, doc);
                luceneIdToDocumentIdMap.put(doc, id);
            }
        }
        bytesRef = termsEnum.next();
    }
    this.reader = new FilteredCorpusReader(reader, bitSet);
}

From source file:org.voyanttools.trombone.tool.corpus.DocumentTerms.java

License:Open Source License

private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords)
        throws IOException {
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null;
    Bits docIdBitSet = corpusMapper
            .getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    // a match-all Bits instance sized to the reader's live-document count
    Bits allBits = new Bits.MatchAllBits(reader.numDocs());
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();

                while (bytesRef != null) {
                    String termString = bytesRef.utf8ToString();
                    if (whiteList.isEmpty() == false && whiteList.isKeyword(termString) == false) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}