List of usage examples for org.apache.lucene.index.LeafReader#numDocs()
public abstract int numDocs();
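Before the per-project examples, here is a minimal self-contained sketch of the call pattern they all share: walk the index's leaves and ask each segment-level LeafReader for its live-document count, which never exceeds maxDoc() (the count that also includes deleted documents). The index path is a placeholder, and the sketch assumes the Lucene 5+ Path-based FSDirectory API.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder; point it at an existing index directory
        try (DirectoryReader topReader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            long totalLive = 0;
            for (LeafReaderContext ctx : topReader.leaves()) {
                LeafReader leaf = ctx.reader();
                // numDocs() counts live documents only; maxDoc() also counts deleted ones
                totalLive += leaf.numDocs();
                System.out.println("segment: numDocs=" + leaf.numDocs() + " maxDoc=" + leaf.maxDoc()
                        + " deleted=" + (leaf.maxDoc() - leaf.numDocs()));
            }
            // the per-leaf counts sum to the composite reader's own numDocs()
            System.out.println("total live docs: " + totalLive + " (reader reports " + topReader.numDocs() + ")");
        }
    }
}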
From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java
License:Open Source License
@SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
@Override
public long getTermFrequency(@NotNull final BytesRef term) {
    // try to get a cached value first
    @Nullable
    Long tf = this.cache_tf.get(term);
    if (tf == null) {
        tf = 0L;
        for (final LeafReaderContext lrc : this.index.reader.leaves()) {
            final LeafReader r = lrc.reader();
            long fieldTf = 0L;
            if (r.numDocs() > 0) {
                try {
                    for (final String s : r.fields()) {
                        @Nullable
                        final Terms terms = r.terms(s);
                        if (terms != null) {
                            final TermsEnum termsEnum = terms.iterator(null);
                            if (termsEnum.seekExact(term)) {
                                fieldTf += termsEnum.totalTermFreq();
                            }
                        }
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
            tf += fieldTf;
        }
        this.cache_tf.put(BytesRef.deepCopyOf(term), tf);
    }
    return tf;
}
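The example above was written against an older Lucene API: current Lucene no longer has LeafReader.fields(), and Terms.iterator() no longer takes a reuse argument. Purely for comparison, here is a minimal sketch of the same per-segment term-frequency walk against a newer API (assumptions: Lucene 7+, where fields are enumerated via getFieldInfos(); countTotalTermFreq is a hypothetical free-standing helper, not part of FDRIndexDataProvider; imports from org.apache.lucene.index and org.apache.lucene.util are assumed).

// Hypothetical helper sketch (not from the original source): sums the total
// term frequency of one term across all fields and all segments, skipping
// segments whose numDocs() reports no live documents.
static long countTotalTermFreq(DirectoryReader reader, BytesRef term) throws IOException {
    long tf = 0L;
    for (LeafReaderContext lrc : reader.leaves()) {
        LeafReader r = lrc.reader();
        if (r.numDocs() == 0) {
            continue; // segment holds no live documents
        }
        for (FieldInfo fieldInfo : r.getFieldInfos()) {
            Terms terms = r.terms(fieldInfo.name);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator();
                if (termsEnum.seekExact(term)) {
                    tf += termsEnum.totalTermFreq();
                }
            }
        }
    }
    return tf;
}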
From source file:de.uni_koeln.spinfo.textengineering.tm.classification.lucene.LuceneAdapter.java
License:Open Source License
private void trainClassifier(String filterQuery) throws IOException {
    Searcher searcher = new Searcher(indexDir);
    LeafReader reader = SlowCompositeReaderWrapper.wrap(searcher.getReader());
    Query q = new TermQuery(new Term("root", filterQuery));
    int totalHits = new IndexSearcher(reader).search(q, reader.numDocs()).totalHits;
    System.out.println("training with " + totalHits + " docs filtered by query: " + q);
    classifier.train(reader, "text", "topic", new StandardAnalyzer(), q);
}
From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java
License:Apache License
public void testIndexAndAddDocValues() throws Exception {
    Random rand = random();

    for (int i = 0; i < 100; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }
    assertU(commit());

    // Assert everything has been indexed and there are no docvalues
    withNewRawReader(h, topReader -> {
        assertEquals(100, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should not have docValues yet
        assertEquals(DocValuesType.NONE, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    addDocValuesTo(h, TEST_FIELD);

    // Add some more documents with doc values turned on including updating some
    for (int i = 90; i < 110; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }
    assertU(commit());

    withNewRawReader(h, topReader -> {
        assertEquals(110, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    int optimizeSegments = 1;
    assertU(optimize("maxSegments", String.valueOf(optimizeSegments)));

    // Assert all docs have the right docvalues
    withNewRawReader(h, topReader -> {
        // Assert merged into one segment
        assertEquals(110, topReader.numDocs());
        assertEquals(optimizeSegments, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        // Check that all segments have the right docvalues type with the correct value
        // Also check that other fields (e.g. the id field) didn't mistakenly get docvalues added
        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
                assertEquals(v, id);

                docvalues.nextDoc();
                assertEquals(v, docvalues.binaryValue().utf8ToString());
            }
        }
    });
}
From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java
License:Apache License
public void testNonIndexedFieldDoesNonFail() throws Exception {
    // Remove Indexed from fieldType
    removeIndexFrom(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(1), TEST_FIELD, String.valueOf(1)));
    assertU(commit());

    addDocValuesTo(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(2), TEST_FIELD, String.valueOf(2)));
    assertU(commit());

    assertU(optimize("maxSegments", "1"));

    withNewRawReader(h, topReader -> {
        // Assert merged into one segment
        assertEquals(2, topReader.numDocs());
        assertEquals(1, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());

                if (id.equals("2")) {
                    assertTrue(docvalues.advanceExact(i));
                    assertEquals(v, docvalues.binaryValue().utf8ToString());
                } else {
                    assertFalse(docvalues.advanceExact(i));
                }
            }
        }
    });
}
From source file:org.apache.solr.schema.TestHalfAndHalfDocValues.java
License:Apache License
public void testHalfAndHalfDocValues() throws Exception {
    // Insert two docs without docvalues
    String fieldname = "string_add_dv_later";
    assertU(adoc("id", "3", fieldname, "c"));
    assertU(commit());
    assertU(adoc("id", "1", fieldname, "a"));
    assertU(commit());

    try (SolrCore core = h.getCoreInc()) {
        assertFalse(core.getLatestSchema().getField(fieldname).hasDocValues());

        // Add docvalues to the field type
        IndexSchema schema = core.getLatestSchema();
        SchemaField oldField = schema.getField(fieldname);
        int newProperties = oldField.getProperties() | SchemaField.DOC_VALUES;

        SchemaField sf = new SchemaField(fieldname, oldField.getType(), newProperties, null);
        schema.getFields().put(fieldname, sf);

        // Insert a new doc with docvalues
        assertU(adoc("id", "2", fieldname, "b"));
        assertU(commit());

        // Check there are a mix of segments with and without docvalues
        final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
        final SolrIndexSearcher searcher = searcherRef.get();
        try {
            final DirectoryReader topReader = searcher.getRawReader();

            // Assert no merges
            assertEquals(3, topReader.numDocs());
            assertEquals(3, topReader.leaves().size());

            final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
            // The global field type should have docValues because a document with dvs was added
            assertEquals(DocValuesType.SORTED, infos.fieldInfo(fieldname).getDocValuesType());

            for (LeafReaderContext ctx : topReader.leaves()) {
                LeafReader r = ctx.reader();
                // Make sure there were no merges
                assertEquals(1, r.numDocs());

                Document doc = r.document(0);
                String id = doc.getField("id").stringValue();
                if (id.equals("1") || id.equals("3")) {
                    assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                } else {
                    assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                }
            }
        } finally {
            searcherRef.decref();
        }
    }

    // Assert sort order is correct
    assertQ(req("q", "string_add_dv_later:*", "sort", "string_add_dv_later asc"), "//*[@numFound='3']",
            "//result/doc[1]/int[@name='id'][.=1]", "//result/doc[2]/int[@name='id'][.=2]",
            "//result/doc[3]/int[@name='id'][.=3]");
}
From source file:org.apache.solr.schema.TestPointFields.java
License:Apache License
private void doTestInternals(String field, String[] values) throws IOException {
    assertTrue(h.getCore().getLatestSchema().getField(field).getType() instanceof PointField);
    for (int i = 0; i < 10; i++) {
        assertU(adoc("id", String.valueOf(i), field, values[i]));
    }
    assertU(commit());
    IndexReader ir;
    RefCounted<SolrIndexSearcher> ref = null;
    SchemaField sf = h.getCore().getLatestSchema().getField(field);
    boolean ignoredField = !(sf.indexed() || sf.stored() || sf.hasDocValues());
    try {
        ref = h.getCore().getSearcher();
        SolrIndexSearcher searcher = ref.get();
        ir = searcher.getIndexReader();
        // our own SlowCompositeReader to check DocValues on disk w/o the UninvertingReader added by SolrIndexSearcher
        final LeafReader leafReaderForCheckingDVs = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());

        if (sf.indexed()) {
            assertEquals("Field " + field + " should have point values", 10, PointValues.size(ir, field));
        } else {
            assertEquals("Field " + field + " should have no point values", 0, PointValues.size(ir, field));
        }

        if (ignoredField) {
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getSortedNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getSorted(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getBinary(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
        } else {
            if (sf.hasDocValues()) {
                if (sf.multiValued()) {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getSortedNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                } else {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                }
            } else {
                expectThrows(IllegalStateException.class, () -> DocValues.getSortedNumeric(leafReaderForCheckingDVs, field));
                expectThrows(IllegalStateException.class, () -> DocValues.getNumeric(leafReaderForCheckingDVs, field));
            }
            expectThrows(IllegalStateException.class, () -> DocValues.getSorted(leafReaderForCheckingDVs, field));
            expectThrows(IllegalStateException.class, () -> DocValues.getBinary(leafReaderForCheckingDVs, field));
        }

        for (LeafReaderContext leave : ir.leaves()) {
            LeafReader reader = leave.reader();
            for (int i = 0; i < reader.numDocs(); i++) {
                Document doc = reader.document(i);
                if (sf.stored()) {
                    assertNotNull("Field " + field + " not found. Doc: " + doc, doc.get(field));
                } else {
                    assertNull(doc.get(field));
                }
            }
        }
    } finally {
        ref.decref();
    }
    clearIndex();
    assertU(commit());
}
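A caveat worth making explicit: the tests above index into segments without deletions, so iterating stored documents with for (int i = 0; i < reader.numDocs(); i++) happens to visit every document. In general, per-segment doc IDs range over [0, maxDoc()), and numDocs() can be smaller once documents are deleted. Below is a minimal deletion-safe sketch of the same visit-every-stored-document loop, using only the plain Lucene LeafReader API (visitLiveDocuments is a hypothetical helper name; imports from org.apache.lucene.document, org.apache.lucene.index, and org.apache.lucene.util are assumed).

// Hypothetical helper sketch (not from the original source): doc IDs run from
// 0 to maxDoc() - 1, and getLiveDocs() marks which of them are live (it is
// null when the segment has no deletions). numDocs() serves only as a sanity
// check on the live count here.
static void visitLiveDocuments(LeafReader reader) throws IOException {
    Bits liveDocs = reader.getLiveDocs(); // null means no deleted documents
    int visited = 0;
    for (int docId = 0; docId < reader.maxDoc(); docId++) {
        if (liveDocs != null && !liveDocs.get(docId)) {
            continue; // skip deleted documents
        }
        Document doc = reader.document(docId);
        // ... inspect the stored fields of doc here ...
        visited++;
    }
    assert visited == reader.numDocs();
}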
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.DocumentSubsetReaderTests.java
License:Open Source License
public void testLiveDocs() throws Exception {
    int numDocs = scaledRandomIntBetween(16, 128);
    IndexWriter iw = new IndexWriter(directory,
            new IndexWriterConfig(new StandardAnalyzer()).setMergePolicy(NoMergePolicy.INSTANCE));
    for (int i = 0; i < numDocs; i++) {
        Document document = new Document();
        document.add(new StringField("field", "value" + i, Field.Store.NO));
        iw.addDocument(document);
    }
    iw.forceMerge(1);
    iw.close();

    openDirectoryReader();
    assertThat("should have one segment after force merge", directoryReader.leaves().size(), equalTo(1));

    for (int i = 0; i < numDocs; i++) {
        Query roleQuery = new TermQuery(new Term("field", "value" + i));
        DirectoryReader wrappedReader = DocumentSubsetReader.wrap(directoryReader, bitsetFilterCache, roleQuery);

        LeafReader leafReader = wrappedReader.leaves().get(0).reader();
        assertThat(leafReader.hasDeletions(), is(true));
        assertThat(leafReader.numDocs(), equalTo(1));
        Bits liveDocs = leafReader.getLiveDocs();
        assertThat(liveDocs.length(), equalTo(numDocs));
        for (int docId = 0; docId < numDocs; docId++) {
            if (docId == i) {
                assertThat("docId [" + docId + "] should match", liveDocs.get(docId), is(true));
            } else {
                assertThat("docId [" + docId + "] should not match", liveDocs.get(docId), is(false));
            }
        }
    }
}
From source file:org.modeshape.jcr.index.lucene.query.ConstantScoreWeightQuery.java
License:Apache License
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    Set<String> fieldSet = Collections.singleton(field);
    // return a weight which uses a constant (1.0f) scorer...
    return new RandomAccessWeight(this) {
        @Override
        protected Bits getMatchingDocs(LeafReaderContext context) throws IOException {
            LeafReader leafReader = context.reader();
            Bits liveDocs = leafReader.getLiveDocs();
            // if liveDocs is null it means there are no deleted documents...
            int docsCount = liveDocs != null ? liveDocs.length() : leafReader.numDocs();
            FixedBitSet result = new FixedBitSet(leafReader.maxDoc());
            for (int i = 0; i < docsCount; i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    continue;
                }
                Document document = leafReader.document(i, fieldSet);
                IndexableField[] fields = document.getFields(field);
                if (fields.length == 0) {
                    // the document doesn't have the field...
                    continue;
                }
                if (areValid(fields)) {
                    result.set(i);
                }
            }
            return result.cardinality() > 0 ? result : null;
        }
    };
}
From source file:org.voyanttools.trombone.lucene.CorpusMapper.java
License:Open Source License
/**
 * This should not be called, except from the private build() method.
 * @throws IOException
 */
private void buildFromTermsEnum() throws IOException {
    LeafReader reader = SlowCompositeReaderWrapper
            .wrap(storage.getLuceneManager().getDirectoryReader(corpus.getId()));
    Terms terms = reader.terms("id");
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int doc;
    String id;
    Set<String> ids = new HashSet<String>(getCorpusDocumentIds());
    bitSet = new SparseFixedBitSet(reader.numDocs());
    Bits liveBits = reader.getLiveDocs();
    while (bytesRef != null) {
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
        doc = postingsEnum.nextDoc();
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            id = bytesRef.utf8ToString();
            if (ids.contains(id)) {
                bitSet.set(doc);
                luceneIds.add(doc);
                documentIdToLuceneIdMap.put(id, doc);
                luceneIdToDocumentIdMap.put(doc, id);
            }
        }
        bytesRef = termsEnum.next();
    }
    this.reader = new FilteredCorpusReader(reader, bitSet);
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentTerms.java
License:Open Source License
private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords) throws IOException {
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null;
    Bits docIdBitSet = corpusMapper.getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    Bits allBits = new Bits.MatchAllBits(reader.numDocs());
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String termString = bytesRef.utf8ToString();
                    if (whiteList.isEmpty() == false && whiteList.isKeyword(termString) == false) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}