List of usage examples for org.apache.lucene.index.LeafReader#numDocs()
public abstract int numDocs();
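Before the per-project examples, here is a minimal self-contained sketch of the call pattern they all share: walk the index's leaves and ask each segment-level LeafReader for its live-document count, which never exceeds maxDoc() (the count that also includes deleted documents). The index path is a placeholder, and the sketch assumes the Lucene 5+ Path-based FSDirectory API.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder; point it at an existing index directory
        try (DirectoryReader topReader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            long totalLive = 0;
            for (LeafReaderContext ctx : topReader.leaves()) {
                LeafReader leaf = ctx.reader();
                // numDocs() counts live documents only; maxDoc() also counts deleted ones
                totalLive += leaf.numDocs();
                System.out.println("segment: numDocs=" + leaf.numDocs() + " maxDoc=" + leaf.maxDoc()
                        + " deleted=" + (leaf.maxDoc() - leaf.numDocs()));
            }
            // the per-leaf counts sum to the composite reader's own numDocs()
            System.out.println("total live docs: " + totalLive + " (reader reports " + topReader.numDocs() + ")");
        }
    }
}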
From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java
License:Open Source License
@SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
@Override
public long getTermFrequency(@NotNull final BytesRef term) {
    // try to get a cached value first
    @Nullable
    Long tf = this.cache_tf.get(term);
    if (tf == null) {
        tf = 0L;
        for (final LeafReaderContext lrc : this.index.reader.leaves()) {
            final LeafReader r = lrc.reader();
            long fieldTf = 0L;
            if (r.numDocs() > 0) {
                try {
                    for (final String s : r.fields()) {
                        @Nullable
                        final Terms terms = r.terms(s);
                        if (terms != null) {
                            final TermsEnum termsEnum = terms.iterator(null);
                            if (termsEnum.seekExact(term)) {
                                fieldTf += termsEnum.totalTermFreq();
                            }
                        }
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
            tf += fieldTf;
        }
        this.cache_tf.put(BytesRef.deepCopyOf(term), tf);
    }
    return tf;
}
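The example above was written against an older Lucene API: current Lucene no longer has LeafReader.fields(), and Terms.iterator() no longer takes a reuse argument. Purely for comparison, here is a minimal sketch of the same per-segment term-frequency walk against a newer API (assumptions: Lucene 7+, where fields are enumerated via getFieldInfos(); countTotalTermFreq is a hypothetical free-standing helper, not part of FDRIndexDataProvider; imports from org.apache.lucene.index and org.apache.lucene.util are assumed).

// Hypothetical helper sketch (not from the original source): sums the total
// term frequency of one term across all fields and all segments, skipping
// segments whose numDocs() reports no live documents.
static long countTotalTermFreq(DirectoryReader reader, BytesRef term) throws IOException {
    long tf = 0L;
    for (LeafReaderContext lrc : reader.leaves()) {
        LeafReader r = lrc.reader();
        if (r.numDocs() == 0) {
            continue; // segment holds no live documents
        }
        for (FieldInfo fieldInfo : r.getFieldInfos()) {
            Terms terms = r.terms(fieldInfo.name);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator();
                if (termsEnum.seekExact(term)) {
                    tf += termsEnum.totalTermFreq();
                }
            }
        }
    }
    return tf;
}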
From source file:de.uni_koeln.spinfo.textengineering.tm.classification.lucene.LuceneAdapter.java
License:Open Source License
private void trainClassifier(String filterQuery) throws IOException {
    Searcher searcher = new Searcher(indexDir);
    LeafReader reader = SlowCompositeReaderWrapper.wrap(searcher.getReader());
    Query q = new TermQuery(new Term("root", filterQuery));
    int totalHits = new IndexSearcher(reader).search(q, reader.numDocs()).totalHits;
    System.out.println("training with " + totalHits + " docs filtered by query: " + q);
    classifier.train(reader, "text", "topic", new StandardAnalyzer(), q);
}
From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java
License:Apache License
public void testIndexAndAddDocValues() throws Exception {
    Random rand = random();

    for (int i = 0; i < 100; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }
    assertU(commit());

    // Assert everything has been indexed and there are no docvalues
    withNewRawReader(h, topReader -> {
        assertEquals(100, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should not have docValues yet
        assertEquals(DocValuesType.NONE, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    addDocValuesTo(h, TEST_FIELD);

    // Add some more documents with doc values turned on including updating some
    for (int i = 90; i < 110; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }
    assertU(commit());

    withNewRawReader(h, topReader -> {
        assertEquals(110, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    int optimizeSegments = 1;
    assertU(optimize("maxSegments", String.valueOf(optimizeSegments)));

    // Assert all docs have the right docvalues
    withNewRawReader(h, topReader -> {
        // Assert merged into one segment
        assertEquals(110, topReader.numDocs());
        assertEquals(optimizeSegments, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        // Check that all segments have the right docvalues type with the correct value
        // Also check that other fields (e.g. the id field) didn't mistakenly get docvalues added
        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
                assertEquals(v, id);

                docvalues.nextDoc();
                assertEquals(v, docvalues.binaryValue().utf8ToString());
            }
        }
    });
}
From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java
License:Apache License
public void testNonIndexedFieldDoesNonFail() throws Exception {
    // Remove Indexed from fieldType
    removeIndexFrom(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(1), TEST_FIELD, String.valueOf(1)));
    assertU(commit());

    addDocValuesTo(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(2), TEST_FIELD, String.valueOf(2)));
    assertU(commit());

    assertU(optimize("maxSegments", "1"));

    withNewRawReader(h, topReader -> {
        // Assert merged into one segment
        assertEquals(2, topReader.numDocs());
        assertEquals(1, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());

                if (id.equals("2")) {
                    assertTrue(docvalues.advanceExact(i));
                    assertEquals(v, docvalues.binaryValue().utf8ToString());
                } else {
                    assertFalse(docvalues.advanceExact(i));
                }
            }
        }
    });
}
From source file:org.apache.solr.schema.TestHalfAndHalfDocValues.java
License:Apache License
public void testHalfAndHalfDocValues() throws Exception {
    // Insert two docs without docvalues
    String fieldname = "string_add_dv_later";
    assertU(adoc("id", "3", fieldname, "c"));
    assertU(commit());
    assertU(adoc("id", "1", fieldname, "a"));
    assertU(commit());

    try (SolrCore core = h.getCoreInc()) {
        assertFalse(core.getLatestSchema().getField(fieldname).hasDocValues());

        // Add docvalues to the field type
        IndexSchema schema = core.getLatestSchema();
        SchemaField oldField = schema.getField(fieldname);
        int newProperties = oldField.getProperties() | SchemaField.DOC_VALUES;

        SchemaField sf = new SchemaField(fieldname, oldField.getType(), newProperties, null);
        schema.getFields().put(fieldname, sf);

        // Insert a new doc with docvalues
        assertU(adoc("id", "2", fieldname, "b"));
        assertU(commit());

        // Check there are a mix of segments with and without docvalues
        final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
        final SolrIndexSearcher searcher = searcherRef.get();
        try {
            final DirectoryReader topReader = searcher.getRawReader();

            // Assert no merges
            assertEquals(3, topReader.numDocs());
            assertEquals(3, topReader.leaves().size());

            final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
            // The global field type should have docValues because a document with dvs was added
            assertEquals(DocValuesType.SORTED, infos.fieldInfo(fieldname).getDocValuesType());

            for (LeafReaderContext ctx : topReader.leaves()) {
                LeafReader r = ctx.reader();
                // Make sure there were no merges
                assertEquals(1, r.numDocs());

                Document doc = r.document(0);
                String id = doc.getField("id").stringValue();
                if (id.equals("1") || id.equals("3")) {
                    assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                } else {
                    assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                }
            }
        } finally {
            searcherRef.decref();
        }
    }

    // Assert sort order is correct
    assertQ(req("q", "string_add_dv_later:*", "sort", "string_add_dv_later asc"), "//*[@numFound='3']",
            "//result/doc[1]/int[@name='id'][.=1]", "//result/doc[2]/int[@name='id'][.=2]",
            "//result/doc[3]/int[@name='id'][.=3]");
}
From source file:org.apache.solr.schema.TestPointFields.java
License:Apache License
private void doTestInternals(String field, String[] values) throws IOException {
    assertTrue(h.getCore().getLatestSchema().getField(field).getType() instanceof PointField);
    for (int i = 0; i < 10; i++) {
        assertU(adoc("id", String.valueOf(i), field, values[i]));
    }
    assertU(commit());
    IndexReader ir;
    RefCounted<SolrIndexSearcher> ref = null;
    SchemaField sf = h.getCore().getLatestSchema().getField(field);
    boolean ignoredField = !(sf.indexed() || sf.stored() || sf.hasDocValues());
    try {
        ref = h.getCore().getSearcher();
        SolrIndexSearcher searcher = ref.get();
        ir = searcher.getIndexReader();
        // our own SlowCompositeReader to check DocValues on disk w/o the UninvertingReader added by SolrIndexSearcher
        final LeafReader leafReaderForCheckingDVs = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());

        if (sf.indexed()) {
            assertEquals("Field " + field + " should have point values", 10, PointValues.size(ir, field));
        } else {
            assertEquals("Field " + field + " should have no point values", 0, PointValues.size(ir, field));
        }

        if (ignoredField) {
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getSortedNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getSorted(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getBinary(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
        } else {
            if (sf.hasDocValues()) {
                if (sf.multiValued()) {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getSortedNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                } else {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                }
            } else {
                expectThrows(IllegalStateException.class, () -> DocValues.getSortedNumeric(leafReaderForCheckingDVs, field));
                expectThrows(IllegalStateException.class, () -> DocValues.getNumeric(leafReaderForCheckingDVs, field));
            }
            expectThrows(IllegalStateException.class, () -> DocValues.getSorted(leafReaderForCheckingDVs, field));
            expectThrows(IllegalStateException.class, () -> DocValues.getBinary(leafReaderForCheckingDVs, field));
        }

        for (LeafReaderContext leave : ir.leaves()) {
            LeafReader reader = leave.reader();
            for (int i = 0; i < reader.numDocs(); i++) {
                Document doc = reader.document(i);
                if (sf.stored()) {
                    assertNotNull("Field " + field + " not found. Doc: " + doc, doc.get(field));
                } else {
                    assertNull(doc.get(field));
                }
            }
        }
    } finally {
        ref.decref();
    }
    clearIndex();
    assertU(commit());
}
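A caveat worth making explicit: the tests above index into segments without deletions, so iterating stored documents with for (int i = 0; i < reader.numDocs(); i++) happens to visit every document. In general, per-segment doc IDs range over [0, maxDoc()), and numDocs() can be smaller once documents are deleted. Below is a minimal deletion-safe sketch of the same visit-every-stored-document loop, using only the plain Lucene LeafReader API (visitLiveDocuments is a hypothetical helper name; imports from org.apache.lucene.document, org.apache.lucene.index, and org.apache.lucene.util are assumed).

// Hypothetical helper sketch (not from the original source): doc IDs run from
// 0 to maxDoc() - 1, and getLiveDocs() marks which of them are live (it is
// null when the segment has no deletions). numDocs() serves only as a sanity
// check on the live count here.
static void visitLiveDocuments(LeafReader reader) throws IOException {
    Bits liveDocs = reader.getLiveDocs(); // null means no deleted documents
    int visited = 0;
    for (int docId = 0; docId < reader.maxDoc(); docId++) {
        if (liveDocs != null && !liveDocs.get(docId)) {
            continue; // skip deleted documents
        }
        Document doc = reader.document(docId);
        // ... inspect the stored fields of doc here ...
        visited++;
    }
    assert visited == reader.numDocs();
}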
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.DocumentSubsetReaderTests.java
License:Open Source License
public void testLiveDocs() throws Exception {
    int numDocs = scaledRandomIntBetween(16, 128);
    IndexWriter iw = new IndexWriter(directory,
            new IndexWriterConfig(new StandardAnalyzer()).setMergePolicy(NoMergePolicy.INSTANCE));
    for (int i = 0; i < numDocs; i++) {
        Document document = new Document();
        document.add(new StringField("field", "value" + i, Field.Store.NO));
        iw.addDocument(document);
    }
    iw.forceMerge(1);
    iw.close();

    openDirectoryReader();
    assertThat("should have one segment after force merge", directoryReader.leaves().size(), equalTo(1));

    for (int i = 0; i < numDocs; i++) {
        Query roleQuery = new TermQuery(new Term("field", "value" + i));
        DirectoryReader wrappedReader = DocumentSubsetReader.wrap(directoryReader, bitsetFilterCache, roleQuery);

        LeafReader leafReader = wrappedReader.leaves().get(0).reader();
        assertThat(leafReader.hasDeletions(), is(true));
        assertThat(leafReader.numDocs(), equalTo(1));
        Bits liveDocs = leafReader.getLiveDocs();
        assertThat(liveDocs.length(), equalTo(numDocs));
        for (int docId = 0; docId < numDocs; docId++) {
            if (docId == i) {
                assertThat("docId [" + docId + "] should match", liveDocs.get(docId), is(true));
            } else {
                assertThat("docId [" + docId + "] should not match", liveDocs.get(docId), is(false));
            }
        }
    }
}
From source file:org.modeshape.jcr.index.lucene.query.ConstantScoreWeightQuery.java
License:Apache License
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
    Set<String> fieldSet = Collections.singleton(field);
    // return a weight which uses a constant (1.0f) scorer...
    return new RandomAccessWeight(this) {
        @Override
        protected Bits getMatchingDocs(LeafReaderContext context) throws IOException {
            LeafReader leafReader = context.reader();
            Bits liveDocs = leafReader.getLiveDocs();
            // if liveDocs is null it means there are no deleted documents...
            int docsCount = liveDocs != null ? liveDocs.length() : leafReader.numDocs();
            FixedBitSet result = new FixedBitSet(leafReader.maxDoc());
            for (int i = 0; i < docsCount; i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    continue;
                }
                Document document = leafReader.document(i, fieldSet);
                IndexableField[] fields = document.getFields(field);
                if (fields.length == 0) {
                    // the document doesn't have the field...
                    continue;
                }
                if (areValid(fields)) {
                    result.set(i);
                }
            }
            return result.cardinality() > 0 ? result : null;
        }
    };
}
From source file:org.voyanttools.trombone.lucene.CorpusMapper.java
License:Open Source License
/**
 * This should not be called, except from the private build() method.
 * @throws IOException
 */
private void buildFromTermsEnum() throws IOException {
    LeafReader reader = SlowCompositeReaderWrapper
            .wrap(storage.getLuceneManager().getDirectoryReader(corpus.getId()));
    Terms terms = reader.terms("id");
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int doc;
    String id;
    Set<String> ids = new HashSet<String>(getCorpusDocumentIds());
    bitSet = new SparseFixedBitSet(reader.numDocs());
    Bits liveBits = reader.getLiveDocs();
    while (bytesRef != null) {
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
        doc = postingsEnum.nextDoc();
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            id = bytesRef.utf8ToString();
            if (ids.contains(id)) {
                bitSet.set(doc);
                luceneIds.add(doc);
                documentIdToLuceneIdMap.put(id, doc);
                luceneIdToDocumentIdMap.put(doc, id);
            }
        }
        bytesRef = termsEnum.next();
    }
    this.reader = new FilteredCorpusReader(reader, bitSet);
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentTerms.java
License:Open Source License
private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords) throws IOException {
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null;
    Bits docIdBitSet = corpusMapper.getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    Bits allBits = new Bits.MatchAllBits(reader.numDocs());
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String termString = bytesRef.utf8ToString();
                    if (whiteList.isEmpty() == false && whiteList.isKeyword(termString) == false) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}