Usage examples for the Lucene method `org.apache.lucene.index.IndexReader.numDocs()`.
public abstract int numDocs();
From source file:org.neo4j.kernel.api.impl.schema.sampler.NonUniqueLuceneIndexSampler.java
License:Open Source License
// Samples a non-unique Lucene index: walks every term of the sampled fields in
// each leaf (segment) reader and records (term, docFreq) pairs in the sampler.
@Override
protected IndexSample performSampling() throws IndexNotFoundKernelException {
    NonUniqueIndexSampler sampler = new NonUniqueIndexSampler(indexSamplingConfig.sampleSizeLimit());
    IndexReader indexReader = indexSearcher.getIndexReader();
    // Iterate per-segment; terms are enumerated independently in each leaf.
    for (LeafReaderContext readerContext : indexReader.leaves()) {
        try {
            Set<String> fieldNames = getFieldNamesToSample(readerContext);
            for (String fieldName : fieldNames) {
                Terms terms = readerContext.reader().terms(fieldName);
                if (terms != null) { // field may be absent from this segment
                    TermsEnum termsEnum = LuceneDocumentStructure.originalTerms(terms, fieldName);
                    BytesRef termsRef;
                    while ((termsRef = termsEnum.next()) != null) {
                        sampler.include(termsRef.utf8ToString(), termsEnum.docFreq());
                        // Allow a long-running sampling job to be aborted mid-scan.
                        checkCancellation();
                    }
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
    // numDocs() is the live (non-deleted) document count of the whole index.
    return sampler.result(indexReader.numDocs());
}
From source file:org.netbeans.modules.jackpot30.indexer.usages.IndexerImplTest.java
License:Open Source License
// Verifies that re-indexing the same file is idempotent with respect to the
// document count: indexing Test.java twice must leave the store with exactly
// the same number of documents as indexing it once.
public void testRepeatedIndexing() throws IOException {
    final FileObject root = FileUtil.toFileObject(getWorkDir());
    FileObject testFile = FileUtil.createData(root, "Test.java");
    copyToFile(testFile, "public class Test {}");
    // First round: one fake document plus one real indexing pass.
    Directory store = new RAMDirectory();
    IndexWriter iw = new IndexWriter(store, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    IndexAccessor.current = new IndexAccessor(iw, root);
    iw.addDocument(fakeDocument(testFile));
    doIndex(root, testFile);
    iw.close();
    IndexReader ir = IndexReader.open(store);
    // 3 documents from doIndex + 1 fake document.
    int expectedDocumentsCount = ir.numDocs();
    assertEquals(3 + 1, expectedDocumentsCount);
    // Second round: identical setup, but doIndex runs twice — the count must
    // not grow (repeated indexing replaces rather than duplicates documents).
    store = new RAMDirectory();
    iw = new IndexWriter(store, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    IndexAccessor.current = new IndexAccessor(iw, root);
    iw.addDocument(fakeDocument(testFile));
    doIndex(root, testFile);
    doIndex(root, testFile);
    iw.close();
    ir = IndexReader.open(store);
    assertEquals(expectedDocumentsCount, ir.numDocs());
}
From source file:org.ninit.models.bm25.BM25BooleanScorer.java
License:Apache License
/**
 * Builds a BM25 boolean scorer from SHOULD, MUST and NOT clauses.
 * Any clause group that is {@code null} or empty falls back to a
 * {@link MatchAllBooleanScorer} over all live documents.
 *
 * @param reader     index reader used to build the per-term scorers
 * @param should     optional SHOULD clauses (may be null or empty)
 * @param must       optional MUST clauses (may be null or empty)
 * @param not        optional NOT clauses (may be null or empty)
 * @param similarity similarity implementation passed to every sub-scorer
 * @throws IOException if a term scorer cannot be created
 */
public BM25BooleanScorer(IndexReader reader, BooleanTermQuery[] should, BooleanTermQuery[] must,
        BooleanTermQuery[] not, Similarity similarity) throws IOException {
    super(similarity);
    // numDocs() is the live (non-deleted) document count.
    this.ndocs = reader.numDocs();
    if (should != null && should.length > 0) {
        this.shouldBooleanScorer = new ShouldBooleanScorer(similarity,
                buildTermScorers(reader, should, similarity));
    } else {
        this.shouldBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
    }
    if (must != null && must.length > 0) {
        this.mustBooleanScorer = new MustBooleanScorer(similarity,
                buildTermScorers(reader, must, similarity));
    } else {
        this.mustBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
    }
    if (not != null && not.length > 0) {
        // NOT needs the total document count to invert the matching set.
        this.notBooleanScorer = new NotBooleanScorer(similarity,
                buildTermScorers(reader, not, similarity), this.ndocs);
    } else {
        this.notBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
    }
}

/** Builds one {@link BM25TermScorer} per boolean clause (extracted from triplicated loop). */
private static Scorer[] buildTermScorers(IndexReader reader, BooleanTermQuery[] queries,
        Similarity similarity) throws IOException {
    Scorer[] scorers = new Scorer[queries.length];
    for (int i = 0; i < scorers.length; i++) {
        scorers[i] = new BM25TermScorer(reader, queries[i].termQuery, similarity);
    }
    return scorers;
}
From source file:org.ninit.models.bm25.BM25BooleanScorer.java
License:Apache License
/**
 * Builds a BM25F (fielded) boolean scorer from SHOULD, MUST and NOT clauses.
 * Any clause group that is {@code null} or empty falls back to a
 * {@link MatchAllBooleanScorer} over all live documents.
 *
 * @param reader     index reader used to build the per-term scorers
 * @param should     optional SHOULD clauses (may be null or empty)
 * @param must       optional MUST clauses (may be null or empty)
 * @param not        optional NOT clauses (may be null or empty)
 * @param similarity similarity implementation passed to every sub-scorer
 * @param fields     fields combined by BM25F
 * @param boosts     per-field boosts (parallel to {@code fields})
 * @param bParams    per-field length-normalization parameters (parallel to {@code fields})
 * @throws IOException if a term scorer cannot be created
 */
public BM25BooleanScorer(IndexReader reader, BooleanTermQuery[] should, BooleanTermQuery[] must,
        BooleanTermQuery[] not, Similarity similarity, String[] fields, float[] boosts,
        float[] bParams) throws IOException {
    super(similarity);
    // numDocs() is the live (non-deleted) document count.
    this.ndocs = reader.numDocs();
    if (should != null && should.length > 0) {
        this.shouldBooleanScorer = new ShouldBooleanScorer(similarity,
                buildFTermScorers(reader, should, fields, boosts, bParams, similarity));
    } else {
        this.shouldBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
    }
    if (must != null && must.length > 0) {
        this.mustBooleanScorer = new MustBooleanScorer(similarity,
                buildFTermScorers(reader, must, fields, boosts, bParams, similarity));
    } else {
        this.mustBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
    }
    if (not != null && not.length > 0) {
        // NOT needs the total document count to invert the matching set.
        this.notBooleanScorer = new NotBooleanScorer(similarity,
                buildFTermScorers(reader, not, fields, boosts, bParams, similarity), this.ndocs);
    } else {
        this.notBooleanScorer = new MatchAllBooleanScorer(similarity, this.ndocs);
    }
}

/** Builds one {@link BM25FTermScorer} per boolean clause (extracted from triplicated loop). */
private static Scorer[] buildFTermScorers(IndexReader reader, BooleanTermQuery[] queries,
        String[] fields, float[] boosts, float[] bParams, Similarity similarity) throws IOException {
    Scorer[] scorers = new Scorer[queries.length];
    for (int i = 0; i < scorers.length; i++) {
        scorers[i] = new BM25FTermScorer(reader, queries[i].termQuery, fields, boosts, bParams, similarity);
    }
    return scorers;
}
From source file:org.ninit.models.bm25.BM25SingleBooleanScorer.java
License:Apache License
/**
 * Scores a group of boolean clauses that all share one occurrence type.
 * The occurrence of the first clause decides which wrapper is used:
 * MUST, SHOULD, or (any other value) NOT.
 *
 * @param reader     index reader used to build the per-term scorers
 * @param termQuery  clauses to score; all assumed to share {@code termQuery[0].occur}
 * @param similarity similarity implementation passed to every sub-scorer
 * @throws IOException if a term scorer cannot be created
 */
public BM25SingleBooleanScorer(IndexReader reader, BooleanTermQuery[] termQuery, Similarity similarity)
        throws IOException {
    super(similarity);
    Scorer[] clauseScorers = new Scorer[termQuery.length];
    for (int idx = 0; idx < clauseScorers.length; idx++) {
        clauseScorers[idx] = new BM25TermScorer(reader, termQuery[idx].termQuery, similarity);
    }
    switch (termQuery[0].occur) {
    case MUST:
        this.booleanScorer = new MustBooleanScorer(similarity, clauseScorers);
        break;
    case SHOULD:
        this.booleanScorer = new ShouldBooleanScorer(similarity, clauseScorers);
        break;
    default:
        // NOT scoring needs the live document count to invert the matching set.
        this.booleanScorer = new NotBooleanScorer(similarity, clauseScorers, reader.numDocs());
        break;
    }
}
From source file:org.ninit.models.bm25.BM25SingleBooleanScorer.java
License:Apache License
/**
 * Scores a group of fielded (BM25F) boolean clauses that all share one
 * occurrence type. The occurrence of the first clause decides which wrapper
 * is used: MUST, SHOULD, or (any other value) NOT.
 *
 * @param reader     index reader used to build the per-term scorers
 * @param termQuery  clauses to score; all assumed to share {@code termQuery[0].occur}
 * @param similarity similarity implementation passed to every sub-scorer
 * @param fields     fields combined by BM25F
 * @param boosts     per-field boosts (parallel to {@code fields})
 * @param bParams    per-field length-normalization parameters (parallel to {@code fields})
 * @throws IOException if a term scorer cannot be created
 */
public BM25SingleBooleanScorer(IndexReader reader, BooleanTermQuery[] termQuery, Similarity similarity,
        String[] fields, float[] boosts, float[] bParams) throws IOException {
    super(similarity);
    Scorer[] clauseScorers = new Scorer[termQuery.length];
    for (int idx = 0; idx < clauseScorers.length; idx++) {
        clauseScorers[idx] = new BM25FTermScorer(reader, termQuery[idx].termQuery, fields, boosts,
                bParams, similarity);
    }
    switch (termQuery[0].occur) {
    case MUST:
        this.booleanScorer = new MustBooleanScorer(similarity, clauseScorers);
        break;
    case SHOULD:
        this.booleanScorer = new ShouldBooleanScorer(similarity, clauseScorers);
        break;
    default:
        // NOT scoring needs the live document count to invert the matching set.
        this.booleanScorer = new NotBooleanScorer(similarity, clauseScorers, reader.numDocs());
        break;
    }
}
From source file:org.ninit.models.bm25.BM25TermScorer.java
License:Apache License
public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity) throws IOException { super(similarity); this.reader = reader; this.term = term; this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs()); this.norm = this.reader.norms(this.term.getTerm().field()); this.av_length = BM25Parameters.getAverageLength(this.term.getTerm().field()); this.b = BM25Parameters.getB(); this.k1 = BM25Parameters.getK1(); this.termDocs = this.reader.termDocs(this.term.getTerm()); }
From source file:org.niord.core.LuceneTest.java
License:Apache License
// Indexes a single sentence with positions/term vectors, then checks that a
// ComplexPhraseQueryParser query mixing a quoted phrase (containing the stop
// word "the"), a required term, a prohibited term and a wildcard still matches.
@Test
public void testLucene() throws IOException, ParseException {
    // Something seems to have change in Lucene. In earlier versions (e.g. 4.6), you could
    // use the StandardAnalyzer and quoted phrase searches including stop words (e.g. "the")
    // would still work. See example below:
    // However, in the current version of Lucene, I can only get this scenario to work with
    // SimpleAnalyzer for the QueryParser :-(
    Directory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(directory, iwc);
    // Field type with positions and term vectors so phrase queries can run.
    Document doc = new Document();
    FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.freeze();
    doc.add(new Field("message", "The quick brown fox jumps over the lazy dog", ft));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(1, reader.numDocs());
    // NB: SimpleAnalyzer (no stop-word removal) for the query side only.
    analyzer = new SimpleAnalyzer();
    QueryParser parser = new ComplexPhraseQueryParser("message", analyzer);
    parser.setDefaultOperator(QueryParser.OR_OPERATOR);
    parser.setAllowLeadingWildcard(true); // NB: Expensive!
    Query query = parser.parse("\"over the lazy\" +quick -goat bro*");
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs results = searcher.search(query, 10000);
    assertEquals(1, results.scoreDocs.length);
    reader.close();
}
From source file:org.nlp4l.lucene.BuddyWordsFinderTermFilter.java
License:Apache License
public final void start(IndexReader reader, String field, BytesRef term) throws IOException { this.reader = reader; numDocs = reader.numDocs(); // TODO: can we ignore term value? currentTE = MultiFields.getTerms(reader, field).iterator(); if (currentTE != null) { currentTE.seekCeil(term);/* ww w . j a v a 2s. c o m*/ currentTermDocFreq = currentTE.docFreq(); } }
From source file:org.ohdsi.usagi.tests.TestLucene.java
License:Apache License
public static void main(String[] args) throws IOException, ParseException { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9); //Analyzer analyzer = new UsagiAnalyzer(); FieldType textVectorField = new FieldType(); textVectorField.setIndexed(true);//w w w.j ava2 s . c o m textVectorField.setTokenized(true); textVectorField.setStoreTermVectors(true); textVectorField.setStoreTermVectorPositions(false); textVectorField.setStoreTermVectorPayloads(false); textVectorField.setStoreTermVectorOffsets(false); textVectorField.setStored(true); textVectorField.freeze(); File indexFolder = new File(folder); if (indexFolder.exists()) DirectoryUtilities.deleteDir(indexFolder); Directory dir = FSDirectory.open(indexFolder); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer); iwc.setOpenMode(OpenMode.CREATE); iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); Document doc = new Document(); doc.add(new Field("F", "word1 word2 w3 word4", textVectorField)); writer.addDocument(doc); doc = new Document(); doc.add(new Field("F", "word1 word2 w3", textVectorField)); writer.addDocument(doc); writer.close(); IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(folder))); for (int i = 0; i < reader.numDocs(); i++) { TermsEnum termsEnum = reader.getTermVector(i, "F").iterator(null); BytesRef text; while ((text = termsEnum.next()) != null) { System.out.print(text.utf8ToString() + ","); } System.out.println(); } IndexSearcher searcher = new IndexSearcher(reader); // MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader()); // mlt.setMinTermFreq(0); // mlt.setMinDocFreq(0); // mlt.setMaxDocFreq(9999); // mlt.setMinWordLen(0); // mlt.setMaxWordLen(9999); // mlt.setMaxDocFreqPct(100); // mlt.setMaxNumTokensParsed(9999); // mlt.setMaxQueryTerms(9999); // mlt.setStopWords(null); // mlt.setFieldNames(new String[] { "F" }); // mlt.setAnalyzer(new UsagiAnalyzer()); // Query query = mlt.like("F", new 
StringReader("Systolic blood pressure")); QueryParser parser = new QueryParser(Version.LUCENE_4_9, "F", analyzer); Query query = parser.parse("word1"); Explanation explanation = searcher.explain(query, 0); print(explanation); System.out.println(); explanation = searcher.explain(query, 1); print(explanation); System.out.println(); TopDocs topDocs = searcher.search(query, 99); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { System.out.println(scoreDoc.score + "\t" + reader.document(scoreDoc.doc).get("F")); } }