List of usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
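maxDoc() returns one greater than the largest document number that may appear in the index, so it also counts deleted documents; numDocs() counts only live ones. Before the examples below, here is a minimal sketch of the common "iterate all document slots" pattern, written against the pre-4.0 IndexReader API that most of the listed examples use; the class name and index path are hypothetical.

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class MaxDocExample {
    public static void main(String[] args) throws Exception {
        // hypothetical index location
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/my-index")));
        try {
            // maxDoc() includes deleted slots; numDocs() is the live document count
            int max = reader.maxDoc();
            System.out.println(max + " slots, " + reader.numDocs() + " live documents");
            for (int i = 0; i < max; i++) {
                if (reader.isDeleted(i)) {
                    continue; // skip deleted slots
                }
                Document doc = reader.document(i);
                // ... use the stored fields of doc ...
            }
        } finally {
            reader.close();
        }
    }
}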
From source file:com.esri.gpt.catalog.lucene.stats.SingleFieldStats.java
License:Apache License
/**
 * Executes the collection of statistics.
 * @param request the active statistics request
 * @param reader the index reader
 * @throws IOException if an error occurs while communicating with the index
 */
public void collectStats(StatsRequest request, IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    TermEnum termEnum = null;
    TermDocs termDocs = null;
    try {
        OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet();
        OpenBitSet docsWithFieldBitSet = new OpenBitSet(reader.maxDoc());
        boolean isUnfiltered = (documentFilterBitSet == null);
        boolean checkMaxFreq = (this.maxFrequency > 0);
        boolean checkMinFreq = (this.minFrequency > 0);

        // return if there are no stats to collect
        if (this.determineNumberOfDocsConsidered(reader, documentFilterBitSet) <= 0) {
            return;
        } else if (!request.isFieldCollectable(this.fieldName)) {
            return;
        }

        boolean checkTermDocs = true;
        if (isUnfiltered) {
            MetadataAcl acl = new MetadataAcl(request.getRequestContext());
            if (acl.isPolicyUnrestricted()) {
                if (this.getNumberOfDocsConsidered() > 25000) {
                    checkTermDocs = false;
                }
            }
        }

        // accumulate term frequencies per field
        termEnum = reader.terms(new Term(this.fieldName));
        termDocs = reader.termDocs();
        do {
            Term term = termEnum.term();
            if (term != null && term.field().equals(this.fieldName)) {
                if (checkTermDocs) {
                    termDocs.seek(term);
                    long count = 0;
                    while (termDocs.next()) {
                        int docId = termDocs.doc();
                        boolean bSet = isUnfiltered || documentFilterBitSet.fastGet(docId);
                        if (bSet) {
                            docsWithFieldBitSet.fastSet(docId);
                            count++;
                        }
                    }
                    if ((!checkMaxFreq || (count <= this.maxFrequency))
                            && (!checkMinFreq || (count >= this.minFrequency))) {
                        this.termAccumulator.add(term.text(), count);
                    }
                } else {
                    long count = termEnum.docFreq();
                    if ((!checkMaxFreq || (count <= this.maxFrequency))
                            && (!checkMinFreq || (count >= this.minFrequency))) {
                        this.termAccumulator.add(term.text(), count);
                    }
                }
            } else {
                break;
            }
        } while (termEnum.next());

        // sort
        this.numberOfDocsWithField = docsWithFieldBitSet.cardinality();
        if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) {
            this.termAccumulator.sortByName();
        } else {
            this.termAccumulator.sortByFrequency();
        }

    } finally {
        try {
            if (termEnum != null) termEnum.close();
        } catch (Exception ef) {
        }
        try {
            if (termDocs != null) termDocs.close();
        } catch (Exception ef) {
        }
        this.setTimeMillis(System.currentTimeMillis() - t1);
    }

    // print
    if (request.getResponseWriter() != null) {
        this.print(request);
    }
}
From source file:com.esri.gpt.catalog.lucene.stats.SingleTermStats.java
License:Apache License
/**
 * Executes the collection of statistics.
 * @param request the active statistics request
 * @param reader the index reader
 * @throws IOException if an error occurs while communicating with the index
 */
public void collectStats(StatsRequest request, IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    TermEnum termEnum = null;
    TermDocs termDocs = null;
    try {
        OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet();
        OpenBitSet docsWithTermBitSet = new OpenBitSet(reader.maxDoc());
        boolean isUnfiltered = (documentFilterBitSet == null);

        // return if there are no stats to collect
        String[] fieldNames = request.getCollectableFieldNames(reader);
        if (this.determineNumberOfDocsConsidered(reader, documentFilterBitSet) <= 0) {
            return;
        } else if ((fieldNames == null) || (fieldNames.length == 0)) {
            return;
        }

        // accumulate term frequencies per field
        termDocs = reader.termDocs();
        for (String fieldName : fieldNames) {
            termEnum = reader.terms(new Term(fieldName, this.text));
            do {
                Term term = termEnum.term();
                if (term != null && term.field().equals(fieldName)) {
                    if (!term.text().equals(this.text)) {
                        break;
                    }
                    termDocs.seek(term);
                    long count = 0;
                    while (termDocs.next()) {
                        int docId = termDocs.doc();
                        boolean bSet = isUnfiltered || documentFilterBitSet.fastGet(docId);
                        if (bSet) {
                            docsWithTermBitSet.fastSet(docId);
                            count++;
                            //this.fieldAccumulator.add(fieldName,termDocs.freq());
                        }
                    }
                    this.fieldAccumulator.add(fieldName, count);
                } else {
                    break;
                }
            } while (termEnum.next());
            termEnum.close();
            termEnum = null;
        }

        // sort
        this.numberOfDocsWithTerm = docsWithTermBitSet.cardinality();
        if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) {
            this.fieldAccumulator.sortByName();
        } else {
            this.fieldAccumulator.sortByFrequency();
        }

    } finally {
        try {
            if (termEnum != null) termEnum.close();
        } catch (Exception ef) {
        }
        try {
            if (termDocs != null) termDocs.close();
        } catch (Exception ef) {
        }
        this.setTimeMillis(System.currentTimeMillis() - t1);
    }

    // print
    if (request.getResponseWriter() != null) {
        this.print(request);
    }
}
From source file:com.esri.gpt.server.assertion.index.AsnFilter.java
License:Apache License
/**
 * Applies the filter and returns a DocIdSet of matching documents.
 * @param reader the index reader
 * @return the DocIdSet of matching documents
 * @throws IOException if an exception is encountered while reading the index
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    OpenBitSet bitSet = null;
    if (this.chain == null) {
        bitSet = this.queryValue(reader);
    } else {
        String operand = Val.chkStr(this.chainedOperand);
        bitSet = new OpenBitSet(reader.maxDoc());
        for (int i = 0; i < this.chain.length; i++) {
            AsnFilter asnFilter = this.chain[i];
            if (i == 0) {
                bitSet = (OpenBitSet) asnFilter.getDocIdSet(reader);
            } else {
                OpenBitSet subBitSet = (OpenBitSet) asnFilter.getDocIdSet(reader);
                if (operand.equalsIgnoreCase(AsnFilter.CHAINED_OPERAND_AND)) {
                    if (bitSet.cardinality() > 0) {
                        bitSet.and(subBitSet);
                    }
                } else if (operand.equalsIgnoreCase(AsnFilter.CHAINED_OPERAND_OR)) {
                    bitSet.or(subBitSet);
                } else if (operand.equalsIgnoreCase(AsnFilter.CHAINED_OPERAND_ANDNOT)) {
                    if (bitSet.cardinality() > 0) {
                        bitSet.andNot(subBitSet);
                    }
                } else {
                    if (bitSet.cardinality() > 0) {
                        bitSet.and(subBitSet);
                    }
                }
            }
        }
    }
    setTimeMillis(System.currentTimeMillis() - t1);
    return bitSet;
}
From source file:com.esri.gpt.server.assertion.index.AsnFilter.java
License:Apache License
/**
 * Queries for documents that match the supplied value.
 * @param reader the index reader
 * @return the OpenBitSet (documents with matches are set to true)
 * @throws IOException if an exception is encountered while reading the index
 */
private OpenBitSet queryValue(IndexReader reader) throws IOException {
    OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
    if ((this.value != null) && (this.value.length() > 0)) {
        TermDocs termDocs = null;
        try {
            Term term = new Term(this.fieldName, this.value);
            termDocs = reader.termDocs();
            termDocs.seek(term);
            while (termDocs.next()) {
                bitSet.set(termDocs.doc());
            }
        } finally {
            try {
                if (termDocs != null) termDocs.close();
            } catch (Exception ef) {
            }
        }
    }
    return bitSet;
}
From source file:com.facebook.presto.example.LuceneRecordCursor.java
License:Apache License
public LuceneRecordCursor(List<LuceneColumnHandle> columnHandles) throws ParseException {
    this.columnHandles = columnHandles;
    IndexReader reader = null;
    try {
        reader = DirectoryReader
                .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index")));
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    searcher = new IndexSearcher(reader);
    this.NumDoc = reader.maxDoc();
    fieldToColumnIndex = new int[columnHandles.size()];
    for (int i = 0; i < columnHandles.size(); i++) {
        LuceneColumnHandle columnHandle = columnHandles.get(i);
        fieldToColumnIndex[i] = columnHandle.getOrdinalPosition();
    }
}
From source file:com.flaptor.hounder.indexer.IndexManager.java
License:Apache License
/**
 * Searches the whole index to find the largest AddId.
 * @return the largest AddId found in the index.
 */
private long findLargestAddId() {
    long max = 1;
    IndexReader reader = null;
    try {
        reader = workIndex.getReader();
        int num = reader.maxDoc();
        for (int i = 0; i < num; i++) {
            if (!reader.isDeleted(i)) {
                String val = reader.document(i).get("AddId");
                if (null != val) {
                    long n = new Long(val).longValue();
                    if (max < n) {
                        max = n;
                    }
                }
            }
        }
    } catch (IOException e) {
        logger.fatal("Could not read from the index to get the last AddId." + e);
        throw new RuntimeException("Error reading the index when looking for initial AddId.", e);
    } finally {
        Execute.close(reader, logger);
    }
    logger.debug("Largest AddId found: " + max);
    return max;
}
From source file:com.flaptor.hounder.indexer.MultiIndexerTest.java
License:Apache License
@TestInfo(testType = TestInfo.TestType.UNIT, requiresPort = { 30000, 31000, 32000 })
public void testMultiIndexer() throws Exception {
    MultiIndexer multiIndexer = new MultiIndexer();
    for (int i = 0; i < documents; i++) {
        String document = "<documentAdd><documentId>doc" + i
                + "</documentId><field name=\"content\" stored=\"true\" indexed=\"true\" tokenized=\"true\">content "
                + i + "</field></documentAdd>";
        multiIndexer.index(document);
    }
    Execute.sleep(5000);
    multiIndexer.requestStop();

    int totalDocumentsFound = 0;
    int minDocsFound = documents / (2 * numIndexers);
    for (String tmpDir : tmpDirs) {
        String sep = java.io.File.separator;
        IndexReader reader = IndexReader.open(tmpDir + sep + "indexer" + sep + "indexes" + sep + "index");
        int docsFound = reader.maxDoc();
        reader.close();
        assertTrue("too few documents indexed. Found " + docsFound + ", expected at least " + minDocsFound,
                docsFound > minDocsFound);
        totalDocumentsFound += docsFound;
    }
    assertEquals("Did not index every document.", totalDocumentsFound, documents);
}
From source file:com.flaptor.hounder.indexer.MultiIndexerTest.java
License:Apache License
@TestInfo(testType = TestInfo.TestType.UNIT, requiresPort = { 30000, 31000, 32000 })
public void testMixedNodesFails() throws Exception {
    // Mix up hosts
    Config config = Config.getConfig("multiIndexer.properties");
    String[] hosts = config.getStringArray("indexer.hosts");
    StringBuffer sb = new StringBuffer();
    for (int i = hosts.length - 1; i >= 0; i--) {
        sb.append(hosts[i]);
        sb.append(",");
    }
    config.set("indexer.hosts", sb.substring(0, sb.length() - 1));

    // filter output to avoid logging expected errors
    super.filterOutputRegex("Unfortunately");

    MultiIndexer multiIndexer = new MultiIndexer();
    for (int i = 0; i < documents; i++) {
        String document = "<documentAdd><documentId>doc" + i
                + "</documentId><field name=\"content\" stored=\"true\" indexed=\"true\" tokenized=\"true\">content "
                + i + "</field></documentAdd>";
        multiIndexer.index(document);
    }
    Execute.sleep(5000);
    multiIndexer.requestStop();

    // check that every index is empty
    for (String tmpDir : tmpDirs) {
        String sep = java.io.File.separator;
        IndexReader reader = IndexReader.open(tmpDir + sep + "indexer" + sep + "indexes" + sep + "index");
        assertEquals("indexed " + reader.maxDoc()
                + " documents on mixed indexer. Mixed indexers should not index anything.", 0, reader.maxDoc());
        reader.close();
    }
}
From source file:com.flaptor.hounder.util.Idx.java
License:Apache License
public static void main(String arg[]) throws Exception {
    check(arg.length > 1, null);
    String cmd = arg[0];
    File idx = new File(arg[1]);
    if ("list".equals(cmd)) {
        int num = (arg.length > 2) ? Integer.parseInt(arg[2]) : -1;
        check(idx.exists(), "Index dir not found");
        IndexReader reader = IndexReader.open(idx);
        int docs = reader.numDocs();
        int max = reader.maxDoc();
        System.err.println("Index contains " + docs + " documents plus " + (max - docs) + " deleted.");
        if (num > -1) {
            if (num == 0) num = docs;
            for (int i = 0; i < max && i < num; i++) {
                System.out.println("----------------------------------------");
                if (!reader.isDeleted(i)) {
                    Document doc = reader.document(i);
                    List flds = doc.getFields();
                    Iterator iter = flds.iterator();
                    while (iter.hasNext()) {
                        Field fld = (Field) iter.next();
                        String attr = (fld.isIndexed() ? ",i" : "") + (fld.isStored() ? ",s" : "")
                                + (fld.isTokenized() ? ",t" : "");
                        System.out.println(fld.name() + attr + ": " + fld.stringValue());
                    }
                }
            }
            reader.close();
            System.out.println();
        }
    } else if ("search".equals(cmd)) {
        check(idx.exists(), "Index dir not found");
        check(arg.length > 3, "Not enough arguments");
        String field = arg[2];
        String value = arg[3];
        IndexSearcher searcher = new IndexSearcher(IndexReader.open(idx));
        ScorelessHitCollector collector = new HashSetScorelessHitCollector();
        searcher.search(new TermQuery(new Term(field, value)), collector);
        Set<Integer> docIds = collector.getMatchingDocuments();
        System.out.println("\nNumber of hits: " + docIds.size() + "\n");
        for (Integer docId : docIds) {
            Document doc = searcher.doc(docId);
            List flds = doc.getFields();
            Iterator iter = flds.iterator();
            while (iter.hasNext()) {
                Field fld = (Field) iter.next();
                System.out.println(fld.name() + ": " + fld.stringValue());
            }
        }
        searcher.close();
        System.out.println();
    } else if ("delete".equals(cmd)) {
        check(idx.exists(), "Index dir not found");
        check(arg.length > 3, "Not enough arguments");
        String field = arg[2];
        String value = arg[3];
        IndexReader reader = IndexReader.open(idx);
        reader.deleteDocuments(new Term(field, value));
        reader.close();
    } else if ("optimize".equals(cmd)) {
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.optimize();
        writer.close();
    } else if ("merge".equals(cmd)) {
        check(arg.length == 3, "not enough parameters");
        File idx2 = new File(arg[2]);
        check(idx.exists(), "Index dir 1 not found");
        check(idx2.exists(), "Index dir 2 not found");
        IndexReader reader = IndexReader.open(idx2);
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.addIndexes(new IndexReader[] { reader });
        writer.close();
        reader.close();
    } else if ("term-count".equals(cmd)) {
        check(arg.length == 3, "not enough parameters");
        check(idx.exists(), "Index dir not found");
        IndexReader reader = IndexReader.open(idx);
        String field = arg[2];
        int count = 0;
        TermEnum terms = reader.terms();
        while (terms.next()) {
            Term term = terms.term();
            if (term.field().equals(field)) count++;
        }
        terms.close();
        reader.close();
        System.out.println("Found " + count + " different values for field " + field);
    } else if ("hit-count".equals(cmd)) {
        check(arg.length > 3, "Not enough arguments");
        check(idx.exists(), "Index dir not found");
        String field = arg[2];
        String value = arg[3];
        IndexSearcher searcher = new IndexSearcher(IndexReader.open(idx));
        CountingHitCollector collector = new CountingHitCollector();
        searcher.search(new TermQuery(new Term(field, value)), collector);
        System.out.println("\nNumber of hits: " + collector.getDocCount() + "\n");
        searcher.close();
    } else if ("uncompound".equals(cmd)) {
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(false);
        writer.optimize();
        writer.close();
    } else if ("compound".equals(cmd)) {
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(true);
        writer.optimize();
        writer.close();
    } else if ("terms".equals(cmd)) {
        check(arg.length == 3, "not enough parameters");
        check(idx.exists(), "Index dir not found");
        String field = arg[2];
        IndexReader reader = IndexReader.open(idx);
        TermEnum terms = reader.terms();
        while (terms.next()) {
            Term t = terms.term();
            if (t.field().equals(field)) {
                System.out.println(t.text());
            }
        }
    }
}
From source file:com.github.flaxsearch.resources.DocumentResource.java
License:Apache License
@GET
public DocumentData getDocument(@QueryParam("segment") Integer segment, @PathParam("docId") int doc)
        throws IOException {
    IndexReader reader = segment == null ? readerManager.getIndexReader()
            : readerManager.getLeafReader(segment);
    // valid document ids range from 0 to maxDoc() - 1
    if (doc < 0 || doc >= reader.maxDoc()) {
        throw new WebApplicationException("Unknown document " + doc, Response.Status.NOT_FOUND);
    }
    Document document = reader.document(doc);
    return new DocumentData(document);
}