Example usage for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usage of org.apache.lucene.index.IndexReader.maxDoc().

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number. This may be used, for example, to determine how big to allocate an array that will have an element for every document number in the index. Note that this count includes deleted documents that have not yet been merged away; numDocs() returns the number of live documents only.
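
Because document numbers run from 0 to maxDoc()-1 and may include deletions, maxDoc() is typically paired with numDocs() or isDeleted(int) when scanning an index. The following minimal sketch illustrates the relationship; it assumes a Lucene 3.x-era API (matching most of the examples on this page) and takes a hypothetical index path as its argument.

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class MaxDocDemo {
    public static void main(String[] args) throws Exception {
        // args[0]: path to an existing index directory (example argument)
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(args[0])));
        try {
            int maxDoc = reader.maxDoc();   // one greater than the largest document number
            int numDocs = reader.numDocs(); // live (non-deleted) documents only
            System.out.println("maxDoc=" + maxDoc + " live=" + numDocs + " deleted=" + (maxDoc - numDocs));

            // Valid document numbers are 0 .. maxDoc()-1, but some of them may be deleted.
            for (int i = 0; i < maxDoc; i++) {
                if (!reader.isDeleted(i)) {
                    // reader.document(i) is safe to call here
                }
            }
        } finally {
            reader.close();
        }
    }
}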

Usage

From source file:com.esri.gpt.catalog.lucene.stats.SingleFieldStats.java

License:Apache License

/**
 * Executes the collection of statistics.
 * @param request the active statistics request
 * @param reader the index reader
 * @throws IOException if an error occurs while communicating with the index
 */
public void collectStats(StatsRequest request, IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    TermEnum termEnum = null;
    TermDocs termDocs = null;

    try {

        OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet();
        OpenBitSet docsWithFieldBitSet = new OpenBitSet(reader.maxDoc());
        boolean isUnfiltered = (documentFilterBitSet == null);
        boolean checkMaxFreq = (this.maxFrequency > 0);
        boolean checkMinFreq = (this.minFrequency > 0);

        // return if there are no stats to collect
        if (this.determineNumberOfDocsConsidered(reader, documentFilterBitSet) <= 0) {
            return;
        } else if (!request.isFieldCollectable(this.fieldName)) {
            return;
        }

        boolean checkTermDocs = true;
        if (isUnfiltered) {
            MetadataAcl acl = new MetadataAcl(request.getRequestContext());
            if (acl.isPolicyUnrestricted()) {
                if (this.getNumberOfDocsConsidered() > 25000) {
                    checkTermDocs = false;
                }
            }
        }

        // accumulate term frequencies per field
        termEnum = reader.terms(new Term(this.fieldName));
        termDocs = reader.termDocs();
        do {
            Term term = termEnum.term();
            if (term != null && term.field().equals(this.fieldName)) {

                if (checkTermDocs) {
                    termDocs.seek(term);
                    long count = 0;
                    while (termDocs.next()) {
                        int docId = termDocs.doc();
                        boolean bSet = isUnfiltered || documentFilterBitSet.fastGet(docId);
                        if (bSet) {
                            docsWithFieldBitSet.fastSet(docId);
                            count++;
                        }
                    }
                    if ((!checkMaxFreq || (count <= this.maxFrequency))
                            && (!checkMinFreq || (count >= this.minFrequency))) {
                        this.termAccumulator.add(term.text(), count);
                    }

                } else {
                    long count = termEnum.docFreq();
                    if ((!checkMaxFreq || (count <= this.maxFrequency))
                            && (!checkMinFreq || (count >= this.minFrequency))) {
                        this.termAccumulator.add(term.text(), count);
                    }
                }

            } else {
                break;
            }
        } while (termEnum.next());

        // sort
        this.numberOfDocsWithField = docsWithFieldBitSet.cardinality();
        if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) {
            this.termAccumulator.sortByName();
        } else {
            this.termAccumulator.sortByFrequency();
        }

    } finally {
        try {
            if (termEnum != null)
                termEnum.close();
        } catch (Exception ef) {
        }
        try {
            if (termDocs != null)
                termDocs.close();
        } catch (Exception ef) {
        }
        this.setTimeMillis(System.currentTimeMillis() - t1);
    }

    // print
    if (request.getResponseWriter() != null) {
        this.print(request);
    }

}
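
A note on the pattern above (also used in the next two examples): the OpenBitSet is allocated with reader.maxDoc() bits because every document number returned by TermDocs lies in [0, maxDoc()), so fastSet() can be used without bounds checks or growth. A minimal self-contained sketch of the idiom, again assuming the Lucene 3.x-era API:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.util.OpenBitSet;

public final class TermBitSets {

    /** Counts documents matching the given term using a maxDoc()-sized bit set. */
    public static long countMatches(IndexReader reader, Term term) throws IOException {
        // One bit per possible document number in the index.
        OpenBitSet bits = new OpenBitSet(reader.maxDoc());
        TermDocs termDocs = reader.termDocs(term);
        try {
            while (termDocs.next()) {
                bits.fastSet(termDocs.doc()); // doc() is always < reader.maxDoc()
            }
        } finally {
            termDocs.close();
        }
        return bits.cardinality();
    }
}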

From source file:com.esri.gpt.catalog.lucene.stats.SingleTermStats.java

License:Apache License

/**
 * Executes the collection of statistics.
 * @param request the active statistics request
 * @param reader the index reader
 * @throws IOException if an error occurs while communicating with the index
 */
public void collectStats(StatsRequest request, IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    TermEnum termEnum = null;
    TermDocs termDocs = null;
    try {
        OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet();
        OpenBitSet docsWithTermBitSet = new OpenBitSet(reader.maxDoc());
        boolean isUnfiltered = (documentFilterBitSet == null);

        // return if there are no stats to collect
        String[] fieldNames = request.getCollectableFieldNames(reader);
        if (this.determineNumberOfDocsConsidered(reader, documentFilterBitSet) <= 0) {
            return;
        } else if ((fieldNames == null) || (fieldNames.length == 0)) {
            return;
        }

    // accumulate term frequencies per field
        termDocs = reader.termDocs();
        for (String fieldName : fieldNames) {
            termEnum = reader.terms(new Term(fieldName, this.text));
            do {
                Term term = termEnum.term();
                if (term != null && term.field().equals(fieldName)) {
                    if (!term.text().equals(this.text)) {
                        break;
                    }

                    termDocs.seek(term);
                    long count = 0;
                    while (termDocs.next()) {
                        int docId = termDocs.doc();
                        boolean bSet = isUnfiltered || documentFilterBitSet.fastGet(docId);
                        if (bSet) {
                            docsWithTermBitSet.fastSet(docId);
                            count++;
                            //this.fieldAccumulator.add(fieldName,termDocs.freq());
                        }
                    }
                    this.fieldAccumulator.add(fieldName, count);

                } else {
                    break;
                }
            } while (termEnum.next());
            termEnum.close();
            termEnum = null;

        }

        // sort
        this.numberOfDocsWithTerm = docsWithTermBitSet.cardinality();
        if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) {
            this.fieldAccumulator.sortByName();
        } else {
            this.fieldAccumulator.sortByFrequency();
        }

    } finally {
        try {
            if (termEnum != null)
                termEnum.close();
        } catch (Exception ef) {
        }
        try {
            if (termDocs != null)
                termDocs.close();
        } catch (Exception ef) {
        }
        this.setTimeMillis(System.currentTimeMillis() - t1);
    }

    // print
    if (request.getResponseWriter() != null) {
        this.print(request);
    }
}

From source file:com.esri.gpt.server.assertion.index.AsnFilter.java

License:Apache License

/**
 * Applies the filter and returns a DocIdSet of matching documents.
 * @param reader the index reader
 * @return the DocIdSet of matching documents
 * @throws IOException if an exception is encountered while reading the index
 */
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    OpenBitSet bitSet = null;
    if (this.chain == null) {
        bitSet = this.queryValue(reader);
    } else {
        String operand = Val.chkStr(this.chainedOperand);
        bitSet = new OpenBitSet(reader.maxDoc());
        for (int i = 0; i < this.chain.length; i++) {
            AsnFilter asnFilter = this.chain[i];
            if (i == 0) {
                bitSet = (OpenBitSet) asnFilter.getDocIdSet(reader);
            } else {
                OpenBitSet subBitSet = (OpenBitSet) asnFilter.getDocIdSet(reader);
                if (operand.equalsIgnoreCase(AsnFilter.CHAINED_OPERAND_AND)) {
                    if (bitSet.cardinality() > 0) {
                        bitSet.and(subBitSet);
                    }
                } else if (operand.equalsIgnoreCase(AsnFilter.CHAINED_OPERAND_OR)) {
                    bitSet.or(subBitSet);
                } else if (operand.equalsIgnoreCase(AsnFilter.CHAINED_OPERAND_ANDNOT)) {
                    if (bitSet.cardinality() > 0) {
                        bitSet.andNot(subBitSet);
                    }
                } else {
                    if (bitSet.cardinality() > 0) {
                        bitSet.and(subBitSet);
                    }
                }
            }
        }
    }
    setTimeMillis(System.currentTimeMillis() - t1);
    return bitSet;
}

From source file:com.esri.gpt.server.assertion.index.AsnFilter.java

License:Apache License

/**
 * Queries for documents that match the supplied value.
 * @param reader the index reader
 * @return the OpenBitSet (documents with matches are set to true)
 * @throws IOException if an exception is encountered while reading the index
 */
private OpenBitSet queryValue(IndexReader reader) throws IOException {
    OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
    if ((this.value != null) && (this.value.length() > 0)) {
        TermDocs termDocs = null;
        try {
            Term term = new Term(this.fieldName, this.value);
            termDocs = reader.termDocs();
            termDocs.seek(term);
            while (termDocs.next()) {
                bitSet.set(termDocs.doc());
            }
        } finally {
            try {
                if (termDocs != null)
                    termDocs.close();
            } catch (Exception ef) {
            }
        }
    }
    return bitSet;
}

From source file:com.facebook.presto.example.LuceneRecordCursor.java

License:Apache License

public LuceneRecordCursor(List<LuceneColumnHandle> columnHandles) throws ParseException {

    this.columnHandles = columnHandles;

    IndexReader reader = null;
    try {
        reader = DirectoryReader
                .open(FSDirectory.open(Paths.get("/home/liyong/workspace-neno/lucenetest/index")));
    } catch (IOException e) {
        // Fail fast: continuing with a null reader would only defer the failure to an NPE below.
        throw new RuntimeException("Unable to open the Lucene index", e);
    }
    searcher = new IndexSearcher(reader);
    this.NumDoc = reader.maxDoc();

    fieldToColumnIndex = new int[columnHandles.size()];
    for (int i = 0; i < columnHandles.size(); i++) {
        LuceneColumnHandle columnHandle = columnHandles.get(i);
        fieldToColumnIndex[i] = columnHandle.getOrdinalPosition();
    }
}

From source file:com.flaptor.hounder.indexer.IndexManager.java

License:Apache License

/**
 * Searches the whole index to find the largest AddId.
 * @return the largest AddId found in the index.
 */
private long findLargestAddId() {
    long max = 1;
    IndexReader reader = null;
    try {
        reader = workIndex.getReader();
        int num = reader.maxDoc();
        for (int i = 0; i < num; i++) {
            if (!reader.isDeleted(i)) {
                String val = reader.document(i).get("AddId");
                if (null != val) {
                    long n = Long.parseLong(val);
                    if (max < n) {
                        max = n;
                    }
                }
            }
        }
    } catch (IOException e) {
        logger.fatal("Could not read from the index to get the last AddId." + e);
        throw new RuntimeException("Error reading the index when looking for initial AddId.", e);
    } finally {
        Execute.close(reader, logger);
    }
    logger.debug("Largest AddId found: " + max);
    return max;
}

From source file:com.flaptor.hounder.indexer.MultiIndexerTest.java

License:Apache License

@TestInfo(testType = TestInfo.TestType.UNIT, requiresPort = { 30000, 31000, 32000 })
public void testMultiIndexer() throws Exception {
    MultiIndexer multiIndexer = new MultiIndexer();

    for (int i = 0; i < documents; i++) {
        String document = "<documentAdd><documentId>doc" + i
                + "</documentId><field name=\"content\" stored=\"true\" indexed=\"true\" tokenized=\"true\">content "
                + i + "</field></documentAdd>";
        multiIndexer.index(document);
    }
    Execute.sleep(5000);
    multiIndexer.requestStop();

    int totalDocumentsFound = 0;
    int minDocsFound = documents / (2 * numIndexers);
    for (String tmpDir : tmpDirs) {
        String sep = java.io.File.separator;
        IndexReader reader = IndexReader.open(tmpDir + sep + "indexer" + sep + "indexes" + sep + "index");
        int docsFound = reader.maxDoc();
        reader.close();
        assertTrue("too few documents indexed. Found " + docsFound + ", expected at least" + minDocsFound,
                docsFound > minDocsFound);
        totalDocumentsFound += docsFound;
    }

    assertEquals("Did not index every document.", totalDocumentsFound, documents);
}

From source file:com.flaptor.hounder.indexer.MultiIndexerTest.java

License:Apache License

@TestInfo(testType = TestInfo.TestType.UNIT, requiresPort = { 30000, 31000, 32000 })
public void testMixedNodesFails() throws Exception {
    // Mix up hosts
    Config config = Config.getConfig("multiIndexer.properties");
    String[] hosts = config.getStringArray("indexer.hosts");
    StringBuffer sb = new StringBuffer();
    for (int i = hosts.length - 1; i >= 0; i--) {
        sb.append(hosts[i]);
        sb.append(",");
    }
    config.set("indexer.hosts", sb.substring(0, sb.length() - 1));

    // filter output to avoid logging expected errors
    super.filterOutputRegex("Unfortunately");

    MultiIndexer multiIndexer = new MultiIndexer();
    for (int i = 0; i < documents; i++) {
        String document = "<documentAdd><documentId>doc" + i
                + "</documentId><field name=\"content\" stored=\"true\" indexed=\"true\" tokenized=\"true\">content "
                + i + "</field></documentAdd>";
        multiIndexer.index(document);
    }
    Execute.sleep(5000);
    multiIndexer.requestStop();

    // check that every index is empty
    for (String tmpDir : tmpDirs) {
        String sep = java.io.File.separator;
        IndexReader reader = IndexReader.open(tmpDir + sep + "indexer" + sep + "indexes" + sep + "index");
        assertEquals(
                "indexed " + reader.maxDoc()
                        + " documents on mixed indexer. Mixed indexers should not index anything.",
                0, reader.maxDoc());
        reader.close();
    }
}

From source file:com.flaptor.hounder.util.Idx.java

License:Apache License

public static void main(String arg[]) throws Exception {
    check(arg.length > 1, null);
    String cmd = arg[0];
    File idx = new File(arg[1]);
    if ("list".equals(cmd)) {
        int num = (arg.length > 2) ? Integer.parseInt(arg[2]) : -1;
        check(idx.exists(), "Index dir not found");
        IndexReader reader = IndexReader.open(idx);
        int docs = reader.numDocs();
        int max = reader.maxDoc();
        System.err.println("Index contains " + docs + " documents plus " + (max - docs) + " deleted.");
        if (num > -1) {
            if (num == 0)
                num = docs;
            for (int i = 0; i < max && i < num; i++) {
                System.out.println("----------------------------------------");
                if (!reader.isDeleted(i)) {
                    Document doc = reader.document(i);
                    List flds = doc.getFields();
                    Iterator iter = flds.iterator();
                    while (iter.hasNext()) {
                        Field fld = (Field) iter.next();
                        String attr = (fld.isIndexed() ? ",i" : "") + (fld.isStored() ? ",s" : "")
                                + (fld.isTokenized() ? ",t" : "");
                        System.out.println(fld.name() + attr + ": " + fld.stringValue());
                    }
                }
            }
            System.out.println();
        }
        reader.close();
    } else if ("search".equals(cmd)) {
        check(idx.exists(), "Index dir not found");
        check(arg.length > 3, "Not enough arguments");
        String field = arg[2];
        String value = arg[3];
        IndexSearcher searcher = new IndexSearcher(IndexReader.open(idx));
        ScorelessHitCollector collector = new HashSetScorelessHitCollector();
        searcher.search(new TermQuery(new Term(field, value)), collector);
        Set<Integer> docIds = collector.getMatchingDocuments();
        System.out.println("\nNumber of hits: " + docIds.size() + "\n");
        for (Integer docId : docIds) {
            Document doc = searcher.doc(docId);
            List flds = doc.getFields();
            Iterator iter = flds.iterator();
            while (iter.hasNext()) {
                Field fld = (Field) iter.next();
                System.out.println(fld.name() + ": " + fld.stringValue());
            }
        }
        searcher.close();
        System.out.println();
    } else if ("delete".equals(cmd)) {
        check(idx.exists(), "Index dir not found");
        check(arg.length > 3, "Not enough arguments");
        String field = arg[2];
        String value = arg[3];
        IndexReader reader = IndexReader.open(idx);
        reader.deleteDocuments(new Term(field, value));
        reader.close();
    } else if ("optimize".equals(cmd)) {
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.optimize();
        writer.close();
    } else if ("merge".equals(cmd)) {
        check(arg.length == 3, "not enough parameters");
        File idx2 = new File(arg[2]);
        check(idx.exists(), "Index dir 1 not found");
        check(idx2.exists(), "Index dir 2 not found");
        IndexReader reader = IndexReader.open(idx2);
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.addIndexes(new IndexReader[] { reader });
        writer.close();
        reader.close();
    } else if ("term-count".equals(cmd)) {
        check(arg.length == 3, "not enough parameters");
        check(idx.exists(), "Index dir not found");
        IndexReader reader = IndexReader.open(idx);
        String field = arg[2];
        int count = 0;
        TermEnum terms = reader.terms();
        while (terms.next()) {
            Term term = terms.term();
            if (term.field().equals(field))
                count++;
        }
        terms.close();
        reader.close();
        System.out.println("Found " + count + " different values for field " + field);
    } else if ("hit-count".equals(cmd)) {
        check(arg.length > 3, "Not enough arguments");
        check(idx.exists(), "Index dir not found");
        String field = arg[2];
        String value = arg[3];
        IndexSearcher searcher = new IndexSearcher(IndexReader.open(idx));
        CountingHitCollector collector = new CountingHitCollector();
        searcher.search(new TermQuery(new Term(field, value)), collector);
        System.out.println("\nNumber of hits: " + collector.getDocCount() + "\n");
        searcher.close();
    } else if ("uncompound".equals(cmd)) {
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(false);
        writer.optimize();
        writer.close();
    } else if ("compound".equals(cmd)) {
        IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), false,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setUseCompoundFile(true);
        writer.optimize();
        writer.close();
    } else if ("terms".equals(cmd)) {
        check(arg.length == 3, "not enough parameters");
        check(idx.exists(), "Index dir not found");
        String field = arg[2];
        IndexReader reader = IndexReader.open(idx);
        TermEnum terms = reader.terms();
        while (terms.next()) {
            Term t = terms.term();
            if (t.field().equals(field)) {
                System.out.println(t.text());
            }
        }
        terms.close();
        reader.close();
    }

}

From source file:com.github.flaxsearch.resources.DocumentResource.java

License:Apache License

@GET
public DocumentData getDocument(@QueryParam("segment") Integer segment, @PathParam("docId") int doc)
        throws IOException {

    IndexReader reader = segment == null ? readerManager.getIndexReader()
            : readerManager.getLeafReader(segment);

    if (doc < 0 || doc >= reader.maxDoc()) {
        throw new WebApplicationException("Unknown document " + doc, Response.Status.NOT_FOUND);
    }

    Document document = reader.document(doc);
    return new DocumentData(document);
}