Example usage for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexReader.numDocs().

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
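
Note that numDocs() counts only live documents: deletions are excluded, whereas maxDoc() also counts deleted ones, a distinction several of the snippets below rely on when iterating document ids. Before the usage examples, here is a minimal sketch of calling it on an index on disk. It assumes a recent Lucene version where DirectoryReader replaced the older IndexReader.open(...) seen in some snippets below, and the index path is a placeholder:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Open a read-only view of an existing index ("/path/to/index" is illustrative)
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            System.out.println("numDocs = " + reader.numDocs());        // live documents only
            System.out.println("maxDoc  = " + reader.maxDoc());         // includes deleted documents
            System.out.println("deleted = " + reader.numDeletedDocs()); // maxDoc() - numDocs()
        }
    }
}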

Usage

From source file:org.tallison.lucene.corpus.stats.IDFIndexCalc.java

License:Apache License

public IDFIndexCalc(IndexReader reader) {
    super(reader.numDocs());
    this.reader = reader;
}

From source file:org.tallison.solr.search.concordance.KeywordCooccurRankHandler.java

License:Apache License

public static NamedList doLocalSearch(Query filter, SolrQueryRequest req) throws Exception {
    SolrParams params = req.getParams();
    String field = getField(params);

    String fl = params.get(CommonParams.FL);
    DocMetadataExtractor metadataExtractor = (fl != null && fl.length() > 0)
            ? new SimpleDocMetadataExtractor(fl.split(","))
            : new SimpleDocMetadataExtractor();

    CooccurConfig config = configureParams(field, params);

    IndexSchema schema = req.getSchema();
    SchemaField sf = schema.getField(field);
    Analyzer analyzer = sf.getType().getIndexAnalyzer();
    Filter queryFilter = getFilterQuery(req);
    String q = params.get(CommonParams.Q);
    Query query = QParser.getParser(q, null, req).parse();
    String solrUniqueKeyField = req.getSchema().getUniqueKeyField().getName();

    SolrIndexSearcher solr = req.getSearcher();
    IndexReader reader = solr.getIndexReader();
    boolean allowDuplicates = false;
    boolean allowFieldSeparators = false;

    Grammer grammer = new WGrammer(config.getMinNGram(), config.getMaxNGram(), allowFieldSeparators);
    IDFCalc idfCalc = new IDFCalc(reader);

    CooccurVisitor visitor = new CooccurVisitor(field, config.getTokensBefore(), config.getTokensAfter(),
            grammer, idfCalc, config.getMaxWindows(), allowDuplicates);

    visitor.setMinTermFreq(config.getMinTermFreq());

    try {
        ConcordanceArrayWindowSearcher searcher = new ConcordanceArrayWindowSearcher();
        System.out.println("UNIQUE KEY FIELD: " + solrUniqueKeyField);
        DocIdBuilder docIdBuilder = new FieldBasedDocIdBuilder(solrUniqueKeyField);
        System.out.println("QUERY: " + query.toString());
        searcher.search(reader, field, query, queryFilter, analyzer, visitor, docIdBuilder);
    } catch (IllegalArgumentException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (TargetTokenNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    List<TermIDF> overallResults = visitor.getResults();
    NamedList results = toNamedList(overallResults);
    //needed for cloud computations, merging cores

    results.add("collectionSize", reader.numDocs());
    results.add("numDocsVisited", visitor.getNumDocsVisited());
    results.add("numWindowsVisited", visitor.getNumWindowsVisited());
    results.add("numResults", overallResults.size());
    results.add("minTF", visitor.getMinTermFreq());

    return results;
}

From source file:org.toubassi.femtozip.lucene.IndexAnalyzer.java

License:Apache License

private IndexReader openIndex(String path) throws IOException {
    IndexReader reader = IndexReader.open(path);

    totalIndexSize = FileUtil.computeSize(new File(path));
    totalNumDocs = reader.numDocs();

    return reader;
}

From source file:org.toubassi.femtozip.lucene.IndexDocumentList.java

License:Apache License

public IndexDocumentList(IndexReader reader, int numSamples, int firstDoc, String fieldName)
        throws IOException {
    this.reader = reader;
    this.fieldName = fieldName;
    numDocs = reader.numDocs();
    float samplingRate = ((float) numSamples) / numDocs;

    ArrayList<Integer> docIdsList = new ArrayList<Integer>();
    ArrayList<Integer> fieldCountList = new ArrayList<Integer>();

    int numDocsScanned = 0, numDocsSampled = 0;
    for (int i = firstDoc, count = reader.maxDoc(); i < count; i++) {
        numDocsScanned++;

        if (reader.isDeleted(i)) {
            continue;
        }

        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }

        numDocsSampled++;

        Document doc = reader.document(i);
        Field fields[] = doc.getFields(fieldName);
        if (fields.length > 0) {
            if (fields[0].isStored()) {
                docIdsList.add(i);
                fieldCountList.add(fields.length);
            }
        }
    }

    docIds = new int[docIdsList.size()];
    for (int i = 0, count = docIdsList.size(); i < count; i++) {
        docIds[i] = docIdsList.get(i);
    }

    fieldCounts = new int[fieldCountList.size()];
    for (int i = 0, count = fieldCountList.size(); i < count; i++) {
        fieldCounts[i] = fieldCountList.get(i);
        if (i > 0) {
            fieldCounts[i] += fieldCounts[i - 1];
        }
    }
}

From source file:org.toubassi.femtozip.lucene.IndexDumper.java

License:Apache License

protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);

    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);

    numDocs = reader.numDocs();
    int maxDocId = reader.maxDoc();
    float samplingRate = ((float) numSamples) / numDocs;

    int numDocsScanned = 0;
    int numDocsSampled = 0;
    for (int docId = 0; docId < maxDocId; docId++) {

        if (reader.isDeleted(docId)) {
            continue;
        }

        numDocsScanned++;

        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }

        numDocsSampled++;

        Document doc = reader.document(docId);

        System.out.println("DOCUMENT: " + docId);

        for (String fieldName : fieldNames) {
            if (fieldsToDump != null && fieldsToDump.indexOf(fieldName) == -1) {
                continue;
            }

            Field[] fields = doc.getFields(fieldName);

            for (Field field : fields) {

                if (!field.isStored() || field.isCompressed()) {
                    // TODO if its compressed, uncompress it and benchmark it.
                    continue;
                }

                byte[] bytes;

                if (field.isBinary()) {
                    bytes = new byte[field.getBinaryLength()];
                    System.arraycopy(field.getBinaryValue(), field.getBinaryOffset(), bytes, 0,
                            field.getBinaryLength());
                } else {
                    String value = field.stringValue();
                    bytes = value.getBytes("UTF-8");
                }

                if (bytes.length > 0) {
                    System.out.print("    " + fieldName + " " + bytes.length + " ");
                    System.out.write(bytes);
                    System.out.println();
                }
            }
        }
    }

    reader.close();
}

From source file:org.zilverline.core.AbstractCollection.java

License:Open Source License

/**
 * Get the number of documents in this collection. The number is not
 * calculated on the fly, but stored after the indexing process, so this is
 * a cheap operation.
 * 
 * @return number of documents in collection
 */
public final int getNumberOfDocs() {
    if (isIndexingInProgress()) {
        IndexReader index = null;
        try {
            File thisIndex = getIndexDirWithManagerDefaults();
            index = IndexReader.open(thisIndex);

            if (index != null) {
                return index.numDocs();
            }
        } catch (IOException e) {
            log.warn("Error getting index for collection '" + name + "'", e);
        } finally {
            if (index != null) {
                try {
                    index.close();
                } catch (IOException e1) {
                    log.error("Error closing index for collection " + name, e1);
                }
            }
        }
    }
    return numberOfDocs;
}

From source file:org.zilverline.core.AbstractCollection.java

License:Open Source License

/**
 * Initialize this collection by getting its index. It retrieves the number
 * of documents and the MD5 hash of all documents in the collection.
 *
 * If the index does not exist (this is a new Collection) just return.
 * 
 * @throws IndexException
 *             when the existing index of the Collection cannot be successfully
 *             opened.
 */
public final void init() throws IndexException {
    log.debug("Initializing collection " + name);
    IndexReader index = null;
    // Determine whether the collection exists on disk
    setExistsOnDisk();
    // check whether this collection has a cache for the MD5 hashes of
    // documents
    if (md5DocumentCache == null) {
        md5DocumentCache = new HashSet();
    }
    // check whether this collection has a cache for the MD5 hashes of
    // indexed archives
    if (archiveCache == null) {
        archiveCache = new HashSet();
    }
    if (!isIndexValid()) {
        log.info("Index does not exist (yet) for collection '" + name + "'. Possibly new collection.");
        numberOfDocs = 0;
        return;
    }

    // Get the index
    File thisIndex = getIndexDirWithManagerDefaults();
    try {
        index = IndexReader.open(thisIndex);

        if (index != null) {
            numberOfDocs = index.numDocs();
            // retrieve all hashes of Documents from the cache
            md5DocumentCache.clear();
            for (int i = 0; i < numberOfDocs; i++) {
                Document d = index.document(i);
                String hashValue = d.get("hash");
                md5DocumentCache.add(hashValue);
            }
            // get some relevant information from the index
            version = IndexReader.getCurrentVersion(thisIndex);
            // deprecated, but needed
            lastIndexed = new Date(IndexReader.lastModified(thisIndex));
            log.debug("Collection " + name + " has " + numberOfDocs + " documents, index created at: "
                    + lastIndexed);
        } else {
            log.error("Index could not be retrieved for collection " + name);
        }
    } catch (IOException e) {
        throw new IndexException("Error initializing collection '" + name + "'", e);
    } finally {
        if (index != null) {
            try {
                index.close();
            } catch (IOException e1) {
                log.error("Error closing index for collection " + name, e1);
            }
        } else {
            numberOfDocs = 0;
            version = 0;
            lastIndexed = null;
        }
    }
}

From source file:proj.zoie.impl.indexing.internal.DiskSearchIndex.java

License:Apache License

/**
 * Gets the number of docs in the current loaded index
 * @return number of docs
 */
public int getNumdocs() {
    IndexReader reader = _dispenser.getIndexReader();
    if (reader != null) {
        return reader.numDocs();
    } else {
        return 0;
    }
}

From source file:project.lucene.RelativeTermWeightQuery.java

License:Apache License

@Override
public Query rewrite(IndexReader reader) throws IOException {
    if (this.terms.isEmpty()) {
        return new BooleanQuery();
    } else if (this.terms.size() == 1) {
        final Query tq = newTermQuery(this.terms.get(0), null);
        tq.setBoost(getBoost());
        return tq;
    }
    final List<AtomicReaderContext> leaves = reader.leaves();
    final int totalDocs = reader.numDocs();
    final TermContext[] contextArray = new TermContext[terms.size()];
    final Term[] queryTerms = this.terms.toArray(new Term[0]);

    collectTermContext(reader, leaves, contextArray, queryTerms);
    return buildQuery(totalDocs, contextArray, queryTerms);
}

From source file:retriever.TermStats.java

TermStats(String term, int tf, IndexReader reader) throws Exception {
    this.term = term;
    this.tf = tf;
    idf = (float) (Math.log(reader.numDocs()
            / (float) (reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term)))));
}