Usage examples for org.apache.lucene.index.IndexReader#numDocs()
public abstract int numDocs();
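For orientation, a minimal sketch (not taken from the sources below) contrasting numDocs() with maxDoc(): numDocs() counts live documents only, while maxDoc() also counts deleted-but-not-yet-merged document slots. The index path is a placeholder, and the DirectoryReader/FSDirectory entry points assume a Lucene 5.x-era API.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a hypothetical placeholder path.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            System.out.println("live docs:        " + reader.numDocs());
            System.out.println("max doc id bound: " + reader.maxDoc());
            System.out.println("deleted docs:     " + reader.numDeletedDocs());
        }
    }
}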
From source file:org.tallison.lucene.corpus.stats.IDFIndexCalc.java
License:Apache License
public IDFIndexCalc(IndexReader reader) {
    super(reader.numDocs());
    this.reader = reader;
}
From source file:org.tallison.solr.search.concordance.KeywordCooccurRankHandler.java
License:Apache License
public static NamedList doLocalSearch(Query filter, SolrQueryRequest req) throws Exception {
    SolrParams params = req.getParams();
    String field = getField(params);
    String fl = params.get(CommonParams.FL);
    DocMetadataExtractor metadataExtractor = (fl != null && fl.length() > 0)
            ? new SimpleDocMetadataExtractor(fl.split(","))
            : new SimpleDocMetadataExtractor();
    CooccurConfig config = configureParams(field, params);
    IndexSchema schema = req.getSchema();
    SchemaField sf = schema.getField(field);
    Analyzer analyzer = sf.getType().getIndexAnalyzer();
    Filter queryFilter = getFilterQuery(req);
    String q = params.get(CommonParams.Q);
    Query query = QParser.getParser(q, null, req).parse();
    String solrUniqueKeyField = req.getSchema().getUniqueKeyField().getName();
    SolrIndexSearcher solr = req.getSearcher();
    IndexReader reader = solr.getIndexReader();
    boolean allowDuplicates = false;
    boolean allowFieldSeparators = false;
    Grammer grammer = new WGrammer(config.getMinNGram(), config.getMaxNGram(), allowFieldSeparators);
    IDFCalc idfCalc = new IDFCalc(reader);
    CooccurVisitor visitor = new CooccurVisitor(field, config.getTokensBefore(), config.getTokensAfter(),
            grammer, idfCalc, config.getMaxWindows(), allowDuplicates);
    visitor.setMinTermFreq(config.getMinTermFreq());
    try {
        ConcordanceArrayWindowSearcher searcher = new ConcordanceArrayWindowSearcher();
        System.out.println("UNIQUE KEY FIELD: " + solrUniqueKeyField);
        DocIdBuilder docIdBuilder = new FieldBasedDocIdBuilder(solrUniqueKeyField);
        System.out.println("QUERY: " + query.toString());
        searcher.search(reader, field, query, queryFilter, analyzer, visitor, docIdBuilder);
    } catch (IllegalArgumentException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (TargetTokenNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    List<TermIDF> overallResults = visitor.getResults();
    NamedList results = toNamedList(overallResults);
    // needed for cloud computations, merging cores
    results.add("collectionSize", reader.numDocs());
    results.add("numDocsVisited", visitor.getNumDocsVisited());
    results.add("numWindowsVisited", visitor.getNumWindowsVisited());
    results.add("numResults", overallResults.size());
    results.add("minTF", visitor.getMinTermFreq());
    return results;
}
From source file:org.toubassi.femtozip.lucene.IndexAnalyzer.java
License:Apache License
private IndexReader openIndex(String path) throws IOException {
    IndexReader reader = IndexReader.open(path);
    totalIndexSize = FileUtil.computeSize(new File(path));
    totalNumDocs = reader.numDocs();
    return reader;
}
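IndexReader.open(String) here is the old pre-Lucene-3.x entry point, long since removed. A hedged modernization of the same method, assuming a Lucene 5.x-era API (FileUtil.computeSize and the totalIndexSize/totalNumDocs fields come from the surrounding class):

private IndexReader openIndex(String path) throws IOException {
    // DirectoryReader replaces the removed IndexReader.open(...) factory methods.
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(path)));
    totalIndexSize = FileUtil.computeSize(new File(path));
    totalNumDocs = reader.numDocs();
    return reader;
}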
From source file:org.toubassi.femtozip.lucene.IndexDocumentList.java
License:Apache License
public IndexDocumentList(IndexReader reader, int numSamples, int firstDoc, String fieldName) throws IOException {
    this.reader = reader;
    this.fieldName = fieldName;
    numDocs = reader.numDocs();
    float samplingRate = ((float) numSamples) / numDocs;
    ArrayList<Integer> docIdsList = new ArrayList<Integer>();
    ArrayList<Integer> fieldCountList = new ArrayList<Integer>();
    int numDocsScanned = 0, numDocsSampled = 0;
    for (int i = firstDoc, count = reader.maxDoc(); i < count; i++) {
        numDocsScanned++;
        if (reader.isDeleted(i)) {
            continue;
        }
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(i);
        Field fields[] = doc.getFields(fieldName);
        if (fields.length > 0) {
            if (fields[0].isStored()) {
                docIdsList.add(i);
                fieldCountList.add(fields.length);
            }
        }
    }
    docIds = new int[docIdsList.size()];
    for (int i = 0, count = docIdsList.size(); i < count; i++) {
        docIds[i] = docIdsList.get(i);
    }
    fieldCounts = new int[fieldCountList.size()];
    for (int i = 0, count = fieldCountList.size(); i < count; i++) {
        fieldCounts[i] = fieldCountList.get(i);
        if (i > 0) {
            fieldCounts[i] += fieldCounts[i - 1];
        }
    }
}
From source file:org.toubassi.femtozip.lucene.IndexDumper.java
License:Apache License
protected void dump() throws IOException {
    IndexReader reader = IndexReader.open(indexPath);
    Collection<?> allFields = reader.getFieldNames(IndexReader.FieldOption.ALL);
    String[] fieldNames = new String[allFields.size()];
    allFields.toArray(fieldNames);
    numDocs = reader.numDocs();
    int maxDocId = reader.maxDoc();
    float samplingRate = ((float) numSamples) / numDocs;
    int numDocsScanned = 0;
    int numDocsSampled = 0;
    for (int docId = 0; docId < maxDocId; docId++) {
        if (reader.isDeleted(docId)) {
            continue;
        }
        numDocsScanned++;
        if (((int) (numDocsScanned * samplingRate)) <= numDocsSampled) {
            continue;
        }
        numDocsSampled++;
        Document doc = reader.document(docId);
        System.out.println("DOCUMENT: " + docId);
        for (String fieldName : fieldNames) {
            if (fieldsToDump != null && fieldsToDump.indexOf(fieldName) == -1) {
                continue;
            }
            Field[] fields = doc.getFields(fieldName);
            for (Field field : fields) {
                if (!field.isStored() || field.isCompressed()) {
                    // TODO if its compressed, uncompress it and benchmark it.
                    continue;
                }
                byte[] bytes;
                if (field.isBinary()) {
                    bytes = new byte[field.getBinaryLength()];
                    System.arraycopy(field.getBinaryValue(), field.getBinaryOffset(), bytes, 0,
                            field.getBinaryLength());
                } else {
                    String value = field.stringValue();
                    bytes = value.getBytes("UTF-8");
                }
                if (bytes.length > 0) {
                    System.out.print(" " + fieldName + " " + bytes.length + " ");
                    System.out.write(bytes);
                    System.out.println();
                }
            }
        }
    }
    reader.close();
}
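The two femtozip snippets above share the same deterministic sampling trick: a document is sampled whenever the running target (documents scanned times the sampling rate) pulls ahead of the count sampled so far. A standalone sketch of just that loop, with illustrative numbers and no Lucene types:

public class SamplingSketch {
    public static void main(String[] args) {
        int numDocs = 100, numSamples = 10;
        float samplingRate = ((float) numSamples) / numDocs;
        int scanned = 0, sampled = 0;
        for (int docId = 0; docId < numDocs; docId++) {
            scanned++;
            // Skip until the target count (scanned * rate) exceeds what we have sampled.
            if (((int) (scanned * samplingRate)) <= sampled) {
                continue;
            }
            sampled++;
            System.out.println("sampled doc " + docId); // docs 9, 19, ..., 99
        }
        System.out.println("total sampled: " + sampled); // 10
    }
}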
From source file:org.zilverline.core.AbstractCollection.java
License:Open Source License
/**
 * Get the number of documents in this collection. The number is not
 * calculated, but stored after the indexing process, so this is a cheap
 * operation.
 *
 * @return number of documents in the collection
 */
public final int getNumberOfDocs() {
    if (isIndexingInProgress()) {
        IndexReader index = null;
        try {
            File thisIndex = getIndexDirWithManagerDefaults();
            index = IndexReader.open(thisIndex);
            if (index != null) {
                return index.numDocs();
            }
        } catch (IOException e) {
            log.warn("Error getting index for collection '" + name + "'", e);
        } finally {
            if (index != null) {
                try {
                    index.close();
                } catch (IOException e1) {
                    log.error("Error closing index for collection " + name, e1);
                }
            }
        }
    }
    return numberOfDocs;
}
From source file:org.zilverline.core.AbstractCollection.java
License:Open Source License
/**
 * Initialize this collection by getting its index. It retrieves the number
 * of documents and the MD5 hash of all documents in the collection.
 *
 * If the index does not exist (this is a new Collection), just return.
 *
 * @throws IndexException
 *             when an existing index of the Collection can not be successfully
 *             opened.
 */
public final void init() throws IndexException {
    log.debug("Initializing collection " + name);
    IndexReader index = null;
    // Determine whether the collection exists on disk
    setExistsOnDisk();
    // check whether this collection has a cache for the MD5 hashes of documents
    if (md5DocumentCache == null) {
        md5DocumentCache = new HashSet();
    }
    // check whether this collection has a cache for the MD5 hashes of indexed archives
    if (archiveCache == null) {
        archiveCache = new HashSet();
    }
    if (!isIndexValid()) {
        log.info("Index does not exist (yet) for collection '" + name + "'. Possibly new collection.");
        numberOfDocs = 0;
        return;
    }
    // Get the index
    File thisIndex = getIndexDirWithManagerDefaults();
    try {
        index = IndexReader.open(thisIndex);
        if (index != null) {
            numberOfDocs = index.numDocs();
            // retrieve all hashes of Documents from the cache
            md5DocumentCache.clear();
            for (int i = 0; i < numberOfDocs; i++) {
                Document d = index.document(i);
                String hashValue = d.get("hash");
                md5DocumentCache.add(hashValue);
            }
            // get some relevant information from the index
            version = IndexReader.getCurrentVersion(thisIndex); // deprecated, but needed
            lastIndexed = new Date(IndexReader.lastModified(thisIndex));
            log.debug("Collection " + name + " has " + numberOfDocs + " documents, index created at: "
                    + lastIndexed);
        } else {
            log.error("Index could not be retrieved for collection " + name);
        }
    } catch (IOException e) {
        throw new IndexException("Error initializing collection '" + name + "'", e);
    } finally {
        if (index != null) {
            try {
                index.close();
            } catch (IOException e1) {
                log.error("Error closing index for collection " + name, e1);
            }
        } else {
            numberOfDocs = 0;
            version = 0;
            lastIndexed = null;
        }
    }
}
From source file:proj.zoie.impl.indexing.internal.DiskSearchIndex.java
License:Apache License
/**
 * Gets the number of docs in the currently loaded index.
 *
 * @return number of docs
 */
public int getNumdocs() {
    IndexReader reader = _dispenser.getIndexReader();
    if (reader != null) {
        return reader.numDocs();
    } else {
        return 0;
    }
}
From source file:project.lucene.RelativeTermWeightQuery.java
License:Apache License
@Override
public Query rewrite(IndexReader reader) throws IOException {
    if (this.terms.isEmpty()) {
        return new BooleanQuery();
    } else if (this.terms.size() == 1) {
        final Query tq = newTermQuery(this.terms.get(0), null);
        tq.setBoost(getBoost());
        return tq;
    }
    final List<AtomicReaderContext> leaves = reader.leaves();
    final int totalDocs = reader.numDocs();
    final TermContext[] contextArray = new TermContext[terms.size()];
    final Term[] queryTerms = this.terms.toArray(new Term[0]);
    collectTermContext(reader, leaves, contextArray, queryTerms);
    return buildQuery(totalDocs, contextArray, queryTerms);
}
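The collectTermContext and buildQuery helpers are not shown in this snippet. As a hedged stand-in, one plausible way to fill the TermContext array in a Lucene 4.x-era API is TermContext.build, which gathers per-leaf term statistics itself (leaving the explicit leaves parameter unused in this simplified form; the original helper may well walk the leaves by hand instead):

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;

// Hypothetical stand-in for the helper used above, not the project's actual code.
static void collectTermContext(IndexReader reader, List<AtomicReaderContext> leaves,
        TermContext[] contextArray, Term[] queryTerms) throws IOException {
    for (int i = 0; i < queryTerms.length; i++) {
        contextArray[i] = TermContext.build(reader.getContext(), queryTerms[i]);
    }
}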
From source file:retriever.TermStats.java
TermStats(String term, int tf, IndexReader reader) throws Exception {
    this.term = term;
    this.tf = tf;
    idf = (float) (Math.log(reader.numDocs()
            / (float) (reader.docFreq(new Term(TextDocIndexer.FIELD_ANALYZED_CONTENT, term)))));
}
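Note that this IDF divides numDocs() by docFreq directly, so a term that never occurs in the field (docFreq == 0) yields an infinite float, not an exception. A hedged variant, not part of the original source, that applies standard add-one smoothing to keep the ratio finite:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

public class SmoothedIdf {
    // Add-one smoothing keeps the ratio finite for unseen terms (docFreq == 0).
    public static float idf(IndexReader reader, String field, String term) throws IOException {
        int df = reader.docFreq(new Term(field, term));
        return (float) Math.log((reader.numDocs() + 1.0) / (df + 1.0));
    }
}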