Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usages of org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
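
Note that numDocs() counts only live (non-deleted) documents, while maxDoc() returns one greater than the largest document number that has been assigned; the two differ as soon as an index contains deletions. Below is a minimal sketch of calling both, assuming a Lucene 3.x style reader and a placeholder index directory /tmp/index:

public static void printCounts() throws Exception {
    // Open the index read-only; the path is a placeholder.
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/index")), true);
    try {
        System.out.println("numDocs: " + reader.numDocs()); // live documents only
        System.out.println("maxDoc:  " + reader.maxDoc());  // upper bound on document numbers
    } finally {
        reader.close();
    }
}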

Usage

From source file:org.archive.nutchwax.tools.GetUniqFieldValues.java

License:LGPL

private static void dumpUniqValues(String fieldName, String indexDir) throws Exception {
    IndexReader reader = IndexReader.open(indexDir);

    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();
    Set<String> values = new HashSet<String>();

    // Collect distinct values; this assumes the index has no deletions,
    // so document numbers 0 through numDocs-1 are all valid.
    for (int i = 0; i < numDocs; i++) {
        values.add(reader.document(i).get(fieldName));
    }

    for (String v : values) {
        System.out.println(v);
    }

}

From source file:org.archive.tnh.FieldCacheLucene.java

License:Apache License

public static void main(String args[]) throws Exception {
    if (args.length == 0) {
        System.err.println("FieldCacheLucene: <index...>");
        System.exit(1);
    }

    java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length);
    for (String arg : args) {
        try {
            IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

            readers.add(reader);
        } catch (IOException ioe) {
            System.err.println("Error reading: " + arg);
        }
    }

    FieldCacheLucene cache = new FieldCacheLucene("site");

    for (IndexReader reader : readers) {
        int numDocs = reader.numDocs();

        System.out.println("Index: " + reader);
        System.out.println("  numDocs: " + reader.numDocs());
        System.out.println("  docBase: -1");

        for (int i = 0; i < numDocs; i++) {
            System.out.println("  doc[" + i + "]: " + cache.getValue(reader, -1, i));
        }

    }

}

From source file:org.archive.tnh.FieldCachePreAllocated.java

License:Apache License

public static void main(String args[]) throws Exception {
    if (args.length == 0) {
        System.err.println("FieldCachePreAllocated: <index...>");
        System.exit(1);
    }

    java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length);
    int totalNumDocuments = 0;
    for (String arg : args) {
        try {
            IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

            totalNumDocuments += reader.numDocs();

            readers.add(reader);
        } catch (IOException ioe) {
            System.err.println("Error reading: " + arg);
        }
    }

    FieldCachePreAllocated siteCache = new FieldCachePreAllocated("site", readers.size(), totalNumDocuments);

    int docBase = 0;
    for (IndexReader reader : readers) {
        siteCache.getFieldCache(reader, docBase);

        docBase += reader.numDocs();
    }

    for (Map.Entry<IndexReader, Integer> e : siteCache.readerDocBases.entrySet()) {
        IndexReader reader = e.getKey();
        docBase = e.getValue();
        int numDocs = reader.numDocs();

        System.out.println("Index: " + reader);
        System.out.println("  numDocs: " + numDocs);
        System.out.println("  docBase: " + docBase);

        String[] sitePerDoc = siteCache.getFieldCache(reader, docBase);
        for (int i = 0; i < numDocs; i++) {
            System.out.println("  doc[" + i + "]: " + sitePerDoc[i + docBase]);
        }
    }
}

From source file:org.archive.tnh.servlet.DiagnosticServlet.java

License:Apache License

public void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    Document doc = new Document();
    Element root = new Element("info");

    doc.addContent(root);

    Search search = (Search) this.getServletConfig().getServletContext().getAttribute("tnh.search");

    if (search == null) {
        OpenSearchHelper.writeResponse(doc, response);

        return;
    }

    Set<String> indexNames = new HashSet(Arrays.asList(ServletHelper.getParam(request, "i",
            search.searchers.keySet().toArray(QueryParameters.EMPTY_STRINGS))));
    Set<String> fieldNames = new HashSet(
            Arrays.asList(ServletHelper.getParam(request, "f", QueryParameters.EMPTY_STRINGS)));

    for (String indexName : indexNames) {
        Searcher searcher = search.searchers.get(indexName);

        if (searcher == null)
            continue;

        Element e = new Element("searcher");
        root.addContent(e);

        e.setAttribute("name", indexName);
        e.setAttribute("type", searcher.getClass().getCanonicalName());

        try {
            IndexReader ir = ((IndexSearcher) searcher).getIndexReader();

            Element ise = new Element("index");
            e.addContent(ise);

            ise.setAttribute("numDocs", Integer.toString(ir.numDocs()));

            TermDocs termDocs = ir.termDocs();
            for (String fieldName : ir.getFieldNames(IndexReader.FieldOption.ALL)
                    .toArray(QueryParameters.EMPTY_STRINGS)) {
                // If this field is not requested, skip it.
                if (!fieldNames.contains(fieldName)) {
                    continue;
                }

                Element field = new Element("field");
                field.setAttribute("name", fieldName);
                ise.addContent(field);

                // Iterate through the terms and for each term that
                // belongs to this field, count up the number of
                // documents containing that term and add it to the
                // XML Document.
                TermEnum termEnum = ir.terms(new Term(fieldName));
                do {
                    Term term = termEnum.term();

                    if (term == null || !fieldName.equals(term.field()))
                        continue;

                    termDocs.seek(termEnum);

                    int c = 0;
                    for (; termDocs.next(); c++)
                        ;

                    Element value = new Element("term");
                    value.setAttribute("name", term.text());
                    value.setAttribute("count", Integer.toString(c));

                    field.addContent(value);
                } while (termEnum.next());
            }
        } catch (ClassCastException cce) {
            // Searcher is not an IndexSearcher; skip the index-level details.
        }
    }

    OpenSearchHelper.writeResponse(doc, response);
}

From source file:org.archive.tnh.tools.IndexDumper.java

License:Apache License

private static void dumpIndex(IndexReader reader, List<String> fields, boolean includeDocIds) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    // If no fields were specified, then dump them all.
    if (fields.size() == 0) {
        fields.addAll(fieldNames);
    } else {
        for (String field : fields) {
            if (!fieldNames.contains(field)) {
                System.out.println("Field not in index: " + field);
                System.exit(2);
            }
        }
    }

    int numDocs = reader.numDocs();

    for (int i = 0; i < numDocs; i++) {
        if (includeDocIds) {
            System.out.print(i + "\t");
        }

        for (String field : fields) {
            System.out.print(Arrays.toString(reader.document(i).getValues(field)));
            System.out.print("\t");
        }

        System.out.println();
    }

}

From source file:org.capelin.transaction.utils.TXLuceneRecordImporter.java

License:GNU General Public License

protected int importRecords(IndexReader reader, Session session) throws IOException {
    CapelinRecord data = null;
    int totalDoc = reader.numDocs();
    // Read documents
    for (int i = 0; i < totalDoc; i++) {
        data = buildRecord(reader.document(i));
        if (null != data)
            session.save(data);
        if (i % BATCH_SIZE == 0) {
            session.flush(); // apply changes to indexes
            session.clear(); // free memory since the queue is processed
            log.info(i);
        }
    }
    return totalDoc;
}

From source file:org.deals.lucene.highlight.QueryTermExtractor.java

License:Apache License

/**
 * Extracts all term texts of a given Query into an array of WeightedTerms
 *
 * @param query      Query to extract term texts from
 * @param reader     used to compute IDF, which can be used to a) score selected fragments better
 *                   b) use graded highlights, e.g. changing the intensity of the font color
 * @param fieldName  the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            //IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            //ignore 
        }
    }
    return terms;
}
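
For orientation, here is a short sketch of how getIdfWeightedTerms might be invoked; the index path, query, and field name "contents" are placeholders, and it assumes the WeightedTerm class accompanying this QueryTermExtractor exposes getTerm() and getWeight() as in Lucene's standard highlighter:

public static void printWeightedTerms() throws Exception {
    // Placeholder index location and query.
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/index")), true);
    Query query = new TermQuery(new Term("contents", "lucene"));

    WeightedTerm[] weighted = QueryTermExtractor.getIdfWeightedTerms(query, reader, "contents");
    for (WeightedTerm wt : weighted) {
        System.out.println(wt.getTerm() + " -> " + wt.getWeight());
    }
    reader.close();
}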

From source file:org.deshang.content.indexing.scheduling.ContentIndexingTask.java

License:Apache License

private void calcPersonTermDocFreqInfo(TermDocFreqStatistics statistics, IndexReader reader)
        throws IOException {
    long docNum = reader.numDocs();
    LOGGER.debug("Total number of documents is " + docNum + ".");
    List<AtomicReaderContext> atomicCtxList = reader.leaves();
    for (AtomicReaderContext ctx : atomicCtxList) {
        FilterAtomicReader far = new FilterAtomicReader(ctx.reader());
        for (String field : far.fields()) {
            Terms terms = far.fields().terms(field);
            LOGGER.debug("Reader [" + far.toString() + "] totally has " + terms.size() + " term(s).");
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef term = null;
            while ((term = termsEnum.next()) != null) {
                String termUtf8String = term.utf8ToString();
                int existPersonDocFreq = statistics.getTermPersonDocFreq(termUtf8String);
                int personDocFreq = far.docFreq(new Term(field, term));
                double personDocFreqPercent = ((double) personDocFreq) / docNum;
                if (existPersonDocFreq < 0) {
                    personDocFreq += statistics.getTermPersonDocFreq(termUtf8String);
                    personDocFreqPercent += statistics.getTermPersonDocFreqPercent(termUtf8String);
                }
                statistics.putTermPersonDocFreqInfo(termUtf8String, personDocFreq, personDocFreqPercent);
            }
        }
        far.close();
    }
}

From source file:org.dspace.search.DSIndexer.java

License:BSD License

/**
 * Iterates over all documents in the Lucene index and verifies that they
 * are in the database; if not, they are removed.
 *
 * @param context
 * @throws IOException
 * @throws SQLException
 */
public static void cleanIndex(Context context) throws IOException, SQLException {

    IndexReader reader = DSQuery.getIndexReader();

    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < reader.numDocs(); i++) {
        // getLiveDocs() returns null when the index has no deletions.
        if (liveDocs != null && !liveDocs.get(i)) {
            // document is deleted...
            log.debug("Encountered deleted doc: " + i);
        } else {
            Document doc = reader.document(i);
            String handle = doc.get("handle");
            if (!StringUtils.isEmpty(handle)) {
                DSpaceObject o = HandleManager.resolveToObject(context, handle);

                if (o == null) {
                    log.info("Deleting: " + handle);
                    /* Use IndexWriter to delete; it's easier to manage write.lock */
                    DSIndexer.unIndexContent(context, handle);
                } else {
                    context.removeCached(o, o.getID());
                    log.debug("Keeping: " + handle);
                }
            }
        }
    }
}

From source file:org.dyndns.andreasbaumann.LuceneAnalyzer.java

License:Open Source License

private static void printGlobalInfo(IndexReader indexReader, boolean printHeaders, boolean isSolr,
        SolrIndexSearcher solrSearch) throws IOException {
    if (printHeaders) {
        System.out.println("Global Information:");
        System.out.println("===================");
    }

    System.out.println("\tnumber of documents: " + indexReader.numDocs());

    // We should get the number of features differently; this is inefficient, but Lucene
    // has no notion of global statistics (because the default weighting scheme doesn't
    // make use of them).
    int nofFeatures = 0;
    int nofTokens = 0;
    TermEnum terms = indexReader.terms();
    while (terms.next()) {
        Term term = terms.term();
        int df = terms.docFreq();
        nofFeatures++;
        nofTokens += df;
    }
    System.out.println("\ttotal number of features: " + nofFeatures);
    System.out.println("\ttotal number of tokens: " + nofTokens);

    System.out.println("\tversion: " + indexReader.getVersion());
    System.out.println("\tstill current: " + indexReader.isCurrent());

    //TODO: we don't get segment information!
    //System.out.println( "is optimized:" + segmentInfos.size( ) == 1 && !indexReader.hasDeletions( ) );
    System.out.println("\tmaximal document number: " + indexReader.maxDoc());
    System.out.println("\thas deletions: " + indexReader.hasDeletions());

    if (isSolr) {
        System.out.println("\tSolr version: " + solrSearch.getVersion());
    }

    System.out.println("");
}