List of usage examples for org.apache.lucene.index.IndexReader.numDocs()
public abstract int numDocs();
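Before the project examples below, a minimal self-contained sketch of numDocs() in the older IndexReader.open style that most of these examples use; the index path and class name are placeholders for illustration:

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point it at any existing Lucene index.
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            // numDocs() counts only live (non-deleted) documents;
            // maxDoc() is one greater than the largest document ID, so
            // maxDoc() - numDocs() is the number of deleted documents
            // still present in the index.
            System.out.println("numDocs: " + reader.numDocs());
            System.out.println("maxDoc:  " + reader.maxDoc());
            System.out.println("deleted: " + reader.numDeletedDocs());
        } finally {
            reader.close();
        }
    }
}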
From source file:org.archive.nutchwax.tools.GetUniqFieldValues.java
License:LGPL
private static void dumpUniqValues(String fieldName, String indexDir) throws Exception {
    IndexReader reader = IndexReader.open(indexDir);

    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();

    Set<String> values = new HashSet<String>();

    for (int i = 0; i < numDocs; i++) {
        values.add(reader.document(i).get(fieldName));
    }

    for (String v : values) {
        System.out.println(v);
    }
}
From source file:org.archive.tnh.FieldCacheLucene.java
License:Apache License
public static void main(String args[]) throws Exception { if (args.length == 0) { System.err.println("FieldCacheLucene: <index...>"); System.exit(1);/* w w w . j a v a 2 s.c om*/ } java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length); for (String arg : args) { try { IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true); readers.add(reader); } catch (IOException ioe) { System.err.println("Error reading: " + arg); } } FieldCacheLucene cache = new FieldCacheLucene("site"); for (IndexReader reader : readers) { int numDocs = reader.numDocs(); System.out.println("Index: " + reader); System.out.println(" numDocs: " + reader.numDocs()); System.out.println(" docBase: -1"); for (int i = 0; i < numDocs; i++) { System.out.println(" doc[" + i + "]: " + cache.getValue(reader, -1, i)); } } }
From source file:org.archive.tnh.FieldCachePreAllocated.java
License:Apache License
public static void main(String args[]) throws Exception { if (args.length == 0) { System.err.println("FieldCachePreAllocated: <index...>"); System.exit(1);//from w w w . j a v a 2 s . com } java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length); int totalNumDocuments = 0; for (String arg : args) { try { IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true); totalNumDocuments += reader.numDocs(); readers.add(reader); } catch (IOException ioe) { System.err.println("Error reading: " + arg); } } FieldCachePreAllocated siteCache = new FieldCachePreAllocated("site", readers.size(), totalNumDocuments); int docBase = 0; for (IndexReader reader : readers) { siteCache.getFieldCache(reader, docBase); docBase += reader.numDocs(); } for (Map.Entry<IndexReader, Integer> e : siteCache.readerDocBases.entrySet()) { IndexReader reader = e.getKey(); docBase = e.getValue(); int numDocs = reader.numDocs(); System.out.println("Index: " + reader); System.out.println(" numDocs: " + numDocs); System.out.println(" docBase: " + docBase); String[] sitePerDoc = siteCache.getFieldCache(reader, docBase); for (int i = 0; i < numDocs; i++) { System.out.println(" doc[" + i + "]: " + sitePerDoc[i + docBase]); } } }
From source file:org.archive.tnh.servlet.DiagnosticServlet.java
License:Apache License
public void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    Document doc = new Document();
    Element root = new Element("info");
    doc.addContent(root);

    Search search = (Search) this.getServletConfig().getServletContext().getAttribute("tnh.search");
    if (search == null) {
        OpenSearchHelper.writeResponse(doc, response);
        return;
    }

    Set<String> indexNames = new HashSet(Arrays.asList(ServletHelper.getParam(request, "i",
            search.searchers.keySet().toArray(QueryParameters.EMPTY_STRINGS))));
    Set<String> fieldNames = new HashSet(
            Arrays.asList(ServletHelper.getParam(request, "f", QueryParameters.EMPTY_STRINGS)));

    for (String indexName : indexNames) {
        Searcher searcher = search.searchers.get(indexName);
        if (searcher == null)
            continue;

        Element e = new Element("searcher");
        root.addContent(e);
        e.setAttribute("name", indexName);
        e.setAttribute("type", searcher.getClass().getCanonicalName());

        try {
            IndexReader ir = ((IndexSearcher) searcher).getIndexReader();

            Element ise = new Element("index");
            e.addContent(ise);
            ise.setAttribute("numDocs", Integer.toString(ir.numDocs()));

            TermDocs termDocs = ir.termDocs();

            for (String fieldName : ir.getFieldNames(IndexReader.FieldOption.ALL)
                    .toArray(QueryParameters.EMPTY_STRINGS)) {
                // If this field is not requested, skip it.
                if (!fieldNames.contains(fieldName)) {
                    continue;
                }

                Element field = new Element("field");
                field.setAttribute("name", fieldName);
                ise.addContent(field);

                // Iterate through the terms and for each term that
                // belongs to this field, count up the number of
                // documents containing that term and add it to the
                // XML Document.
                TermEnum termEnum = ir.terms(new Term(fieldName));
                do {
                    Term term = termEnum.term();

                    if (term == null || !fieldName.equals(term.field()))
                        continue;

                    termDocs.seek(termEnum);

                    int c = 0;
                    for (; termDocs.next(); c++)
                        ;

                    Element value = new Element("term");
                    value.setAttribute("name", term.text());
                    value.setAttribute("count", Integer.toString(c));
                    field.addContent(value);
                } while (termEnum.next());
            }
        } catch (ClassCastException cce) {
        }
    }

    OpenSearchHelper.writeResponse(doc, response);
}
From source file:org.archive.tnh.tools.IndexDumper.java
License:Apache License
private static void dumpIndex(IndexReader reader, List<String> fields, boolean includeDocIds) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    // If no fields were specified, then dump them all.
    if (fields.size() == 0) {
        fields.addAll(fieldNames);
    } else {
        for (String field : fields) {
            if (!fieldNames.contains(field)) {
                System.out.println("Field not in index: " + field);
                System.exit(2);
            }
        }
    }

    int numDocs = reader.numDocs();

    for (int i = 0; i < numDocs; i++) {
        if (includeDocIds) {
            System.out.print(i + "\t");
        }
        for (String field : fields) {
            System.out.print(Arrays.toString(reader.document(i).getValues(field)));
            System.out.print("\t");
        }
        System.out.println();
    }
}
From source file:org.capelin.transaction.utils.TXLuceneRecordImporter.java
License:GNU General Public License
protected int importRecords(IndexReader reader, Session session) throws IOException {
    CapelinRecord data = null;
    int totalDoc = reader.numDocs();

    // Read documents
    for (int i = 0; i < totalDoc; i++) {
        data = buildRecord(reader.document(i));
        if (null != data)
            session.save(data);
        if (i % BATCH_SIZE == 0) {
            session.flush(); // apply changes to indexes
            session.clear(); // free memory since the queue is processed
            log.info(i);
        }
    }
    return totalDoc;
}
From source file:org.deals.lucene.highlight.QueryTermExtractor.java
License:Apache License
/**
 * Extracts all term texts of a given Query into an array of WeightedTerms.
 *
 * @param query Query to extract term texts from
 * @param reader used to compute IDF, which can be used to a) score selected fragments better
 *        b) use graded highlights, e.g. changing the intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            // ignore
        }
    }
    return terms;
}
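For intuition about the IDF formula above, a quick numeric check with made-up counts: with totalNumDocs = 1000 and docFreq = 9, idf = ln(1000 / (9 + 1)) + 1 = ln(100) + 1 ≈ 5.61, whereas a term appearing in 999 documents gets ln(1000 / 1000) + 1 = 1.0, so rarer terms receive proportionally larger weights.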
From source file:org.deshang.content.indexing.scheduling.ContentIndexingTask.java
License:Apache License
private void calcPersonTermDocFreqInfo(TermDocFreqStatistics statistics, IndexReader reader) throws IOException {
    long docNum = reader.numDocs();
    LOGGER.debug("Total number of documents is " + docNum + ".");

    List<AtomicReaderContext> atomicCtxList = reader.leaves();
    for (AtomicReaderContext ctx : atomicCtxList) {
        FilterAtomicReader far = new FilterAtomicReader(ctx.reader());
        for (String field : far.fields()) {
            Terms terms = far.fields().terms(field);
            LOGGER.debug("Reader [" + far.toString() + "] totally has " + terms.size() + " term(s).");

            TermsEnum termsEnum = terms.iterator(null);
            BytesRef term = null;
            while ((term = termsEnum.next()) != null) {
                String termUtf8String = term.utf8ToString();
                int existPersonDocFreq = statistics.getTermPersonDocFreq(termUtf8String);
                int personDocFreq = far.docFreq(new Term(field, term));
                double personDocFreqPercent = ((double) personDocFreq) / docNum;
                if (existPersonDocFreq < 0) {
                    personDocFreq += statistics.getTermPersonDocFreq(termUtf8String);
                    personDocFreqPercent += statistics.getTermPersonDocFreqPercent(termUtf8String);
                }
                statistics.putTermPersonDocFreqInfo(termUtf8String, personDocFreq, personDocFreqPercent);
            }
        }
        far.close();
    }
}
From source file:org.dspace.search.DSIndexer.java
License:BSD License
/**
 * Iterates over all documents in the Lucene index and verifies they
 * are in the database; if not, they are removed.
 *
 * @param context
 * @throws IOException
 * @throws SQLException
 */
public static void cleanIndex(Context context) throws IOException, SQLException {
    IndexReader reader = DSQuery.getIndexReader();
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < reader.numDocs(); i++) {
        if (!liveDocs.get(i)) {
            // document is deleted...
            log.debug("Encountered deleted doc: " + i);
        } else {
            Document doc = reader.document(i);
            String handle = doc.get("handle");

            if (!StringUtils.isEmpty(handle)) {
                DSpaceObject o = HandleManager.resolveToObject(context, handle);

                if (o == null) {
                    log.info("Deleting: " + handle);
                    /* Use IndexWriter to delete; it's easier to manage write.lock */
                    DSIndexer.unIndexContent(context, handle);
                } else {
                    context.removeCached(o, o.getID());
                    log.debug("Keeping: " + handle);
                }
            }
        }
    }
}
From source file:org.dyndns.andreasbaumann.LuceneAnalyzer.java
License:Open Source License
private static void printGlobalInfo(IndexReader indexReader, boolean printHeaders, boolean isSolr,
        SolrIndexSearcher solrSearch) throws IOException {
    if (printHeaders) {
        System.out.println("Global Information:");
        System.out.println("===================");
    }

    System.out.println("\tnumber of documents: " + indexReader.numDocs());

    // we should get the number of features differently, this is inefficient, but Lucene
    // has no notion of global statistics (because the default weighting schema doesn't
    // make use of it!)
    int nofFeatures = 0;
    int nofTokens = 0;
    TermEnum terms = indexReader.terms();
    while (terms.next()) {
        Term term = terms.term();
        int df = terms.docFreq();
        nofFeatures++;
        nofTokens += df;
    }
    System.out.println("\ttotal number of features: " + nofFeatures);
    System.out.println("\ttotal number of tokens: " + nofTokens);

    System.out.println("\tversion: " + indexReader.getVersion());
    System.out.println("\tstill current: " + indexReader.isCurrent());
    // TODO: we don't get segment information!
    // System.out.println("is optimized:" + segmentInfos.size() == 1 && !indexReader.hasDeletions());
    System.out.println("\tmaximal document number: " + indexReader.maxDoc());
    System.out.println("\thas deletions: " + indexReader.hasDeletions());

    if (isSolr) {
        System.out.println("\tSolr version: " + solrSearch.getVersion());
    }
    System.out.println("");
}