List of usage examples for org.apache.lucene.index IndexReader numDocs
public abstract int numDocs();
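Before the project examples below, here is a minimal sketch of calling numDocs() on a freshly opened reader, contrasted with maxDoc() and numDeletedDocs(). It follows the Lucene 4.x style used by most of the snippets on this page; the index path and class name are placeholders:

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Open a read-only reader over an existing index (the path is a placeholder).
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")))) {
            System.out.println("live docs:    " + reader.numDocs());        // excludes deleted documents
            System.out.println("max doc:      " + reader.maxDoc());         // includes not-yet-merged deletes
            System.out.println("deleted docs: " + reader.numDeletedDocs()); // maxDoc() - numDocs()
        }
    }
}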
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.LuceneDocIterator.java
License:Open Source License
public LuceneDocIterator(IndexReader reader, Set<String> fieldsToLoad) {
    this.reader = reader;
    this.fieldsToLoad = fieldsToLoad;
    pointer = 0;
    max = reader.numDocs();
}
From source file:edu.mit.ll.vizlinc.highlight.QueryTermExtractor.java
License:Apache License
/**
 * Extracts all terms texts of a given Query into an array of WeightedTerms
 *
 * @param query     Query to extract term texts from
 * @param reader    used to compute IDF which can be used to a) score selected fragments better
 *                  b) use graded highlights eg changing intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            // ignore
        }
    }
    return terms;
}
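A short usage sketch for the helper above, assuming a query parsed elsewhere and an open reader; the field name "body" is only illustrative:

// Sketch: reader is an open IndexReader, query was built by a QueryParser; "body" is a placeholder field.
WeightedTerm[] weighted = QueryTermExtractor.getIdfWeightedTerms(query, reader, "body");
for (WeightedTerm wt : weighted) {
    // Each term carries a quasi-IDF weight, e.g. usable to grade highlight intensity.
    System.out.println(wt.getTerm() + " -> " + wt.getWeight());
}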
From source file:edu.mit.ll.vizlinc.highlight.WeightedSpanTermExtractor.java
License:Apache License
/**
 * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>.
 * Uses a supplied <code>IndexReader</code> to properly weight terms (for gradient highlighting).
 *
 * @param query       that caused hit
 * @param tokenStream of text to be highlighted
 * @param fieldName   restricts Term's used based on field name
 * @param reader      to use for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException
 */
public Map<String, WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
        String fieldName, IndexReader reader) throws IOException {
    if (fieldName != null) {
        this.fieldName = StringHelper.intern(fieldName);
    } else {
        this.fieldName = null;
    }
    this.tokenStream = tokenStream;

    Map<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
    extract(query, terms);

    int totalNumDocs = reader.numDocs();
    Set<String> weightedTerms = terms.keySet();
    Iterator<String> it = weightedTerms.iterator();

    try {
        while (it.hasNext()) {
            WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
            int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            weightedSpanTerm.weight *= idf;
        }
    } finally {
        closeReaders();
    }
    return terms;
}
From source file:edu.rpi.tw.linkipedia.search.main.helper.ReadIndex.java
License:Open Source License
public static void main(String[] args) {
    try {
        if (args.length < 1) {
            System.out.println("index directory");
            return;
        }
        INDEX_DIR = args[0];
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(INDEX_DIR)));
        IndexSearcher searcher = new IndexSearcher(reader);
        System.out.println(reader.numDocs());
        while (true) {
            BufferedReader in = null;
            String text = "";
            try {
                in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
                text = in.readLine();
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            String[] mytext = text.split("\\|");
            if (mytext.length > 1) {
                text = mytext[0];
            }
            if (text.contains(":")) {
                String[] fiedValue = text.split(":", 2);
                readIndexByTerm(searcher, fiedValue[0], fiedValue[1], mytext[1]); // readIndexByTerm(reader, text);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
/**
 * Sets up the indexer just for reading... if needed for writing only, call
 * setupForWrite. If both read & write are needed, call both.
 */
synchronized void setupForRead() {
    log.info("setting up index for read only access");
    long startTime = System.currentTimeMillis();

    //closeHandles();
    try {
        setupDirectory();

        String[] defaultSearchFields, defaultSearchFieldsOriginal;
        String[] defaultSearchFieldSubject = new String[] { "title" }; // for subject only search
        String[] defaultSearchFieldCorrespondents;
        // body field should be there, as the content of the attachment lies in this field; should also include meta field?
        // why the search over en-names and en-names-original when body/body_original is included in the search fields?
        defaultSearchFields = new String[] { "body", "title", "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        defaultSearchFieldsOriginal = new String[] { "body_original", "title" }; // we want to leave title there because we want to always hit the title -- discussed with Peter June 27 2015
        defaultSearchFieldCorrespondents = new String[] { "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // names field added above after email discussion with Sit 6/11/2013. problem is that we're not using the
        // Lucene EnglishPossessiveFilter, so NER will extract the name Stanford University in a sentence like:
        // "This is Stanford University's website."
        // but when the user clicks on the name "Stanford University" in say monthly cards, we
        // will not match the message with this sentence because of the apostrophe.

        // for searching an attachment with fileName
        String[] metaSearchFields = new String[] { "fileName" };

        // Parse a simple query that searches for "text":
        if (parser == null) {
            //parser = new QueryParser(MUSE_LUCENE_VERSION, defaultSearchField, analyzer);
            parser = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFields, analyzer);
            parserOriginal = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldsOriginal, analyzer);
            parserSubject = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldSubject, analyzer);
            parserCorrespondents = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldCorrespondents, analyzer);
            parserMeta = new MultiFieldQueryParser(LUCENE_VERSION, metaSearchFields, new KeywordAnalyzer());
        }

        /**
         * Bunch of gotchas here.
         * It's a bad idea to store Lucene internal docIds, as no assumptions about the internal docIds should be made,
         * not even that they are serial. When searching, Lucene may ignore logically deleted docs.
         * Lucene does not handle deleted docs, and having these docs in search may bring down the search performance by 50%.
         * Deleted docs are cleaned only during merging of indices.
         */
        int numContentDocs = 0, numContentDeletedDocs = 0, numAttachmentDocs = 0, numAttachmentDeletedDocs = 0;
        if (DirectoryReader.indexExists(directory)) {
            DirectoryReader ireader = DirectoryReader.open(directory);
            if (ireader.numDeletedDocs() > 0)
                log.warn("!!!!!!!\nIndex reader has " + ireader.numDocs() + " doc(s) of which "
                        + ireader.numDeletedDocs() + " are deleted)\n!!!!!!!!!!");
            isearcher = new IndexSearcher(ireader);
            contentDocIds = new LinkedHashMap<>();
            numContentDocs = ireader.numDocs();
            numContentDeletedDocs = ireader.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                contentDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + contentDocIds.size() + " content docs");
        }

        if (DirectoryReader.indexExists(directory_blob)) {
            IndexReader ireader_blob = DirectoryReader.open(directory_blob);
            isearcher_blob = new IndexSearcher(ireader_blob); // read-only=true
            blobDocIds = new LinkedHashMap<Integer, String>();
            numAttachmentDocs = ireader_blob.numDocs();
            numAttachmentDeletedDocs = ireader_blob.numDeletedDocs();
            Bits liveDocs = MultiFields.getLiveDocs(ireader_blob);
            Set<String> fieldsToLoad = new HashSet<String>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader_blob.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader_blob.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                blobDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + blobDocIds.size() + " attachment docs");
        }

        log.warn("Number of content docs: " + numContentDocs + ", number deleted: " + numContentDeletedDocs);
        log.warn("Number of attachment docs: " + numAttachmentDocs + ", number deleted: " + numAttachmentDeletedDocs);

        if (dirNameToDocIdMap == null)
            dirNameToDocIdMap = new LinkedHashMap<String, Map<Integer, String>>();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }
    log.info("Setting up index for read took " + (System.currentTimeMillis() - startTime) + " ms");
}
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
private synchronized Directory copyDirectoryExcludeFields(Directory dir, String out_basedir, String out_name,
        String... fields_to_be_removed) throws IOException {
    IndexReader reader = DirectoryReader.open(dir); // IndexReader.open(dir, true); // read-only=true
    Directory newDir = createDirectory(out_basedir, out_name);
    IndexWriter writer = openIndexWriter(newDir);
    //log.info("Removing field(s) " + Util.join(fields_to_be_removed, ", ") + " from index.");
    for (int i = 0; i < reader.numDocs(); i++) {
        org.apache.lucene.document.Document doc = reader.document(i);
        for (String field : fields_to_be_removed)
            doc.removeFields(field);
        writer.addDocument(doc);
    }
    writer.close();
    reader.close();
    return newDir;
}
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
private synchronized Directory copyDirectoryWithDocFilter(Directory dir, String out_basedir, String out_name,
        FilterFunctor filter_func) throws IOException {
    long startTime = System.currentTimeMillis();
    IndexReader reader = DirectoryReader.open(dir); // IndexReader.open(dir, true); // read-only=true
    Directory newDir = createDirectory(out_basedir, out_name);
    IndexWriter writer = openIndexWriter(newDir);
    //log.info("Removing field(s) " + Util.join(fields_to_be_removed, ", ") + " from index.");
    int count = 0;
    for (int i = 0; i < reader.numDocs(); i++) {
        org.apache.lucene.document.Document doc = reader.document(i);
        if (filter_func == null || filter_func.filter(doc)) {
            writer.addDocument(doc);
            count++;
        }
    }
    writer.close();
    reader.close();
    log.info("CopyDirectoryWithtDocFilter to dir:" + out_basedir + " name: " + baseDir + " time: "
            + (System.currentTimeMillis() - startTime) + " ms docs: " + count);
    return newDir;
}
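Note that both copy helpers above use reader.numDocs() as the loop bound and treat the loop index as a document id, which is only safe while the source index contains no deletions (numDocs() then equals maxDoc()). If that assumption does not hold, a deletion-aware loop mirroring the maxDoc()/liveDocs pattern from the setupForRead example earlier would look roughly like this sketch:

// Sketch: copy only live documents, even if the source index has unmerged deletes.
// Uses the Lucene 4.x MultiFields.getLiveDocs API, as in the setupForRead example above.
Bits liveDocs = MultiFields.getLiveDocs(reader);
for (int i = 0; i < reader.maxDoc(); i++) {
    if (liveDocs != null && !liveDocs.get(i))
        continue; // skip documents that were deleted but not yet merged away
    writer.addDocument(reader.document(i));
}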
From source file:edu.umd.umiacs.clip.tools.scor.BM25Scorer.java
License:Apache License
public BM25Scorer(IndexReader ir, String field) {
    super(ir, field);
    k1 = 1.2f;
    b = 0.75f;
    try {
        avgdl = ir.getSumTotalTermFreq(field) / (float) ir.numDocs();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    cache = new float[(int) (avgdl * 10)];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * (1 - b + b * (i / avgdl));
    }
}
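The cache precomputes the BM25 length normalization k1 * (1 - b + b * dl / avgdl) for integer document lengths dl, with avgdl derived from getSumTotalTermFreq(field) / numDocs(). The class's own score method is not shown on this page; the sketch below is a generic BM25 per-term contribution that such a cached normalization typically plugs into, not this project's actual code:

// Generic BM25 per-term contribution (sketch, not BM25Scorer's actual score method).
// tf: term frequency in the doc, df: document frequency, numDocs: ir.numDocs(),
// dl: document length, avgdl/k1/b: as computed in the constructor above.
static float bm25Term(float tf, int df, int numDocs, float dl, float avgdl, float k1, float b) {
    float idf = (float) Math.log(1 + (numDocs - df + 0.5) / (df + 0.5));
    float norm = k1 * (1 - b + b * (dl / avgdl)); // what cache[dl] precomputes
    return idf * tf * (k1 + 1) / (tf + norm);
}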
From source file:edu.umd.umiacs.clip.tools.scor.TFIDF.java
License:Apache License
public TFIDF(IndexReader ir, String field) {
    this.ir = ir;
    this.field = field;
    N = ir.numDocs();
}
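Here N = ir.numDocs() is the corpus size used for IDF. A sketch of a classic IDF computation built on it, matching the DefaultSimilarity-style formula from the highlighter examples above (the method is illustrative and assumes org.apache.lucene.index.Term and java.io.IOException are imported):

// Sketch: classic "log(N / (df + 1)) + 1" idf, clamped the same way as the highlighter examples.
float idf(IndexReader ir, String field, String term, int N) throws IOException {
    int df = ir.docFreq(new Term(field, term)); // docFreq may still count deleted docs
    if (N < df)
        df = N;
    return (float) (Math.log((double) N / (df + 1)) + 1.0);
}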
From source file:edu.unika.aifb.graphindex.index.KeywordIndexBuilder.java
License:Open Source License
public void indexKeywords() throws StorageException, IOException {
    File indexDir = idxDirectory.getDirectory(IndexDirectory.KEYWORD_DIR, !resume);
    File valueDir = idxDirectory.getDirectory(IndexDirectory.VALUE_DIR, !resume);

    this.objectProperties = Util.readEdgeSet(idxDirectory.getFile(IndexDirectory.OBJECT_PROPERTIES_FILE));
    this.relations = Util.readEdgeSet(idxDirectory.getTempFile("relations", false));
    this.attributes = Util.readEdgeSet(idxDirectory.getTempFile("attributes", false));

    properties = new HashSet<String>();
    properties.addAll(relations);
    properties.addAll(attributes);

    log.debug("attributes: " + attributes.size() + ", relations: " + relations.size());

    try {
        // HyphenationCompoundWordAnalyzer analyzer = new HyphenationCompoundWordAnalyzer("./res/en_hyph_US.xml", "./res/en_US.dic");
        // DictionaryCompoundWordAnalyzer analyzer = new DictionaryCompoundWordAnalyzer("./res/en_US.dic");
        CapitalizationSplitterAnalyzer analyzer = new CapitalizationSplitterAnalyzer();
        StandardAnalyzer valueAnalyzer = new StandardAnalyzer();

        IndexWriter indexWriter = new IndexWriter(indexDir, analyzer, !resume, new MaxFieldLength(MAXFIELDLENGTH));
        log.debug("max terms per field: " + indexWriter.getMaxFieldLength());

        valueWriter = new IndexWriter(valueDir, valueAnalyzer, !resume, new MaxFieldLength(MAXFIELDLENGTH));

        org.apache.lucene.index.IndexReader reader = null;
        if (resume) {
            reader = org.apache.lucene.index.IndexReader.open(FSDirectory.getDirectory(indexDir), true);
            log.debug("docs: " + reader.numDocs());
        }

        if (!resume) {
            log.info("Indexing concepts");
            indexSchema(indexWriter, idxDirectory.getTempFile("concepts", false), TypeUtil.CONCEPT, CONCEPT_BOOST);
            log.info("Indexing attributes");
            indexSchema(indexWriter, idxDirectory.getTempFile("attributes", false), TypeUtil.ATTRIBUTE, ATTRIBUTE_BOOST);
            log.info("Indexing relations");
            indexSchema(indexWriter, idxDirectory.getTempFile("relations", false), TypeUtil.RELATION, RELATION_BOOST);
        }

        log.info("Indexing entities");
        indexEntity(indexWriter, idxDirectory.getTempFile("entities", false), reader);

        indexWriter.commit();
        valueWriter.commit();

        log.debug("optimizing...");
        indexWriter.optimize();
        valueWriter.optimize();

        indexWriter.close();
        valueWriter.close();

        if (blockSearcher != null)
            blockSearcher.close();

        ns.optimize();
        ns.close();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (DatabaseException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}