Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexReader.numDocs().

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
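
Note that numDocs() counts only live documents: maxDoc() also counts slots still held by deleted documents, and numDeletedDocs() reports the difference. The following minimal sketch (written against the Lucene 5.x+ API; the index path is a placeholder) prints all three counts:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // placeholder path to an existing Lucene index directory
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("index")))) {
            int live = reader.numDocs();            // live (non-deleted) documents
            int total = reader.maxDoc();            // highest document id + 1, includes deleted slots
            int deleted = reader.numDeletedDocs();  // total - live
            System.out.println(live + " live docs, " + deleted + " deleted, maxDoc = " + total);
        }
    }
}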

Usage

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.LuceneDocIterator.java

License:Open Source License

public LuceneDocIterator(IndexReader reader, Set<String> fieldsToLoad) {
    this.reader = reader;
    this.fieldsToLoad = fieldsToLoad;
    pointer = 0;
    max = reader.numDocs();
}
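
Here numDocs() is used as an upper bound for iterating document ids, which is only safe when the index contains no deletions: document ids run up to maxDoc(), and numDocs() excludes deleted documents.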

From source file:edu.mit.ll.vizlinc.highlight.QueryTermExtractor.java

License:Apache License

/**
 * Extracts all term texts of a given Query into an array of WeightedTerms
 *
 * @param query      Query to extract term texts from
 * @param reader used to compute IDF, which can be used to a) score selected fragments better and
 * b) apply graded highlights, e.g. by varying the intensity of the font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            //IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            //ignore 
        }
    }
    return terms;
}
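
The weight update matches the classic DefaultSimilarity IDF formula, idf = log(numDocs / (docFreq + 1)) + 1, with numDocs() supplying the collection size. For example, with 1,000 documents and a term that appears in 9 of them, idf = ln(1000 / 10) + 1 ≈ 5.61; rarer terms therefore receive higher highlight weights.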

From source file:edu.mit.ll.vizlinc.highlight.WeightedSpanTermExtractor.java

License:Apache License

/**
 * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
 * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
 *
 * <p>
 * 
 * @param query
 *          that caused hit
 * @param tokenStream
 *          of text to be highlighted
 * @param fieldName
 *          restricts Term's used based on field name
 * @param reader
 *          to use for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException
 */
public Map<String, WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream,
        String fieldName, IndexReader reader) throws IOException {
    if (fieldName != null) {
        this.fieldName = StringHelper.intern(fieldName);
    } else {
        this.fieldName = null;
    }
    this.tokenStream = tokenStream;

    Map<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>();
    extract(query, terms);

    int totalNumDocs = reader.numDocs();
    Set<String> weightedTerms = terms.keySet();
    Iterator<String> it = weightedTerms.iterator();

    try {
        while (it.hasNext()) {
            WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
            int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
            // docFreq counts deletes
            if (totalNumDocs < docFreq) {
                docFreq = totalNumDocs;
            }
            // IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            weightedSpanTerm.weight *= idf;
        }
    } finally {

        closeReaders();
    }

    return terms;
}

From source file:edu.rpi.tw.linkipedia.search.main.helper.ReadIndex.java

License:Open Source License

public static void main(String[] args) {
    try {
        if (args.length < 1) {
            System.out.println("index directory");
            return;
        }

        INDEX_DIR = args[0];

        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(INDEX_DIR)));
        IndexSearcher searcher = new IndexSearcher(reader);
        System.out.println(reader.numDocs());
        while (true) {
            BufferedReader in = null;
            String text = "";
            try {
                in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
                text = in.readLine();
            } catch (IOException e1) {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            String[] mytext = text.split("\\|");
            if (mytext.length > 1) {
                text = mytext[0];
            }
            if (text.contains(":")) {
                String[] fieldValue = text.split(":", 2);
                readIndexByTerm(searcher, fieldValue[0], fieldValue[1], mytext[1]);//readIndexByTerm(reader,text);
            }
        }

    } catch (Exception e) {

        e.printStackTrace();
    }
}

From source file:edu.stanford.muse.index.Indexer.java

License:Apache License

/**
 * sets up indexer just for reading... if needed for writing only, call
 * setupForWrite. if need both read & write, call both.
 */
synchronized void setupForRead() {
    log.info("setting up index for read only access");
    long startTime = System.currentTimeMillis();

    //closeHandles();
    try {
        setupDirectory();

        String[] defaultSearchFields, defaultSearchFieldsOriginal;
        String[] defaultSearchFieldSubject = new String[] { "title" }; // for subject only search
        String[] defaultSearchFieldCorrespondents;
        //body field should be there, as the content of the attachment lies in this field, should also include meta field?
        //why the search over en-names and en-names-original when body/body_original is included in the search fields?
        defaultSearchFields = new String[] { "body", "title", "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        defaultSearchFieldsOriginal = new String[] { "body_original", "title" }; // we want to leave title there because we want to always hit the title -- discussed with Peter June 27 2015
        defaultSearchFieldCorrespondents = new String[] { "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // names field added above after email discussion with Sit 6/11/2013. problem is that we're not using the Lucene EnglishPossessiveFilter, so
        // NER will extract the name Stanford University in a sentence like:
        // "This is Stanford University's website."
        // but when the user clicks on the name "Stanford University" in say monthly cards, we
        // will not match the message with this sentence because of the apostrophe.

        //for searching an attachment with fileName
        String[] metaSearchFields = new String[] { "fileName" };
        // Parse a simple query that searches for "text":
        if (parser == null) {
            //parser = new QueryParser(MUSE_LUCENE_VERSION, defaultSearchField, analyzer);
            parser = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFields, analyzer);
            parserOriginal = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldsOriginal, analyzer);
            parserSubject = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldSubject, analyzer);
            parserCorrespondents = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldCorrespondents,
                    analyzer);
            parserMeta = new MultiFieldQueryParser(LUCENE_VERSION, metaSearchFields, new KeywordAnalyzer());
        }

        /**
         * Bunch of gotchas here
         * It's a bad idea to store lucene internal docIds, as no assumptions about the internal docIds should be made;
         * not even that they are serial. When searching, lucene may ignore logically deleted docs.
         * Lucene does not handle deleted docs, and having these docs in search may bring down the search performance by 50%
         * Deleted docs are cleaned only during merging of indices.*/
        int numContentDocs = 0, numContentDeletedDocs = 0, numAttachmentDocs = 0, numAttachmentDeletedDocs = 0;
        if (DirectoryReader.indexExists(directory)) {
            DirectoryReader ireader = DirectoryReader.open(directory);
            if (ireader.numDeletedDocs() > 0)
                log.warn("!!!!!!!\nIndex reader has " + ireader.numDocs() + " doc(s) of which "
                        + ireader.numDeletedDocs() + " are deleted\n!!!!!!!!!!");
            isearcher = new IndexSearcher(ireader);
            contentDocIds = new LinkedHashMap<>();
            numContentDocs = ireader.numDocs();
            numContentDeletedDocs = ireader.numDeletedDocs();

            Bits liveDocs = MultiFields.getLiveDocs(ireader);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;

                if (doc == null || doc.get("docId") == null)
                    continue;
                contentDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + contentDocIds.size() + " content docs");
        }

        if (DirectoryReader.indexExists(directory_blob)) {
            IndexReader ireader_blob = DirectoryReader.open(directory_blob);
            isearcher_blob = new IndexSearcher(ireader_blob); // read-only=true
            blobDocIds = new LinkedHashMap<Integer, String>();

            numAttachmentDocs = ireader_blob.numDocs();
            numAttachmentDeletedDocs = ireader_blob.numDeletedDocs();

            Bits liveDocs = MultiFields.getLiveDocs(ireader_blob);
            Set<String> fieldsToLoad = new HashSet<String>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader_blob.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader_blob.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;

                if (doc == null || doc.get("docId") == null)
                    continue;
                blobDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + blobDocIds.size() + " attachment docs");
        }

        log.warn("Number of content docs: " + numContentDocs + ", number deleted: " + numContentDeletedDocs);
        log.warn("Number of attachment docs: " + numAttachmentDocs + ", number deleted: "
                + numAttachmentDeletedDocs);

        if (dirNameToDocIdMap == null)
            dirNameToDocIdMap = new LinkedHashMap<String, Map<Integer, String>>();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }
    log.info("Setting up index for read took " + (System.currentTimeMillis() - startTime) + " ms");
}

From source file:edu.stanford.muse.index.Indexer.java

License:Apache License

private synchronized Directory copyDirectoryExcludeFields(Directory dir, String out_basedir, String out_name,
        String... fields_to_be_removed) throws IOException {
    IndexReader reader = DirectoryReader.open(dir); // IndexReader.open(dir, true); // read-only=true

    Directory newDir = createDirectory(out_basedir, out_name);
    IndexWriter writer = openIndexWriter(newDir);
    //log.info("Removing field(s) " + Util.join(fields_to_be_removed, ", ") + " from index.");

    // note: iterating ids up to numDocs() assumes the index has no deletions,
    // since ids run up to maxDoc(); see the deletion-safe sketch after this method
    for (int i = 0; i < reader.numDocs(); i++) {
        org.apache.lucene.document.Document doc = reader.document(i);
        for (String field : fields_to_be_removed)
            doc.removeFields(field);
        writer.addDocument(doc);
    }

    writer.close();
    reader.close();

    return newDir;
}
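
The loop above walks document ids up to numDocs(), which only works when the source index contains no deletions, since ids run up to maxDoc() and numDocs() excludes deleted documents. A deletion-safe variant of the same copy loop, following the liveDocs pattern used in setupForRead above, might look like this (a sketch, not the original code):

    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (liveDocs != null && !liveDocs.get(i))
            continue; // skip deleted documents
        org.apache.lucene.document.Document doc = reader.document(i);
        for (String field : fields_to_be_removed)
            doc.removeFields(field);
        writer.addDocument(doc);
    }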

From source file:edu.stanford.muse.index.Indexer.java

License:Apache License

private synchronized Directory copyDirectoryWithDocFilter(Directory dir, String out_basedir, String out_name,
        FilterFunctor filter_func) throws IOException {
    long startTime = System.currentTimeMillis();
    IndexReader reader = DirectoryReader.open(dir); // IndexReader.open(dir, true); // read-only=true

    Directory newDir = createDirectory(out_basedir, out_name);
    IndexWriter writer = openIndexWriter(newDir);
    //log.info("Removing field(s) " + Util.join(fields_to_be_removed, ", ") + " from index.");

    int count = 0;
    // as in copyDirectoryExcludeFields above, this assumes the source index has no deletions
    for (int i = 0; i < reader.numDocs(); i++) {
        org.apache.lucene.document.Document doc = reader.document(i);
        if (filter_func == null || filter_func.filter(doc)) {
            writer.addDocument(doc);
            count++;
        }
    }

    writer.close();
    reader.close();

    log.info("CopyDirectoryWithtDocFilter to dir:" + out_basedir + " name: " + baseDir + " time: "
            + (System.currentTimeMillis() - startTime) + " ms docs: " + count);
    return newDir;
}

From source file:edu.umd.umiacs.clip.tools.scor.BM25Scorer.java

License:Apache License

public BM25Scorer(IndexReader ir, String field) {
    super(ir, field);
    k1 = 1.2f;
    b = 0.75f;
    try {
        avgdl = ir.getSumTotalTermFreq(field) / (float) ir.numDocs();
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
    cache = new float[(int) (avgdl * 10)];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * (1 - b + b * (i / avgdl));
    }
}
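
The cache precomputes BM25's document-length normalization factor for each possible document length dl up to ten times the average: cache[dl] = k1 * (1 - b + b * dl / avgdl), with the usual defaults k1 = 1.2 and b = 0.75. In the standard BM25 formula this factor sits in the denominator of the term-frequency component, so a scoring method (not shown here) would presumably combine it along these lines:

    // hypothetical use of the precomputed factor in a standard BM25 term score
    float score = idf * tf * (k1 + 1) / (tf + cache[dl]);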

From source file:edu.umd.umiacs.clip.tools.scor.TFIDF.java

License:Apache License

public TFIDF(IndexReader ir, String field) {
    this.ir = ir;
    this.field = field;
    N = ir.numDocs();
}
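
Here numDocs() is cached once as N, the collection size that a TF-IDF scorer would presumably plug into the classic idf = log(N / docFreq) formula when scoring individual terms.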

From source file:edu.unika.aifb.graphindex.index.KeywordIndexBuilder.java

License:Open Source License

public void indexKeywords() throws StorageException, IOException {
    File indexDir = idxDirectory.getDirectory(IndexDirectory.KEYWORD_DIR, !resume);
    File valueDir = idxDirectory.getDirectory(IndexDirectory.VALUE_DIR, !resume);

    this.objectProperties = Util.readEdgeSet(idxDirectory.getFile(IndexDirectory.OBJECT_PROPERTIES_FILE));
    this.relations = Util.readEdgeSet(idxDirectory.getTempFile("relations", false));
    this.attributes = Util.readEdgeSet(idxDirectory.getTempFile("attributes", false));
    properties = new HashSet<String>();
    properties.addAll(relations);
    properties.addAll(attributes);

    log.debug("attributes: " + attributes.size() + ", relations: " + relations.size());

    try {
        //         HyphenationCompoundWordAnalyzer analyzer = new HyphenationCompoundWordAnalyzer("./res/en_hyph_US.xml", "./res/en_US.dic");
        //         DictionaryCompoundWordAnalyzer analyzer = new DictionaryCompoundWordAnalyzer("./res/en_US.dic");
        CapitalizationSplitterAnalyzer analyzer = new CapitalizationSplitterAnalyzer();
        StandardAnalyzer valueAnalyzer = new StandardAnalyzer();
        IndexWriter indexWriter = new IndexWriter(indexDir, analyzer, !resume,
                new MaxFieldLength(MAXFIELDLENGTH));
        log.debug("max terms per field: " + indexWriter.getMaxFieldLength());

        valueWriter = new IndexWriter(valueDir, valueAnalyzer, !resume, new MaxFieldLength(MAXFIELDLENGTH));

        org.apache.lucene.index.IndexReader reader = null;
        if (resume) {
            reader = org.apache.lucene.index.IndexReader.open(FSDirectory.getDirectory(indexDir), true);
            log.debug("docs: " + reader.numDocs());
        }

        if (!resume) {
            log.info("Indexing concepts");
            indexSchema(indexWriter, idxDirectory.getTempFile("concepts", false), TypeUtil.CONCEPT,
                    CONCEPT_BOOST);

            log.info("Indexing attributes");
            indexSchema(indexWriter, idxDirectory.getTempFile("attributes", false), TypeUtil.ATTRIBUTE,
                    ATTRIBUTE_BOOST);

            log.info("Indexing relations");
            indexSchema(indexWriter, idxDirectory.getTempFile("relations", false), TypeUtil.RELATION,
                    RELATION_BOOST);
        }

        log.info("Indexing entities");
        indexEntity(indexWriter, idxDirectory.getTempFile("entities", false), reader);

        indexWriter.commit();
        valueWriter.commit();

        log.debug("optimizing...");
        indexWriter.optimize();
        valueWriter.optimize();

        indexWriter.close();
        valueWriter.close();

        if (blockSearcher != null)
            blockSearcher.close();
        ns.optimize();
        ns.close();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (DatabaseException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}