Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
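For quick reference, here is a minimal, self-contained sketch (the index path "/tmp/lucene-index" is an assumption) that opens an existing index and compares numDocs() with maxDoc() and numDeletedDocs():

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // assumption: an index already exists at this path
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/lucene-index"));
                DirectoryReader reader = DirectoryReader.open(dir)) {
            // numDocs() counts live (non-deleted) documents only;
            // maxDoc() includes deleted documents that have not yet been merged away
            System.out.println("live docs:    " + reader.numDocs());
            System.out.println("max doc:      " + reader.maxDoc());
            System.out.println("deleted docs: " + reader.numDeletedDocs());
        }
    }
}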

Usage

From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java

License:Open Source License

public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) {
    try {
        this.indexSearcher = indexSearcher;
        this.jochreQuery = jochreQuery;
        query = rewrite(jochreQuery.getLuceneQuery());
        queryTerms = new TreeSet<Term>();
        query.extractTerms(queryTerms);
        if (LOG.isTraceEnabled())
            queryTermList = new ArrayList<Term>(queryTerms);

        final IndexReader reader = indexSearcher.getIndexReader();
        // add 1 to doc count to ensure even terms in all docs get a very small weight
        docCountLog = Math.log(reader.numDocs() + 1);

        IndexReaderContext readerContext = reader.getContext();
        leaves = readerContext.leaves();

        // since the same terms might be contained in the query multiple times (e.g. once per field)
        // we only consider them once each by using a HashSet
        terms = new HashSet<BytesRef>();
        Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>();
        for (Term term : queryTerms) {
            terms.add(term.bytes());
            termFreqs.put(term.bytes(), 0);
        }

        termLogs = new HashMap<BytesRef, Double>();
        for (Term term : queryTerms) {
            int freq = termFreqs.get(term.bytes());
            freq += reader.docFreq(term);
            termFreqs.put(term.bytes(), freq);
        }
        for (BytesRef term : terms) {
            int freq = termFreqs.get(term);
            termLogs.put(term, Math.log(freq));
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}

From source file:com.main.Searcher.java

public List<Bean> searching(String s1, String s2, String radioBtn)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    //get a reference to the index directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));

    //IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);

    IndexSearcher searcher = new IndexSearcher(reader);
    //StandardAnalyzer with the default stop-word set; stop words are removed during analysis
    Analyzer analyzer = new StandardAnalyzer();

    String contents = "contents";

    QueryParser parser = new QueryParser(contents, analyzer);

    int numOfDoc = reader.numDocs();

    for (int i = 0; i < numOfDoc; i++) {

        Document d = reader.document(i);

    }

    Query q1 = parser.parse(s1);
    Query q2 = parser.parse(s2);

    //conjunction, disjunction and negation
    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    //Occur.MUST: both queries are required in the doc
    if (radioBtn.equals("conjunction")) {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST);
        bq.build();
    } //Occur.SHOULD: at least one of the two queries should be present in the doc
    else if (radioBtn.equals("disjunction")) {
        bq.add(q1, BooleanClause.Occur.SHOULD);
        bq.add(q2, BooleanClause.Occur.SHOULD);
        bq.build();
    } //negation: the first query must be present, the second must not
    else {
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST_NOT);
        bq.build();
    }

    TopDocs hits = searcher.search(bq.build(), 10);

    Formatter formatter = new SimpleHTMLFormatter();

    QueryScorer scorer = new QueryScorer(bq.build());

    //used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //SimpleSpanFragmenter breaks the content into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);

    //set fragmenter to highlighter
    highlighter.setTextFragmenter(fragmenter);

    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();

        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);

        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);

        String path = doc.get("path");
        bean.setPath(path);

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted cont fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {

            dummy.add(frag);
        }

        bean.setContent(dummy);
        beanList.add(bean);
    }

    dir.close();
    // }
    return beanList;
}

From source file:com.main.Searcher.java

public List<Bean> searching(String s1) throws IOException, ParseException, InvalidTokenOffsetsException {
    //Get directory reference
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    //IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);
    //IndexSearcher - the Lucene searcher; it searches over a single IndexReader
    IndexSearcher searcher = new IndexSearcher(reader);
    //analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer();
    //Query parser to be used for creating TermQuery

    String queries = null;
    String queryString = null; //regular search
    String contents = "contents";
    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(contents, analyzer);

    int numOfDoc = reader.numDocs();

    for (int i = 0; i < numOfDoc; i++) {

        Document d = reader.document(i);

    }

    Query q1 = parser.parse(s1);

    BooleanQuery.Builder bq = new BooleanQuery.Builder();

    bq.add(q1, BooleanClause.Occur.MUST);
    //Search the lucene documents
    TopDocs hits = searcher.search(bq.build(), 10);
    // TopScoreDocCollector collector = TopScoreDocCollector.create(5);
    /**
     * Highlighter Code Start ***
     */
    //Uses HTML <B></B> tags to highlight the searched terms
    Formatter formatter = new SimpleHTMLFormatter();
    //QueryScorer scores content fragments by the number of unique query terms found
    //Basically the matching score, in layman's terms
    QueryScorer scorer = new QueryScorer(bq.build());
    //used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    //SimpleSpanFragmenter breaks the content into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);

    //set fragmenter to highlighter
    highlighter.setTextFragmenter(fragmenter);
    //Iterate over found results
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        //int rank = hits.scoreDocs.length;
        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);
        // String title = doc.get("title");
        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);

        String path = doc.get("path");
        bean.setPath(path);

        String cont = doc.get("contents");
        //Create token stream
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        //Get highlighted cont fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);

        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {

            dummy.add(frag);
        }

        bean.setContent(dummy);
        beanList.add(bean);
    }

    dir.close();
    // }
    return beanList;
}

From source file:com.mothsoft.alexis.dao.DocumentDaoImpl.java

License:Apache License

@SuppressWarnings("unchecked")
private List<ImportantTerm> getImportantTerms(FullTextQuery fullTextQuery, int count, boolean filterStopWords) {
    final Long start = System.currentTimeMillis();
    final List<Object[]> results = fullTextQuery.list();
    final LinkedHashMap<String, Tuple<Integer, Float>> termCountMap = new LinkedHashMap<String, Tuple<Integer, Float>>();

    final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate());
    final SearchFactory searchFactory = fullTextSession.getSearchFactory();
    final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor();
    final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class);
    final IndexSearcher searcher = new IndexSearcher(reader);

    final List<ImportantTerm> importantTerms;
    final int numDocs;
    try {
        numDocs = reader.numDocs();
        Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME);

        if (logger.isDebugEnabled()) {
            logger.debug(String.format("Found %d matching Lucene documents of %d in reader", results.size(),
                    numDocs));
        }

        // loop over all the matching documents
        for (final Object[] ith : results) {
            int docId = ((Number) ith[0]).intValue();
            final TermFreqVector tfv = reader.getTermFreqVector(docId, CONTENT_TEXT_FIELD_NAME);

            if (tfv == null) {
                continue;
            }

            final String[] terms = tfv.getTerms();
            final int[] freqs = tfv.getTermFrequencies();

            // total document size
            int size = 0;

            for (int freq : freqs) {
                size += freq;
            }

            if (logger.isDebugEnabled()) {
                logger.debug(
                        String.format("Lucene document %d has %d terms, to be merged with running count %d",
                                docId, size, termCountMap.size()));
            }

            // loop over the terms and aggregate the counts and tf-idf
            int i = 0;
            for (final String term : terms) {
                if (StopWords.ENGLISH.contains(term)) {
                    continue;
                }

                luceneTerm = luceneTerm.createTerm(term);
                final int termCount = freqs[i++];

                final Tuple<Integer, Float> countScore;
                if (termCountMap.containsKey(term)) {
                    countScore = termCountMap.get(term);
                    countScore.t1 += termCount;
                    countScore.t2 += (TFIDF.score(term, termCount, size, numDocs,
                            searcher.docFreq(luceneTerm)));
                } else {
                    countScore = new Tuple<Integer, Float>();
                    countScore.t1 = termCount;
                    countScore.t2 = (TFIDF.score(term, termCount, size, numDocs, searcher.docFreq(luceneTerm)));
                    termCountMap.put(term, countScore);
                }
            }
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Completed Lucene document processing.");
        }

        importantTerms = new ArrayList<ImportantTerm>(termCountMap.size());

        // find max TF-IDF
        float maxTfIdf = 0.0f;
        for (final Tuple<Integer, Float> ith : termCountMap.values()) {
            if (ith.t2 > maxTfIdf) {
                maxTfIdf = ith.t2;
            }
        }

        for (final Map.Entry<String, Tuple<Integer, Float>> entry : termCountMap.entrySet()) {
            final int ithCount = entry.getValue().t1;
            final float ithTfIdf = entry.getValue().t2;
            importantTerms.add(new ImportantTerm(entry.getKey(), ithCount, ithTfIdf, maxTfIdf));
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Completed term aggregation, will clear term map");
        }

        termCountMap.clear();

    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        try {
            searcher.close();
        } catch (IOException e) {
            logger.warn("Failed to close searcher: " + e, e);
        }
        ira.close(reader);
    }

    if (logger.isDebugEnabled()) {
        logger.debug("Sorting terms");
    }

    Collections.sort(importantTerms, new Comparator<ImportantTerm>() {
        @Override
        public int compare(ImportantTerm term1, ImportantTerm term2) {
            return -1 * term1.getTfIdf().compareTo(term2.getTfIdf());
        }
    });

    if (logger.isDebugEnabled()) {
        logger.debug("Term sort complete");
    }

    if (importantTerms.isEmpty() || importantTerms.size() < count) {
        if (logger.isDebugEnabled()) {
            logger.debug("Will return full list.");
        }
        logger.debug("Timer: " + (System.currentTimeMillis() - start));
        return importantTerms;
    } else {
        if (logger.isDebugEnabled()) {
            logger.debug(
                    "Will return sublist containing " + count + " of " + importantTerms.size() + " terms.");
        }

        logger.debug("Timer: " + (System.currentTimeMillis() - start));
        return importantTerms.subList(0, count);
    }
}

From source file:com.mothsoft.alexis.engine.textual.TFIDFCalculatorImpl.java

License:Apache License

@SuppressWarnings("unchecked")
@Transactional
public void execute() {
    final long start = System.currentTimeMillis();

    final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate());
    final SearchFactory searchFactory = fullTextSession.getSearchFactory();
    final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor();
    final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class);

    final Query query = em.createQuery(
            "select d from Document d join d.documentTerms dt where dt.tfIdf IS NULL ORDER BY d.id ASC");
    final List<Document> documents = query.getResultList();

    final Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME);
    int affectedRows = 0;

    try {
        for (final Document document : documents) {
            final Map<String, Float> termTfIdfMap = new HashMap<String, Float>();

            // calculate term TF-IDFs
            for (final DocumentTerm documentTerm : document.getDocumentTerms()) {
                final Term term = luceneTerm.createTerm(documentTerm.getTerm().getValueLowercase());
                Float score = TFIDF.score(documentTerm.getTerm().getValueLowercase(), documentTerm.getCount(),
                        document.getTermCount(), reader.numDocs(), reader.docFreq(term));
                documentTerm.setTfIdf(score);
                termTfIdfMap.put(documentTerm.getTerm().getValueLowercase(), score);
                affectedRows++;
            }

            // update association weights
            for (final DocumentAssociation documentAssociation : document.getDocumentAssociations()) {
                final String a = documentAssociation.getA().getValueLowercase();
                final String b = documentAssociation.getB().getValueLowercase();
                documentAssociation.setAssociationWeight((float) documentAssociation.getAssociationCount()
                        * (termTfIdfMap.get(a) + termTfIdfMap.get(b)));
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        ira.close(reader);
    }

    logger.info("TF-IDF calc took: " + ((System.currentTimeMillis() - start) / 1000.00)
            + " seconds and affected " + affectedRows + " rows.");
}

From source file:com.netspective.sparx.navigate.fts.FullTextSearchPage.java

License:Open Source License

protected void readIndexInfo(File indexDir) throws IOException {
    IndexReader indexReader = IndexReader.open(indexDir);
    totalDocsInIndex = indexReader.numDocs();

    List fields = new ArrayList();
    List indexedFields = new ArrayList();

    Iterator fieldIterator = indexReader.getFieldNames().iterator();
    while (fieldIterator.hasNext()) {
        Object field = fieldIterator.next();
        if (field != null && !field.equals(""))
            fields.add(field.toString());
    }

    fieldIterator = indexReader.getFieldNames(true).iterator();
    while (fieldIterator.hasNext()) {
        Object field = fieldIterator.next();
        if (field != null && !field.equals(""))
            indexedFields.add(field.toString());
    }
    indexReader.close();

    allFieldNames = (String[]) fields.toArray(new String[fields.size()]);
    allIndexedFieldNames = (String[]) indexedFields.toArray(new String[indexedFields.size()]);
}

From source file:com.orientechnologies.lucene.engine.OLuceneIndexEngineAbstract.java

License:Apache License

@Override
public long sizeInTx(OLuceneTxChanges changes) {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        searcher = searcher();
        if (searcher != null)
            reader = searcher.getIndexReader();
    } catch (IOException e) {
        OLogManager.instance().error(this, "Error on getting size of Lucene index", e);
    } finally {
        if (searcher != null) {
            release(searcher);
        }
    }
    return changes == null ? reader.numDocs() : reader.numDocs() + changes.numDocs();
}

From source file:com.orientechnologies.lucene.manager.OLuceneIndexManagerAbstract.java

License:Apache License

public long size(final ValuesTransformer<V> transformer) {

    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        reader = getSearcher().getIndexReader();
    } catch (IOException e) {
        OLogManager.instance().error(this, "Error on getting size of Lucene index", e);
    } finally {
        if (searcher != null) {
            release(searcher);
        }
    }
    return reader.numDocs();
}

From source file:com.pjaol.search.geo.utils.DistanceFilter.java

License:Apache License

@Override
public BitSet bits(IndexReader reader) throws IOException {

    /* Create a BitSet to store the result */
    int maxdocs = reader.numDocs();
    BitSet bits = new BitSet(maxdocs);

    setPrecision(maxdocs);
    /* create an intermediate cache to avoid recomputing
     distances for the same point
     TODO: Why is this a WeakHashMap? */
    WeakHashMap<String, Double> cdistance = new WeakHashMap<String, Double>(maxdocs);

    String[] latIndex = FieldCache.DEFAULT.getStrings(reader, latField);
    String[] lngIndex = FieldCache.DEFAULT.getStrings(reader, lngField);

    /* store calculated distances for reuse by other components */
    distances = new HashMap<Integer, Double>(maxdocs);
    for (int i = 0; i < maxdocs; i++) {

        String sx = latIndex[i];
        String sy = lngIndex[i];
        if (sx != null && sy != null) {

            double x = NumberUtils.SortableStr2double(sx);
            double y = NumberUtils.SortableStr2double(sy);

            // round off lat / longs if necessary
            x = DistanceHandler.getPrecision(x, precise);
            y = DistanceHandler.getPrecision(y, precise);

            String ck = new Double(x).toString() + "," + new Double(y).toString();
            Double cachedDistance = cdistance.get(ck);

            double d;

            if (cachedDistance != null) {
                d = cachedDistance.doubleValue();
            } else {
                d = DistanceUtils.getDistanceMi(lat, lng, x, y);
                cdistance.put(ck, d);
            }
            distances.put(i, d);

            if (distance < 0 || d < distance) {
                bits.set(i);
            }
        }

    }

    return bits;
}

From source file:com.qwazr.search.index.IndexStatus.java

License:Apache License

public IndexStatus(IndexReader indexReader, IndexSettingsDefinition settings, Set<String> analyzers,
        Set<String> fields) {
    num_docs = (long) indexReader.numDocs();
    num_deleted_docs = (long) indexReader.numDeletedDocs();
    this.settings = settings;
    this.analyzers = analyzers;
    this.fields = fields;
}