List of usage examples for org.apache.lucene.index.IndexReader.numDocs()
public abstract int numDocs();
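numDocs() returns the number of live (non-deleted) documents visible to the reader, in contrast to maxDoc(), which also counts deleted documents still present in the segments. A minimal usage sketch follows; the index path is hypothetical, and a DirectoryReader is assumed as the concrete IndexReader:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // hypothetical index location; replace with a real index directory
        try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
             IndexReader reader = DirectoryReader.open(dir)) {
            // numDocs(): live documents only; maxDoc() would also include deletions
            System.out.println("live docs:    " + reader.numDocs());
            System.out.println("deleted docs: " + reader.numDeletedDocs());
        }
    }
}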
From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java
License:Open Source License
public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) {
    try {
        this.indexSearcher = indexSearcher;
        this.jochreQuery = jochreQuery;
        query = rewrite(jochreQuery.getLuceneQuery());
        queryTerms = new TreeSet<Term>();
        query.extractTerms(queryTerms);
        if (LOG.isTraceEnabled())
            queryTermList = new ArrayList<Term>(queryTerms);

        final IndexReader reader = indexSearcher.getIndexReader();
        // add 1 to doc count to ensure even terms in all docs get a very small weight
        docCountLog = Math.log(reader.numDocs() + 1);

        IndexReaderContext readerContext = reader.getContext();
        leaves = readerContext.leaves();

        // since the same terms might be contained in the query multiple times (e.g. once per field)
        // we only consider them once each by using a HashSet
        terms = new HashSet<BytesRef>();
        Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>();
        for (Term term : queryTerms) {
            terms.add(term.bytes());
            termFreqs.put(term.bytes(), 0);
        }

        termLogs = new HashMap<BytesRef, Double>();
        for (Term term : queryTerms) {
            int freq = termFreqs.get(term.bytes());
            freq += reader.docFreq(term);
            termFreqs.put(term.bytes(), freq);
        }
        for (BytesRef term : terms) {
            int freq = termFreqs.get(term);
            termLogs.put(term, Math.log(freq));
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}
From source file:com.main.Searcher.java
public List<Bean> searching(String s1, String s2, String radioBtn)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    // get a reference to the index directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    // IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    // analyzer with the default stop words; removes the stop words
    Analyzer analyzer = new StandardAnalyzer();
    String contents = "contents";
    QueryParser parser = new QueryParser(contents, analyzer);

    // numDocs() gives the number of documents visible to this reader
    int numOfDoc = reader.numDocs();
    for (int i = 0; i < numOfDoc; i++) {
        Document d = reader.document(i);
    }

    Query q1 = parser.parse(s1);
    Query q2 = parser.parse(s2);

    // conjunction, disjunction and negation
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    if (radioBtn.equals("conjunction")) {
        // Occur.MUST: both queries are required in a doc
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST);
    } else if (radioBtn.equals("disjunction")) {
        // Occur.SHOULD: at least one of the queries should be present in a doc
        bq.add(q1, BooleanClause.Occur.SHOULD);
        bq.add(q2, BooleanClause.Occur.SHOULD);
    } else {
        // negation: the first query must be present, the second must not
        bq.add(q1, BooleanClause.Occur.MUST);
        bq.add(q2, BooleanClause.Occur.MUST_NOT);
    }

    TopDocs hits = searcher.search(bq.build(), 10);

    Formatter formatter = new SimpleHTMLFormatter();
    // scores content fragments by the number of unique query terms found
    QueryScorer scorer = new QueryScorer(bq.build());
    // used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    // breaks content up into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
    highlighter.setTextFragmenter(fragmenter);

    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);
        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);
        String path = doc.get("path");
        bean.setPath(path);
        String cont = doc.get("contents");
        // create a token stream for the stored content
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        // get highlighted content fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);
        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {
            dummy.add(frag);
        }
        bean.setContent(dummy);
        beanList.add(bean);
    }
    dir.close();
    return beanList;
}
From source file:com.main.Searcher.java
public List<Bean> searching(String s1)
        throws IOException, ParseException, InvalidTokenOffsetsException {
    // get a reference to the index directory
    Directory dir = FSDirectory.open(Paths.get(Index_Dir));
    // IndexReader - an interface for accessing a point-in-time view of a Lucene index
    IndexReader reader = DirectoryReader.open(dir);
    // IndexSearcher searches over a single IndexReader
    IndexSearcher searcher = new IndexSearcher(reader);
    // analyzer with the default stop words
    Analyzer analyzer = new StandardAnalyzer();
    // query parser to be used for creating a TermQuery
    String queries = null;
    String queryString = null;
    // regular search
    String contents = "contents";
    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(contents, analyzer);

    // numDocs() gives the number of documents visible to this reader
    int numOfDoc = reader.numDocs();
    for (int i = 0; i < numOfDoc; i++) {
        Document d = reader.document(i);
    }

    Query q1 = parser.parse(s1);
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(q1, BooleanClause.Occur.MUST);

    // search the Lucene documents
    TopDocs hits = searcher.search(bq.build(), 10);
    // TopScoreDocCollector collector = TopScoreDocCollector.create(5);

    /* Highlighter code start */
    // uses HTML <B></B> tags to highlight the searched terms
    Formatter formatter = new SimpleHTMLFormatter();
    // scores content fragments by the number of unique query terms found
    // (the matching score, in layman's terms)
    QueryScorer scorer = new QueryScorer(bq.build());
    // used to mark up highlighted terms found in the best sections of the content
    Highlighter highlighter = new Highlighter(formatter, scorer);
    // breaks content up into same-size fragments but does not split up spans
    Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, 10);
    highlighter.setTextFragmenter(fragmenter);

    // iterate over the found results
    for (int i = 0; i < hits.scoreDocs.length; i++) {
        Bean bean = new Bean();
        int outResult = hits.scoreDocs.length;
        bean.setNumFile(outResult);
        int docid = hits.scoreDocs[i].doc;
        double rank = hits.scoreDocs[i].score;
        bean.setRankSc(rank);
        Document doc = searcher.doc(docid);
        String name = doc.get("name");
        String title = doc.get("title");
        bean.setTitle(name);
        String path = doc.get("path");
        bean.setPath(path);
        String cont = doc.get("contents");
        // create a token stream for the stored content
        TokenStream stream = TokenSources.getAnyTokenStream(reader, docid, "contents", analyzer);
        // get highlighted content fragments
        String[] frags = highlighter.getBestFragments(stream, cont, 10);
        ArrayList<String> dummy = new ArrayList<>();
        for (String frag : frags) {
            dummy.add(frag);
        }
        bean.setContent(dummy);
        beanList.add(bean);
    }
    dir.close();
    return beanList;
}
From source file:com.mothsoft.alexis.dao.DocumentDaoImpl.java
License:Apache License
@SuppressWarnings("unchecked") private List<ImportantTerm> getImportantTerms(FullTextQuery fullTextQuery, int count, boolean filterStopWords) { final Long start = System.currentTimeMillis(); final List<Object[]> results = fullTextQuery.list(); final LinkedHashMap<String, Tuple<Integer, Float>> termCountMap = new LinkedHashMap<String, Tuple<Integer, Float>>(); final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate()); final SearchFactory searchFactory = fullTextSession.getSearchFactory(); final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor(); final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class); final IndexSearcher searcher = new IndexSearcher(reader); final List<ImportantTerm> importantTerms; final int numDocs; try {// ww w . j a v a 2s . c o m numDocs = reader.numDocs(); Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME); if (logger.isDebugEnabled()) { logger.debug(String.format("Found %d matching Lucene documents of %d in reader", results.size(), numDocs)); } // loop over all the matching documents for (final Object[] ith : results) { int docId = ((Number) ith[0]).intValue(); final TermFreqVector tfv = reader.getTermFreqVector(docId, CONTENT_TEXT_FIELD_NAME); if (tfv == null) { continue; } final String[] terms = tfv.getTerms(); final int[] freqs = tfv.getTermFrequencies(); // total document size int size = 0; for (int freq : freqs) { size += freq; } if (logger.isDebugEnabled()) { logger.debug( String.format("Lucene document %d has %d terms, to be merged with running count %d", docId, size, termCountMap.size())); } // loop over the terms and aggregate the counts and tf-idf int i = 0; for (final String term : terms) { if (StopWords.ENGLISH.contains(term)) { continue; } luceneTerm = luceneTerm.createTerm(term); final int termCount = freqs[i++]; final Tuple<Integer, Float> countScore; if (termCountMap.containsKey(term)) { countScore = termCountMap.get(term); countScore.t1 += termCount; countScore.t2 += (TFIDF.score(term, termCount, size, numDocs, searcher.docFreq(luceneTerm))); } else { countScore = new Tuple<Integer, Float>(); countScore.t1 = termCount; countScore.t2 = (TFIDF.score(term, termCount, size, numDocs, searcher.docFreq(luceneTerm))); termCountMap.put(term, countScore); } } } if (logger.isDebugEnabled()) { logger.debug("Completed Lucene document processing."); } importantTerms = new ArrayList<ImportantTerm>(termCountMap.size()); // find max TF-IDF float maxTfIdf = 0.0f; for (final Tuple<Integer, Float> ith : termCountMap.values()) { if (ith.t2 > maxTfIdf) { maxTfIdf = ith.t2; } } for (final Map.Entry<String, Tuple<Integer, Float>> entry : termCountMap.entrySet()) { final int ithCount = entry.getValue().t1; final float ithTfIdf = entry.getValue().t2; importantTerms.add(new ImportantTerm(entry.getKey(), ithCount, ithTfIdf, maxTfIdf)); } if (logger.isDebugEnabled()) { logger.debug("Completed term aggregation, will clear term map"); } termCountMap.clear(); } catch (IOException e) { throw new RuntimeException(e); } finally { try { searcher.close(); } catch (IOException e) { logger.warn("Failed to close searcher: " + e, e); } ira.close(reader); } if (logger.isDebugEnabled()) { logger.debug("Sorting terms"); } Collections.sort(importantTerms, new Comparator<ImportantTerm>() { @Override public int compare(ImportantTerm term1, ImportantTerm term2) { return -1 * term1.getTfIdf().compareTo(term2.getTfIdf()); } }); if (logger.isDebugEnabled()) { logger.debug("Term sort complete"); } if 
(importantTerms.isEmpty() || importantTerms.size() < count) { if (logger.isDebugEnabled()) { logger.debug("Will return full list."); } logger.debug("Timer: " + (System.currentTimeMillis() - start)); return importantTerms; } else { if (logger.isDebugEnabled()) { logger.debug( "Will return sublist containing " + count + " of " + importantTerms.size() + " terms."); } logger.debug("Timer: " + (System.currentTimeMillis() - start)); return importantTerms.subList(0, count); } }
From source file:com.mothsoft.alexis.engine.textual.TFIDFCalculatorImpl.java
License:Apache License
@SuppressWarnings("unchecked") @Transactional/*from w w w. ja va2 s.c om*/ public void execute() { final long start = System.currentTimeMillis(); final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate()); final SearchFactory searchFactory = fullTextSession.getSearchFactory(); final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor(); final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class); final Query query = em.createQuery( "select d from Document d join d.documentTerms dt where dt.tfIdf IS NULL ORDER BY d.id ASC"); final List<Document> documents = query.getResultList(); final Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME); int affectedRows = 0; try { for (final Document document : documents) { final Map<String, Float> termTfIdfMap = new HashMap<String, Float>(); // calculate term TF-IDFs for (final DocumentTerm documentTerm : document.getDocumentTerms()) { final Term term = luceneTerm.createTerm(documentTerm.getTerm().getValueLowercase()); Float score = TFIDF.score(documentTerm.getTerm().getValueLowercase(), documentTerm.getCount(), document.getTermCount(), reader.numDocs(), reader.docFreq(term)); documentTerm.setTfIdf(score); termTfIdfMap.put(documentTerm.getTerm().getValueLowercase(), score); affectedRows++; } // update association weights for (final DocumentAssociation documentAssociation : document.getDocumentAssociations()) { final String a = documentAssociation.getA().getValueLowercase(); final String b = documentAssociation.getB().getValueLowercase(); documentAssociation.setAssociationWeight((float) documentAssociation.getAssociationCount() * (termTfIdfMap.get(a) + termTfIdfMap.get(b))); } } } catch (IOException e) { throw new RuntimeException(e); } finally { ira.close(reader); } logger.info("TF-IDF calc took: " + ((System.currentTimeMillis() - start) / 1000.00) + " seconds and affected " + affectedRows + " rows."); }
From source file:com.netspective.sparx.navigate.fts.FullTextSearchPage.java
License:Open Source License
protected void readIndexInfo(File indexDir) throws IOException {
    IndexReader indexReader = IndexReader.open(indexDir);
    totalDocsInIndex = indexReader.numDocs();

    // collect all field names, then only the indexed field names
    List fields = new ArrayList();
    List indexedFields = new ArrayList();

    Iterator fieldIterator = indexReader.getFieldNames().iterator();
    while (fieldIterator.hasNext()) {
        Object field = fieldIterator.next();
        if (field != null && !field.equals(""))
            fields.add(field.toString());
    }

    fieldIterator = indexReader.getFieldNames(true).iterator();
    while (fieldIterator.hasNext()) {
        Object field = fieldIterator.next();
        if (field != null && !field.equals(""))
            indexedFields.add(field.toString());
    }

    indexReader.close();

    allFieldNames = (String[]) fields.toArray(new String[fields.size()]);
    allIndexedFieldNames = (String[]) indexedFields.toArray(new String[indexedFields.size()]);
}
From source file:com.orientechnologies.lucene.engine.OLuceneIndexEngineAbstract.java
License:Apache License
@Override
public long sizeInTx(OLuceneTxChanges changes) {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        searcher = searcher();
        if (searcher != null)
            reader = searcher.getIndexReader();
    } catch (IOException e) {
        OLogManager.instance().error(this, "Error on getting size of Lucene index", e);
    } finally {
        if (searcher != null) {
            release(searcher);
        }
    }
    // documents visible to the reader, plus any pending transaction changes
    return changes == null ? reader.numDocs() : reader.numDocs() + changes.numDocs();
}
From source file:com.orientechnologies.lucene.manager.OLuceneIndexManagerAbstract.java
License:Apache License
public long size(final ValuesTransformer<V> transformer) {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        reader = getSearcher().getIndexReader();
    } catch (IOException e) {
        OLogManager.instance().error(this, "Error on getting size of Lucene index", e);
    } finally {
        if (searcher != null) {
            release(searcher);
        }
    }
    return reader.numDocs();
}
From source file:com.pjaol.search.geo.utils.DistanceFilter.java
License:Apache License
@Override
public BitSet bits(IndexReader reader) throws IOException {
    /* Create a BitSet to store the result */
    int maxdocs = reader.numDocs();
    BitSet bits = new BitSet(maxdocs);

    setPrecision(maxdocs);

    /* create an intermediate cache to avoid recomputing distances for the same point
       TODO: Why is this a WeakHashMap? */
    WeakHashMap<String, Double> cdistance = new WeakHashMap<String, Double>(maxdocs);

    String[] latIndex = FieldCache.DEFAULT.getStrings(reader, latField);
    String[] lngIndex = FieldCache.DEFAULT.getStrings(reader, lngField);

    /* store calculated distances for reuse by other components */
    distances = new HashMap<Integer, Double>(maxdocs);

    for (int i = 0; i < maxdocs; i++) {
        String sx = latIndex[i];
        String sy = lngIndex[i];
        if (sx != null && sy != null) {
            double x = NumberUtils.SortableStr2double(sx);
            double y = NumberUtils.SortableStr2double(sy);

            // round off lat / longs if necessary
            x = DistanceHandler.getPrecision(x, precise);
            y = DistanceHandler.getPrecision(y, precise);

            String ck = new Double(x).toString() + "," + new Double(y).toString();
            Double cachedDistance = cdistance.get(ck);

            double d;
            if (cachedDistance != null) {
                d = cachedDistance.doubleValue();
            } else {
                d = DistanceUtils.getDistanceMi(lat, lng, x, y);
                cdistance.put(ck, d);
            }

            distances.put(i, d);

            if (distance < 0 || d < distance) {
                bits.set(i);
            }
        }
    }
    return bits;
}
From source file:com.qwazr.search.index.IndexStatus.java
License:Apache License
public IndexStatus(IndexReader indexReader, IndexSettingsDefinition settings, Set<String> analyzers,
        Set<String> fields) {
    num_docs = (long) indexReader.numDocs();
    num_deleted_docs = (long) indexReader.numDeletedDocs();
    this.settings = settings;
    this.analyzers = analyzers;
    this.fields = fields;
}