List of usage examples for org.apache.lucene.index.IndexReader.docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term, the term to look up. Returns the number of documents containing the term.
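Before the project examples, a minimal self-contained sketch of the call, assuming a Lucene 4.x index under /tmp/index with an indexed field named "contents" (the path and field name are placeholders):

    import java.io.File;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.FSDirectory;

    public class DocFreqExample {
        public static void main(String[] args) throws Exception {
            // Open a reader on an existing index (hypothetical location).
            IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/index")));
            try {
                // docFreq returns the number of documents containing the term;
                // documents that are deleted but not yet merged away are still counted.
                int df = reader.docFreq(new Term("contents", "lucene"));
                System.out.println("docFreq(contents:lucene) = " + df);
            } finally {
                reader.close();
            }
        }
    }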
From source file:ca.ualberta.entitylinking.common.indexing.DocumentIndexer.java
License:Open Source License
public void readLuceneIndex(String indexDir, String docName) {
    IndexReader reader = null;
    Map<String, Integer> name2id = null;

    // Load the index and build a map from document name to Lucene document id.
    try {
        reader = IndexReader.open(FSDirectory.open(new File(indexDir)));
        String[] stringArray = FieldCache.DEFAULT.getStrings(reader, "name");
        name2id = new HashMap<String, Integer>();
        for (int i = 0; i < stringArray.length; i++)
            name2id.put(stringArray[i], i);
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Get the tf-idf vector of a document.
    DefaultSimilarity simObj = new DefaultSimilarity();
    try {
        if (!name2id.containsKey(docName))
            return;

        int docId = name2id.get(docName);
        Document doc = reader.document(docId);
        TermFreqVector termVector = reader.getTermFreqVector(docId, "contents");
        int numDocs = reader.numDocs();
        int[] termFreq = termVector.getTermFrequencies();
        String[] terms = termVector.getTerms();
        for (int i = 0; i < terms.length; i++) {
            // Avoid stop words.
            // if (isStopWord(terms[i]))
            //     continue;

            int tf = termFreq[i];
            int df = reader.docFreq(new Term("contents", terms[i]));
            float tfidf = simObj.tf(tf) * simObj.idf(df, numDocs);
            System.out.println(terms[i] + ": " + tfidf);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
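For reference, with Lucene 3.x's DefaultSimilarity the value printed above works out to tfidf = sqrt(tf) * (1 + ln(numDocs / (df + 1))): simObj.tf(tf) is the square root of the raw term frequency, and simObj.idf(df, numDocs) log-damps the inverse document frequency derived from docFreq, with the +1 in the denominator avoiding division by zero.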
From source file:com.bdaum.zoom.lal.internal.lucene.Lucene.java
License:Open Source License
public List<ScoredString> listTags(File indexPath, int maxItems) throws IOException {
    Object readerToken = null;
    try {
        readerToken = indexPath == null ? null : getIndexReaderToken(indexPath);
        if (readerToken != null) {
            IndexReader indexReader = readerMap.get(readerToken);
            if (indexReader != null) {
                List<ScoredString> result = new ArrayList<ScoredString>(1000);
                Terms terms = MultiFields.getTerms(indexReader, LireActivator.FIELD_NAME_FULL_TEXT);
                if (terms == null)
                    return null;
                TermsEnum termEnum = terms.iterator();
                BytesRef bytesRef;
                while ((bytesRef = termEnum.next()) != null)
                    result.add(new ScoredString(bytesRef.utf8ToString(),
                            indexReader.docFreq(new Term(LireActivator.FIELD_NAME_FULL_TEXT, bytesRef))));
                Collections.sort(result);
                return (result.size() > maxItems) ? result.subList(0, maxItems) : result;
            }
        }
        return null;
    } finally {
        if (readerToken != null)
            releaseIndexReader(indexPath, readerToken);
    }
}
From source file:com.flaptor.hounder.searcher.spell.SpellChecker.java
License:Apache License
/**
 * Suggest similar words (optionally restricted to a field of a user index).
 *
 * @param word the word you want a spell check done on
 * @param num_sug the number of suggested words
 * @param ir the IndexReader of the user index (can be null; see the field parameter)
 * @param field the field of the user index: if field is not null, the suggested
 *        words are restricted to the words present in this field
 * @param morePopular return only suggested words that are more frequent than the searched word
 *        (only in restricted mode, i.e. indexReader != null and field != null)
 * @throws IOException
 * @return the sorted list of suggested words, ranked by two criteria:
 *         first the edit distance, then (only in restricted mode) the popularity
 *         of the suggested word in the field of the user index
 */
public String[] suggestSimilar(String word, int num_sug, IndexReader ir, String field, boolean morePopular)
        throws IOException {
    float minScore = min;
    final TRStringDistance sd = new TRStringDistance(word);
    final int lengthWord = word.length();

    final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
    final int goalFreq = morePopular ? freq : 0;
    if (!morePopular && freq > 0) {
        // The word exists in the index and a more popular alternative was not requested.
        return new String[] { word };
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;
    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
        key = "gram" + ng; // form key
        grams = formGrams(word, ng); // form word into ngrams (allow dups too)
        if (grams.length == 0) {
            continue;
        }
        if (bStart > 0) { // should we boost prefixes?
            add(query, "start" + ng, grams[0], bStart); // matches start of word
        }
        if (bEnd > 0) { // should we boost suffixes?
            add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
        }
        for (int i = 0; i < grams.length; i++) {
            add(query, key, grams[i]);
        }
    }

    IndexSearcher searcher = new IndexSearcher(this.spellindex);
    // Go through more matches than requested in case the distance filter triggers.
    TopDocCollector collector = new TopDocCollector(10 * num_sug);
    searcher.search(query, collector);
    ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs;

    SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);
    SuggestWord sugword = new SuggestWord();
    for (int i = 0; i < scoreDocs.length; i++) {
        Document doc = searcher.doc(scoreDocs[i].doc); // fetch the stored document by its id
        sugword.string = doc.get(F_WORD); // get the original word
        if (sugword.string.equals(word)) {
            continue; // don't suggest a word for itself, that would be silly
        }

        // Edit distance, normalized by the minimum word length.
        sugword.score = doc.getBoost()
                * (1.0f - ((float) sd.getDistance(sugword.string) / Math.min(sugword.string.length(), lengthWord)));
        if (sugword.score < minScore) {
            continue;
        }
        if (ir != null) { // use the user index
            sugword.freq = ir.docFreq(new Term(field, sugword.string)); // freq in the index
            if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) {
                // Don't suggest a word that is absent from the field, or one that is
                // less popular than the searched word when morePopular was requested.
                continue;
            }
        }
        sugqueue.insert(sugword);
        if (sugqueue.size() == num_sug) {
            // If the queue is full, raise the bar to the minimum score in the queue.
            minScore = ((SuggestWord) sugqueue.top()).score;
        }
        sugword = new SuggestWord();
    }

    // Convert the queue to an array of strings.
    String[] list = new String[sugqueue.size()];
    for (int i = sugqueue.size() - 1; i >= 0; i--) {
        list[i] = ((SuggestWord) sugqueue.pop()).string;
    }
    searcher.close();
    return list;
}
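The retrieval step above relies on the spell index holding one document per dictionary word: the word itself in F_WORD, its n-grams in the gramN fields, and the first and last n-grams duplicated into startN and endN fields for optional prefix/suffix boosting. Candidate corrections are fetched with a BooleanQuery over the n-grams of the misspelled word and re-ranked by edit distance, while IndexReader.docFreq against the user index supplies the popularity filter.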
From source file:com.greplin.lucene.filter.PhraseFilter.java
License:Apache License
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
    PhraseFilterMatchList[] results = new PhraseFilterMatchList[subReaders.size()];
    int matchCount = 0;
    int readerNumber = 0;

    for (IndexReader subReader : subReaders) {
        SortedSet<TermWithFrequency> termsOrderedByFrequency = Sets.newTreeSet();
        for (int i = 0; i < this.terms.length; i++) {
            Term t = this.terms[i];
            termsOrderedByFrequency.add(new TermWithFrequency(t, subReader.docFreq(t), i));
        }

        PhraseFilterMatchList matches = null;
        TermPositions termPositions = subReader.termPositions();
        try {
            for (TermWithFrequency term : termsOrderedByFrequency) {
                if (term.docFreq == 0) {
                    break;
                }
                termPositions.seek(term.term);

                if (matches == null) {
                    // If this is the first term, collect all matches that intersect
                    // with the provided initial document set.
                    Intersection intersection = this.intersectionProvider.get(reader);
                    matches = new PhraseFilterMatchList(term.docFreq);
                    while (intersection.advanceToNextIntersection(termPositions)) {
                        int freq = termPositions.freq();
                        PhraseFilterIntList list = new PhraseFilterIntList(freq);
                        for (int i = 0; i < freq; i++) {
                            list.add(termPositions.nextPosition() - term.offset);
                        }
                        matches.add(termPositions.doc(), list);
                    }
                } else {
                    // Otherwise, intersect with the existing matches.
                    matches.intersect(termPositions, term.offset);
                }

                if (matches.getCount() == 0) {
                    break;
                }
            }
        } finally {
            termPositions.close();
        }

        if (matches != null) {
            results[readerNumber] = matches;
            matchCount += matches.getCount();
        }
        readerNumber++;
    }

    final int bitsPerIntPowerLogTwo = 5; // 2^5 = 32
    if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
        FixedBitSet result = new FixedBitSet(reader.maxDoc());
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result.set(docIds[i] + readerOffset);
                }
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return result;
    } else if (matchCount == 0) {
        return DocIdSets.EMPTY;
    } else {
        int[] result = new int[matchCount];
        int base = 0;
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result[base + i] = docIds[i] + readerOffset;
                }
                base += count;
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return new SortedIntArrayDocIdSet(result);
    }
}
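A design point worth noting above: within each subreader the terms are visited in ascending docFreq order, so the rarest term drives the first intersection. If any term has a document frequency of zero, or the running match list becomes empty, the loop exits early without reading the postings of the more frequent terms. The final match count then decides whether the result is returned as a FixedBitSet or as a more compact sorted int array.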
From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java
License:Open Source License
public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) {
    try {
        this.indexSearcher = indexSearcher;
        this.jochreQuery = jochreQuery;
        query = rewrite(jochreQuery.getLuceneQuery());
        queryTerms = new TreeSet<Term>();
        query.extractTerms(queryTerms);
        if (LOG.isTraceEnabled())
            queryTermList = new ArrayList<Term>(queryTerms);

        final IndexReader reader = indexSearcher.getIndexReader();
        // Add 1 to the doc count to ensure that even terms contained in all docs get a very small weight.
        docCountLog = Math.log(reader.numDocs() + 1);
        IndexReaderContext readerContext = reader.getContext();
        leaves = readerContext.leaves();

        // Since the same terms might be contained in the query multiple times (e.g. once per field),
        // we only consider each of them once by using a HashSet.
        terms = new HashSet<BytesRef>();
        Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>();
        for (Term term : queryTerms) {
            terms.add(term.bytes());
            termFreqs.put(term.bytes(), 0);
        }

        termLogs = new HashMap<BytesRef, Double>();
        for (Term term : queryTerms) {
            int freq = termFreqs.get(term.bytes());
            freq += reader.docFreq(term);
            termFreqs.put(term.bytes(), freq);
        }
        for (BytesRef term : terms) {
            int freq = termFreqs.get(term);
            termLogs.put(term, Math.log(freq));
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text,
        PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException {
    OptionQueue collectionQueue = new OptionQueue(size);
    BytesRef ref;
    while ((ref = prefixTermsEnum.next()) != null) {
        Term term = new Term(text.field(), BytesRef.deepCopyOf(ref));
        collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option(
                new StringText(term.bytes().utf8ToString()), indexReader.docFreq(term)));
    }
    Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry =
            new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>(
                    new StringText(text.text()), 0, text.bytes().length);
    while (collectionQueue.size() > 0) {
        entry.addOption(collectionQueue.pop());
    }
    return entry;
}
From source file:com.mothsoft.alexis.engine.textual.TFIDFCalculatorImpl.java
License:Apache License
@SuppressWarnings("unchecked")
@Transactional
public void execute() {
    final long start = System.currentTimeMillis();

    final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate());
    final SearchFactory searchFactory = fullTextSession.getSearchFactory();
    final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor();
    final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class);

    final Query query = em.createQuery(
            "select d from Document d join d.documentTerms dt where dt.tfIdf IS NULL ORDER BY d.id ASC");
    final List<Document> documents = query.getResultList();

    final Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME);
    int affectedRows = 0;

    try {
        for (final Document document : documents) {
            final Map<String, Float> termTfIdfMap = new HashMap<String, Float>();

            // Calculate term TF-IDFs.
            for (final DocumentTerm documentTerm : document.getDocumentTerms()) {
                final Term term = luceneTerm.createTerm(documentTerm.getTerm().getValueLowercase());
                Float score = TFIDF.score(documentTerm.getTerm().getValueLowercase(), documentTerm.getCount(),
                        document.getTermCount(), reader.numDocs(), reader.docFreq(term));
                documentTerm.setTfIdf(score);
                termTfIdfMap.put(documentTerm.getTerm().getValueLowercase(), score);
                affectedRows++;
            }

            // Update association weights.
            for (final DocumentAssociation documentAssociation : document.getDocumentAssociations()) {
                final String a = documentAssociation.getA().getValueLowercase();
                final String b = documentAssociation.getB().getValueLowercase();
                documentAssociation.setAssociationWeight((float) documentAssociation.getAssociationCount()
                        * (termTfIdfMap.get(a) + termTfIdfMap.get(b)));
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        ira.close(reader);
    }

    logger.info("TF-IDF calc took: " + ((System.currentTimeMillis() - start) / 1000.00)
            + " seconds and affected " + affectedRows + " rows.");
}
From source file:com.nearinfinity.blur.manager.IndexManager.java
License:Apache License
public static long recordFrequency(IndexReader reader, String columnFamily, String columnName, String value)
        throws IOException {
    return reader.docFreq(getTerm(columnFamily, columnName, value));
}
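A hypothetical call site for the helper above (the reader and the column family, column name, and value are invented for illustration):

    long freq = IndexManager.recordFrequency(reader, "person", "name", "smith");

This reduces a Blur record-frequency lookup to a single docFreq call on the term produced by getTerm.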
From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java
License:Apache License
/**
 * A constructor.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // Just return to make a null snippet if an unmatched fieldName is specified when fieldMatch == true.
    if (termSet == null)
        return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();

    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }

        dpEnum.nextDoc();

        // For the weight, see:
        // http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();
        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();
            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }
            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }
    }

    // Sort by position.
    Collections.sort(termList);
}
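The weight assigned to each matched term above is a textbook idf, log(numDocs / (df + 1)) + 1, with df taken from IndexReader.docFreq over the whole index; the +1 in the denominator guards against a zero docFreq, and rarer terms end up highlighted with higher weight.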
From source file:com.redhat.satellite.search.index.IndexManager.java
License:Open Source License
/**
 * @param indexName name of the index
 * @param doc document with data to index
 * @param uniqueField field in doc which identifies this uniquely
 * @param lang language
 * @throws IndexingException
 */
public void addUniqueToIndex(String indexName, Document doc, String uniqueField, String lang)
        throws IndexingException {
    IndexReader reader = null;
    int numFound = 0;
    try {
        reader = getIndexReader(indexName, lang);
        Term term = new Term(uniqueField, doc.get(uniqueField));
        numFound = reader.docFreq(term);
    } catch (FileNotFoundException e) {
        // Index doesn't exist, so this add will be unique;
        // we don't need to do anything.
    } catch (IOException e) {
        throw new IndexingException(e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                // ignore
            }
        }
    }
    if (numFound > 0) {
        log.info("Found " + numFound + " <" + indexName + " docs for " + uniqueField + ":"
                + doc.get(uniqueField) + " will remove them now.");
        removeFromIndex(indexName, uniqueField, doc.get(uniqueField));
    }
    addToIndex(indexName, doc, lang);
}
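Here docFreq doubles as a cheap existence check: a non-zero document frequency for the unique field means a document with the same key is already indexed, so it is removed before the new one is added.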