Example usage for org.apache.lucene.index IndexReader docFreq

List of usage examples for org.apache.lucene.index IndexReader docFreq

Introduction

This page lists example usages of org.apache.lucene.index IndexReader docFreq, drawn from open source projects.

Prototype

public abstract int docFreq(Term term) throws IOException;

Document

Returns the number of documents containing the term.
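
A minimal sketch of calling docFreq directly (assuming the Lucene 3.x API used by several examples below; the index path, field name, and term value are placeholders):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws IOException {
        // Open a reader over an existing index (hypothetical location).
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            // Count the documents whose "contents" field contains the term "lucene".
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq(contents:lucene) = " + df);
        } finally {
            reader.close();
        }
    }
}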

Usage

From source file:ca.ualberta.entitylinking.common.indexing.DocumentIndexer.java

License:Open Source License
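
This example opens an index, maps each document's "name" field to its document id, and then uses docFreq to print a TF-IDF score for every term in the chosen document's "contents" field.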

public void readLuceneIndex(String indexDir, String docName) {
    IndexReader reader = null;
    Map<String, Integer> name2id = null;

    //load index
    try {
        reader = IndexReader.open(FSDirectory.open(new File(indexDir)));

        String[] stringArray = FieldCache.DEFAULT.getStrings(reader, "name");

        // build a map from string to its document id.
        name2id = new HashMap<String, Integer>();
        for (int i = 0; i < stringArray.length; i++)
            name2id.put(stringArray[i], i);
    } catch (IOException e) {
        // If the index cannot be opened, there is nothing to score; bail out.
        e.printStackTrace();
        return;
    }

    //get tf-idf vector of a document.
    DefaultSimilarity simObj = new DefaultSimilarity();

    try {
        if (!name2id.containsKey(docName))
            return;

        int docId = name2id.get(docName);
        Document doc = reader.document(docId);

        TermFreqVector termVector = reader.getTermFreqVector(docId, "contents");
        int numDocs = reader.numDocs();

        int[] termFreq = termVector.getTermFrequencies();
        String[] terms = termVector.getTerms();
        for (int i = 0; i < terms.length; i++) {
            //avoid stop words
            //            if (isStopWord(terms[i]))
            //               continue;

            int tf = termFreq[i];
            int df = reader.docFreq(new Term("contents", terms[i]));
            float tfidf = simObj.tf(tf) * simObj.idf(df, numDocs);
            System.out.println(terms[i] + ": " + tfidf);
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.bdaum.zoom.lal.internal.lucene.Lucene.java

License:Open Source License
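
This example enumerates every term of a full-text field and uses docFreq to score each one, returning at most maxItems top-ranked tags.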

public List<ScoredString> listTags(File indexPath, int maxItems) throws IOException {
    Object readerToken = null;
    try {
        readerToken = indexPath == null ? null : getIndexReaderToken(indexPath);
        if (readerToken != null) {
            IndexReader indexReader = readerMap.get(readerToken);
            if (indexReader != null) {
                List<ScoredString> result = new ArrayList<ScoredString>(1000);
                Terms terms = MultiFields.getTerms(indexReader, LireActivator.FIELD_NAME_FULL_TEXT);
                if (terms == null)
                    return null;
                TermsEnum termEnum = terms.iterator();
                BytesRef bytesRef;
                while ((bytesRef = termEnum.next()) != null)
                    result.add(new ScoredString(bytesRef.utf8ToString(),
                            indexReader.docFreq(new Term(LireActivator.FIELD_NAME_FULL_TEXT, bytesRef))));
                Collections.sort(result);
                return (result.size() > maxItems) ? result.subList(0, maxItems) : result;
            }
        }
        return null;
    } finally {
        if (readerToken != null)
            releaseIndexReader(indexPath, readerToken);
    }
}

From source file:com.flaptor.hounder.searcher.spell.SpellChecker.java

License:Apache License
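
This example ranks spelling suggestions by edit distance and, through docFreq, by how often each candidate occurs in a user index.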

/**
 * Suggest similar words (optionally restricted to a field of a user index)
 * @param word String the word you want a spell check done on
 * @param num_sug int the number of suggested words
 * @param ir the IndexReader of the user index (can be null; see the field param)
 * @param field String the field of the user index: if field is not null, the suggested
 * words are restricted to the words present in this field.
 * @param morePopular boolean return only suggested words that are more frequent than the searched word
 * (only in restricted mode, i.e. indexReader != null and field != null)
 * @throws IOException
 * @return String[] the list of suggested words, sorted by two criteria:
 * first, the edit distance; second (only in restricted mode), the popularity
 * of the suggested word in the field of the user index
 */
public String[] suggestSimilar(String word, int num_sug, IndexReader ir, String field, boolean morePopular)
        throws IOException {

    float minScore = min;
    final TRStringDistance sd = new TRStringDistance(word);
    final int lengthWord = word.length();

    // document frequency of the word itself (0 unless a user index and field are given)
    final int goalFreq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
    if (!morePopular && goalFreq > 0) {
        return new String[] { word }; // the word exists in the index and we don't want a more popular word
    }

    BooleanQuery query = new BooleanQuery();
    String[] grams;
    String key;

    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {

        key = "gram" + ng; // form key

        grams = formGrams(word, ng); // form word into ngrams (allow dups too)

        if (grams.length == 0) {
            continue; // the word is too short to form ngrams of this size
        }
        if (bStart > 0) { // should we boost prefixes?
            add(query, "start" + ng, grams[0], bStart); // matches start of word

        }
        if (bEnd > 0) { // should we boost suffixes
            add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word

        }
        for (int i = 0; i < grams.length; i++) {
            add(query, key, grams[i]);
        }

    }

    IndexSearcher searcher = new IndexSearcher(this.spellindex);
    TopDocCollector collector = new TopDocCollector(10 * num_sug); // go through more matches than requested in case the distance filter triggers
    searcher.search(query, collector);
    ScoreDoc[] scoreDocs = collector.topDocs().scoreDocs;

    SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);
    SuggestWord sugword = new SuggestWord();
    for (int i = 0; i < scoreDocs.length; i++) {

        Document doc = searcher.doc(scoreDocs[i].doc);
        sugword.string = doc.get(F_WORD); // get the original word

        if (sugword.string.equals(word)) {
            continue; // don't suggest a word for itself, that would be silly
        }

        //edit distance/normalize with the min word length
        sugword.score = doc.getBoost() * (1.0f
                - ((float) sd.getDistance(sugword.string) / Math.min(sugword.string.length(), lengthWord)));
        if (sugword.score < minScore) {
            continue;
        }
        if (ir != null) { // use the user index
            sugword.freq = ir.docFreq(new Term(field, sugword.string)); // freq in the index
            if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) { // don't suggest a word that is not present in the field
                continue;
            }
        }
        sugqueue.insert(sugword);
        if (sugqueue.size() == num_sug) {
            // if the queue is full, maintain the min score
            minScore = ((SuggestWord) sugqueue.top()).score;
        }
        sugword = new SuggestWord();
    }

    // convert to a string array
    String[] list = new String[sugqueue.size()];
    for (int i = sugqueue.size() - 1; i >= 0; i--) {
        list[i] = ((SuggestWord) sugqueue.pop()).string;
    }

    searcher.close();
    return list;
}

From source file:com.greplin.lucene.filter.PhraseFilter.java

License:Apache License
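
This example builds a phrase filter, looking up each term's docFreq per sub-reader so the terms can be intersected in order of document frequency.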

@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
    PhraseFilterMatchList[] results = new PhraseFilterMatchList[subReaders.size()];
    int matchCount = 0;
    int readerNumber = 0;

    for (IndexReader subReader : subReaders) {
        SortedSet<TermWithFrequency> termsOrderedByFrequency = Sets.newTreeSet();
        for (int i = 0; i < this.terms.length; i++) {
            Term t = this.terms[i];
            termsOrderedByFrequency.add(new TermWithFrequency(t, subReader.docFreq(t), i));
        }

        PhraseFilterMatchList matches = null;
        TermPositions termPositions = subReader.termPositions();
        try {
            for (TermWithFrequency term : termsOrderedByFrequency) {
                if (term.docFreq == 0) {
                    break;
                }

                termPositions.seek(term.term);

                if (matches == null) {
                    // If this is the first term, collect all matches that intersect
                    // with the provided initial document set.
                    Intersection intersection = this.intersectionProvider.get(reader);

                    matches = new PhraseFilterMatchList(term.docFreq);
                    while (intersection.advanceToNextIntersection(termPositions)) {
                        int freq = termPositions.freq();
                        PhraseFilterIntList list = new PhraseFilterIntList(freq);
                        for (int i = 0; i < freq; i++) {
                            list.add(termPositions.nextPosition() - term.offset);
                        }
                        matches.add(termPositions.doc(), list);
                    }
                } else {
                    // Otherwise, intersect with the existing matches.
                    matches.intersect(termPositions, term.offset);
                }

                if (matches.getCount() == 0) {
                    break;
                }
            }
        } finally {
            termPositions.close();
        }

        if (matches != null) {
            results[readerNumber] = matches;
            matchCount += matches.getCount();
        }
        readerNumber++;
    }

    final int bitsPerIntPowerLogTwo = 5; // 2^5 = 32
    if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
        FixedBitSet result = new FixedBitSet(reader.maxDoc());
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result.set(docIds[i] + readerOffset);
                }
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return result;
    } else if (matchCount == 0) {
        return DocIdSets.EMPTY;
    } else {
        int[] result = new int[matchCount];
        int base = 0;
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result[base + i] = docIds[i] + readerOffset;
                }
                base += count;
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return new SortedIntArrayDocIdSet(result);
    }
}

From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java

License:Open Source License
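
This example precomputes a log document-frequency weight for every term of the rewritten query, accumulating reader.docFreq per term before highlighting.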

public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) {
    try {
        this.indexSearcher = indexSearcher;
        this.jochreQuery = jochreQuery;
        query = rewrite(jochreQuery.getLuceneQuery());
        queryTerms = new TreeSet<Term>();
        query.extractTerms(queryTerms);
        if (LOG.isTraceEnabled())
            queryTermList = new ArrayList<Term>(queryTerms);

        final IndexReader reader = indexSearcher.getIndexReader();
        // add 1 to doc count to ensure even terms in all docs get a very small weight
        docCountLog = Math.log(reader.numDocs() + 1);

        IndexReaderContext readerContext = reader.getContext();
        leaves = readerContext.leaves();

        // since the same terms might be contained in the query multiple times (e.g. once per field)
        // we only consider them once each by using a HashSet
        terms = new HashSet<BytesRef>();
        Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>();
        for (Term term : queryTerms) {
            terms.add(term.bytes());
            termFreqs.put(term.bytes(), 0);
        }

        termLogs = new HashMap<BytesRef, Double>();
        for (Term term : queryTerms) {
            int freq = termFreqs.get(term.bytes());
            freq += reader.docFreq(term);
            termFreqs.put(term.bytes(), freq);
        }
        for (BytesRef term : terms) {
            int freq = termFreqs.get(term);
            termLogs.put(term, Math.log(freq));
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}

From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java

License:Apache License
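
This example enumerates all terms matching a prefix and uses each term's docFreq as the score of the corresponding suggestion option.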

private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text,
        PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException {
    OptionQueue collectionQueue = new OptionQueue(size);
    BytesRef ref;
    while ((ref = prefixTermsEnum.next()) != null) {
        Term term = new Term(text.field(), BytesRef.deepCopyOf(ref));
        collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option(
                new StringText(term.bytes().utf8ToString()), indexReader.docFreq(term)));
    }

    Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry = new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>(
            new StringText(text.text()), 0, text.bytes().length);
    while (collectionQueue.size() > 0) {
        entry.addOption(collectionQueue.pop());
    }
    return entry;
}

From source file:com.mothsoft.alexis.engine.textual.TFIDFCalculatorImpl.java

License:Apache License
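
This example computes and persists TF-IDF scores for each document's terms, feeding reader.docFreq and reader.numDocs into the scoring formula, and then derives association weights from those scores.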

@SuppressWarnings("unchecked")
@Transactional
public void execute() {
    final long start = System.currentTimeMillis();

    final FullTextSession fullTextSession = Search.getFullTextSession((Session) this.em.getDelegate());
    final SearchFactory searchFactory = fullTextSession.getSearchFactory();
    final IndexReaderAccessor ira = searchFactory.getIndexReaderAccessor();
    final IndexReader reader = ira.open(com.mothsoft.alexis.domain.Document.class);

    final Query query = em.createQuery(
            "select d from Document d join d.documentTerms dt where dt.tfIdf IS NULL ORDER BY d.id ASC");
    final List<Document> documents = query.getResultList();

    final Term luceneTerm = new Term(CONTENT_TEXT_FIELD_NAME);
    int affectedRows = 0;

    try {
        for (final Document document : documents) {
            final Map<String, Float> termTfIdfMap = new HashMap<String, Float>();

            // calculate term TF-IDFs
            for (final DocumentTerm documentTerm : document.getDocumentTerms()) {
                final Term term = luceneTerm.createTerm(documentTerm.getTerm().getValueLowercase());
                Float score = TFIDF.score(documentTerm.getTerm().getValueLowercase(), documentTerm.getCount(),
                        document.getTermCount(), reader.numDocs(), reader.docFreq(term));
                documentTerm.setTfIdf(score);
                termTfIdfMap.put(documentTerm.getTerm().getValueLowercase(), score);
                affectedRows++;
            }

            // update association weights
            for (final DocumentAssociation documentAssociation : document.getDocumentAssociations()) {
                final String a = documentAssociation.getA().getValueLowercase();
                final String b = documentAssociation.getB().getValueLowercase();
                documentAssociation.setAssociationWeight((float) documentAssociation.getAssociationCount()
                        * (termTfIdfMap.get(a) + termTfIdfMap.get(b)));
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    } finally {
        ira.close(reader);
    }

    logger.info("TF-IDF calc took: " + ((System.currentTimeMillis() - start) / 1000.00)
            + " seconds and affected " + affectedRows + " rows.");
}

From source file:com.nearinfinity.blur.manager.IndexManager.java

License:Apache License
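
This example reports how many records contain a given column value by delegating directly to docFreq.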

public static long recordFrequency(IndexReader reader, String columnFamily, String columnName, String value)
        throws IOException {
    return reader.docFreq(getTerm(columnFamily, columnName, value));
}

From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java

License:Apache License
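
This example builds the term stack used for highlighting, deriving an IDF-style weight for each term-vector entry from docFreq.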

/**
 * a constructor.
 * 
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
        return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();
    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }

        dpEnum.nextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();

        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();
            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }

            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }

    }

    // sort by position
    Collections.sort(termList);
}

From source file:com.redhat.satellite.search.index.IndexManager.java

License:Open Source License
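
This example uses docFreq to detect documents that already contain the unique field's value, removing them before adding the new document to the index.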

/**
 * @param indexName
 * @param doc document with data to index
 * @param uniqueField field in doc which identifies this uniquely
 * @param lang language
 * @throws IndexingException
 */
public void addUniqueToIndex(String indexName, Document doc, String uniqueField, String lang)
        throws IndexingException {
    IndexReader reader = null;
    int numFound = 0;
    try {
        reader = getIndexReader(indexName, lang);
        Term term = new Term(uniqueField, doc.get(uniqueField));
        numFound = reader.docFreq(term);
    } catch (FileNotFoundException e) {
        // Index doesn't exist, so this add will be unique;
        // we don't need to do anything.
    } catch (IOException e) {
        throw new IndexingException(e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                //
            }
        }
    }
    if (numFound > 0) {
        log.info("Found " + numFound + " <" + indexName + "> docs for " + uniqueField + ":"
                + doc.get(uniqueField) + "; will remove them now.");
        removeFromIndex(indexName, uniqueField, doc.get(uniqueField));
    }
    addToIndex(indexName, doc, lang);
}