Example usage for org.apache.lucene.index IndexReader docFreq

List of usage examples for org.apache.lucene.index IndexReader docFreq

Introduction

This page lists example usages of org.apache.lucene.index IndexReader docFreq.

Prototype

public abstract int docFreq(Term term) throws IOException;

Document

Returns the number of documents containing the term.
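
A minimal, self-contained sketch of the call (hypothetical: the index path and the field/term names are illustrative; the reader-opening style matches the Lucene 3.x examples below):

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index (path is hypothetical).
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            // Count the documents whose "contents" field contains the term "lucene".
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq(contents:lucene) = " + df);
        } finally {
            reader.close();
        }
    }
}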

Usage

From source file:org.apache.solr.spelling.WordBreakSolrSpellChecker.java

License:Apache License

@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    IndexReader ir = options.reader;
    int numSuggestions = options.count;

    StringBuilder sb = new StringBuilder();
    Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
    List<Term> termArr = new ArrayList<Term>(options.tokens.size() + 2);

    List<ResultEntry> breakSuggestionList = new ArrayList<ResultEntry>();
    boolean lastOneProhibited = false;
    boolean lastOneRequired = false;
    boolean lastOneprocedesNewBooleanOp = false;
    for (int i = 0; i < tokenArr.length; i++) {
        boolean prohibited = (tokenArr[i].getFlags()
                & QueryConverter.PROHIBITED_TERM_FLAG) == QueryConverter.PROHIBITED_TERM_FLAG;
        boolean required = (tokenArr[i].getFlags()
                & QueryConverter.REQUIRED_TERM_FLAG) == QueryConverter.REQUIRED_TERM_FLAG;
        boolean procedesNewBooleanOp = (tokenArr[i].getFlags()
                & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) == QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
        if (i > 0 && (prohibited != lastOneProhibited || required != lastOneRequired
                || lastOneprocedesNewBooleanOp)) {
            termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
        }
        lastOneProhibited = prohibited;
        lastOneRequired = required;
        lastOneprocedesNewBooleanOp = procedesNewBooleanOp;

        Term thisTerm = new Term(field, tokenArr[i].toString());
        termArr.add(thisTerm);
        if (breakWords) {
            SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir,
                    options.suggestMode, sortMethod);
            for (SuggestWord[] breakSuggestion : breakSuggestions) {
                sb.delete(0, sb.length());
                boolean firstOne = true;
                int freq = 0;
                for (SuggestWord word : breakSuggestion) {
                    if (!firstOne) {
                        sb.append(" ");
                    }
                    firstOne = false;
                    sb.append(word.string);
                    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
                        freq = Math.max(freq, word.freq);
                    } else {
                        freq += word.freq;
                    }
                }
                breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(), freq));
            }
        }
    }
    List<ResultEntry> combineSuggestionList = Collections.emptyList();
    CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(
            termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
    if (combineWords) {
        combineSuggestionList = new ArrayList<ResultEntry>(combineSuggestions.length);
        for (CombineSuggestion cs : combineSuggestions) {
            int firstTermIndex = cs.originalTermIndexes[0];
            int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
            sb.delete(0, sb.length());
            for (int i = firstTermIndex; i <= lastTermIndex; i++) {
                if (i > firstTermIndex) {
                    sb.append(" ");
                }
                sb.append(tokenArr[i].toString());
            }
            Token token = new Token(sb.toString(), tokenArr[firstTermIndex].startOffset(),
                    tokenArr[lastTermIndex].endOffset());
            combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq));
        }
    }

    // Interleave the two lists of suggestions into one SpellingResult
    SpellingResult result = new SpellingResult();
    Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
    Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
    ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
    ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
    int breakCount = 0;
    int combineCount = 0;
    while (lastBreak != null || lastCombine != null) {
        if (lastBreak == null) {
            result.add(lastCombine.token, lastCombine.suggestion, lastCombine.freq);
            result.addFrequency(lastCombine.token, getCombineFrequency(ir, lastCombine.token));
            lastCombine = null;
        } else if (lastCombine == null) {
            result.add(lastBreak.token, lastBreak.suggestion, lastBreak.freq);
            result.addFrequency(lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())));
            lastBreak = null;
        } else if (lastBreak.freq < lastCombine.freq) {
            result.add(lastCombine.token, lastCombine.suggestion, lastCombine.freq);
            result.addFrequency(lastCombine.token, getCombineFrequency(ir, lastCombine.token));
            lastCombine = null;
        } else if (lastCombine.freq < lastBreak.freq) {
            result.add(lastBreak.token, lastBreak.suggestion, lastBreak.freq);
            result.addFrequency(lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())));
            lastBreak = null;
        } else if (breakCount >= combineCount) {
            result.add(lastCombine.token, lastCombine.suggestion, lastCombine.freq);
            result.addFrequency(lastCombine.token, getCombineFrequency(ir, lastCombine.token));
            lastCombine = null;
        } else {
            result.add(lastBreak.token, lastBreak.suggestion, lastBreak.freq);
            result.addFrequency(lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())));
            lastBreak = null;
        }
        if (result.getSuggestions().size() > numSuggestions) {
            break;
        }
        if (lastBreak == null && breakIter.hasNext()) {
            lastBreak = breakIter.next();
            breakCount++;
        }
        if (lastCombine == null && combineIter.hasNext()) {
            lastCombine = combineIter.next();
            combineCount++;
        }
    }
    return result;
}

From source file:org.apache.solr.spelling.WordBreakSolrSpellChecker.java

License:Apache License

private int getCombineFrequency(IndexReader ir, Token token) throws IOException {
    String[] words = spacePattern.split(token.toString());
    int result = 0;
    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
        for (String word : words) {
            result = Math.max(result, ir.docFreq(new Term(field, word)));
        }
    } else {
        for (String word : words) {
            result += ir.docFreq(new Term(field, word));
        }
    }
    return result;
}

From source file:org.codesearch.searcher.server.util.STAutocompleter.java

License:Open Source License

public void setupIndex(Directory sourceDirectory, String fieldToAutocomplete)
        throws CorruptIndexException, IOException {
    IndexReader sourceReader = IndexReader.open(sourceDirectory);
    LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);
    IndexWriter writer = new IndexWriter(autoCompleteDirectory, new STAutocompleteLuceneAnalyzer(),
            IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setMergeFactor(300);
    writer.setMaxBufferedDocs(150);
    Map<String, Integer> wordsMap = new HashMap<String, Integer>();
    Iterator<String> iter = dict.getWordsIterator();
    while (iter.hasNext()) {
        String word = iter.next();
        if (word.length() < 3) { // skip very short words
            continue;
        }
        wordsMap.put(word, sourceReader.docFreq(new Term(fieldToAutocomplete, word)));
    }
    LOG.info("SetupIndex: " + GRAMMED_WORDS_FIELD);
    for (String word : wordsMap.keySet()) {
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.NOT_ANALYZED));
        LOG.info("source:" + word);
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.ANALYZED));
        LOG.info("grammed:" + word);
        writer.addDocument(doc);
    }
    sourceReader.close();
    writer.optimize();
    writer.close();
    setupReader();
}

From source file:org.dbpedia.spotlight.lucene.search.BaseSearcher.java

License:Apache License

/**
 * Computes a document frequency map for the given index reader.
 * @param mReader the IndexReader to read document frequencies from
 * @return a list of (term, document frequency) entries, sorted by frequency descending.
 * @throws IOException if one is thrown.
 * @author sujitpal (computeTopTermQuery in http://sujitpal.blogspot.com/2009/02/summarization-with-lucene.html)
 * @author pablomendes adapted from sujitpal
 */
public static List<Map.Entry<Term, Integer>> getTopTerms(IndexReader mReader) throws IOException {

    final Map<Term, Integer> frequencyMap = new HashMap<Term, Integer>();

    TermEnum terms = mReader.terms(); //TODO check what can we do about fields here. should have only top terms for context field?
    while (terms.next()) {
        Term term = terms.term();
        int frequency = mReader.docFreq(term); // DF
        frequencyMap.put(term, frequency);
    }

    // sort the term map by frequency descending
    Ordering descOrder = new Ordering<Map.Entry<Term, Integer>>() {
        public int compare(Map.Entry<Term, Integer> left, Map.Entry<Term, Integer> right) {
            return Ints.compare(right.getValue(), left.getValue());
        }
    };
    List<Map.Entry<Term, Integer>> sorted = descOrder.sortedCopy(frequencyMap.entrySet());

    return sorted;
}
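
A short usage sketch for the method above (hypothetical: it assumes an IndexReader named reader that is already open; limiting the output to ten entries is illustrative):

List<Map.Entry<Term, Integer>> topTerms = BaseSearcher.getTopTerms(reader);
for (Map.Entry<Term, Integer> entry : topTerms.subList(0, Math.min(10, topTerms.size()))) {
    // entry.getValue() is the document frequency obtained via IndexReader.docFreq.
    System.out.println(entry.getKey().field() + ":" + entry.getKey().text() + " df=" + entry.getValue());
}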

From source file:org.deals.lucene.highlight.QueryTermExtractor.java

License:Apache License

/**
 * Extracts all terms texts of a given Query into an array of WeightedTerms
 *
 * @param query      Query to extract term texts from
 * @param reader used to compute IDF, which can be used to a) score selected fragments better
 * b) use graded highlights, e.g. changing the intensity of the font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName) {
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.numDocs();
    for (int i = 0; i < terms.length; i++) {
        try {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            //IDF algorithm taken from DefaultSimilarity class
            float idf = (float) (Math.log((float) totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        } catch (IOException e) {
            //ignore 
        }
    }
    return terms;
}
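
A short usage sketch for the method above (hypothetical: it assumes an already-parsed Query named query, an open IndexReader named reader, a field named "contents", and that this copied WeightedTerm keeps Lucene's getTerm()/getWeight() accessors):

WeightedTerm[] weighted = QueryTermExtractor.getIdfWeightedTerms(query, reader, "contents");
for (WeightedTerm wt : weighted) {
    // Each term's weight has already been multiplied by its IDF in getIdfWeightedTerms.
    System.out.println(wt.getTerm() + " weight=" + wt.getWeight());
}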

From source file:org.eclipse.smila.search.lucene.index.access.ExistsOperation.java

License:Open Source License

/**
 * {@inheritDoc}
 * 
 * @see org.eclipse.smila.search.lucene.index.access.ISynchronizedOperation#process(java.lang.Object)
 */
public Boolean process(final IndexReader object) throws IndexException {
    boolean exists;
    try {
        exists = (object.docFreq(_term) > 0);
    } catch (final Exception e) {
        throw new IndexException("Unable to check wether document exists by term [" + _term.text() + "]", e);
    }
    return exists;
}

From source file:org.karsha.base.DocIndexer.java

License:Open Source License

/**
 * This method calculates the TF-IDF score for each term in the indexed
 * documents.
 *
 * @param numberOfDocs the number of indexed documents to process
 * @return a HashMap of per-term TF-IDF scores, keyed by document id
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScore(int numberOfDocs) throws CorruptIndexException, ParseException {

    int noOfDocs = docNames.length;

    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
    //HashMap<Integer, float[]> scoreMap = new HashMap<Integer, float[]>();

    try {

        //IndexReader re = IndexReader.open(FSDirectory.open(new File(pathToIndex)), true) ;
        IndexReader re = IndexReader.open(ramMemDir);

        int i = 0;
        for (int k = 0; k < numberOfDocs; k++) {
            int freq[];
            TermFreqVector termsFreq;
            TermFreqVector termsFreqDocId;
            //TermFreqVector termsFreq3[];
            HashMap<String, Float> wordMap = new HashMap<String, Float>();
            String terms[];
            float score[] = null;

            //termsFreq3=re.getTermFreqVectors(currentDocID);
            termsFreq = re.getTermFreqVector(k, "doccontent");
            termsFreqDocId = re.getTermFreqVector(k, "docid");

            int aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
            freq = termsFreq.getTermFrequencies();

            terms = termsFreq.getTerms();

            int noOfTerms = terms.length;
            score = new float[noOfTerms];
            DefaultSimilarity simi = new DefaultSimilarity();
            for (i = 0; i < noOfTerms; i++) {
                int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));
                // System.out.println(terms[i]+"\t"+freq[i]);
                //int noofDocsContainTerm = docsContainTerm(terms[i], "docnames");
                float tf = simi.tf(freq[i]);
                float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                wordMap.put(terms[i], (tf * idf));

            }
            scoreMap.put(aInt, wordMap);
        }

    } catch (IOException e) {
        // score = null;
        e.printStackTrace();
    }

    //Map<Integer,Float[]> scoreMap=new Map<Integer, Float[]>(); 

    return scoreMap;
}

From source file:org.karsha.base.DocIndexer.java

License:Open Source License

/**
 * This method calculates the TF-IDF score for each term in the indexed
 * documents, including marked-up Taxonomy and FIBO terms.
 *
 * @param numberOfDocs the number of indexed documents to process
 * @param weight a higher weight can be given to FIBO and Taxonomy terms
 * to prefer them
 * @return a HashMap of per-term TF-IDF scores, keyed by document id
 * @throws CorruptIndexException
 * @throws ParseException
 */
public HashMap<Integer, HashMap> tfIdfScoreWithMarkUpTerms(int numberOfDocs, int weight)
        throws CorruptIndexException, ParseException {

    int noOfDocs = docNames.length;

    HashMap<Integer, HashMap> scoreMap = new HashMap<Integer, HashMap>();
    //HashMap<Integer, float[]> scoreMap = new HashMap<Integer, float[]>();

    try {

        //IndexReader re = IndexReader.open(FSDirectory.open(new File(pathToIndex)), true) ;
        IndexReader re = IndexReader.open(ramMemDir);

        int i = 0;
        for (int k = 0; k < numberOfDocs; k++) {
            int aInt = 0;
            //TermFreqVector termsFreqVec[];
            TermFreqVector termsFreq;
            TermFreqVector termsFreqDocId = null;
            TermFreqVector termsFreqFiboTerm;
            TermFreqVector termsFreqTaxoTerm;
            HashMap<String, Float> wordMap = new HashMap<String, Float>();
            String termsVec[][];
            int freqVec[];
            int noOfTermsVec[];
            String terms[];
            int freq[];
            int noOfTerms;
            float score[] = null;

            //termsFreq3=re.getTermFreqVectors(currentDocID);
            /*
             * getting the fields in the indexed order: doccontent, docid,
             * fiboterms, taxoterms
             */

            //termsFreqVec = re.getTermFreqVectors(k);
            DefaultSimilarity simi = new DefaultSimilarity();
            for (int m = 0; m < 4; m++) {
                switch (m) {
                case 0: //doc content
                    termsFreq = re.getTermFreqVector(k, "doccontent");
                    //  freq = termsFreqVec[0].getTermFrequencies();
                    // terms = termsFreqVec[0].getTerms();
                    freq = termsFreq.getTermFrequencies();
                    terms = termsFreq.getTerms();
                    noOfTerms = terms.length;
                    score = new float[noOfTerms];

                    for (i = 0; i < noOfTerms; i++) {
                        int noofDocsContainTerm = re.docFreq(new Term("doccontent", terms[i]));

                        float tf = simi.tf(freq[i]);
                        float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                        wordMap.put(terms[i], (tf * idf));

                    }

                    break;
                case 1: // doc Id
                    termsFreqDocId = re.getTermFreqVector(k, "docid");
                    // terms = termsFreqVec[1].getTerms();
                    aInt = Integer.parseInt(termsFreqDocId.getTerms()[0]);
                    break;
                case 2: //Fiboterms
                    termsFreqFiboTerm = re.getTermFreqVector(k, "fiboterms");
                    if (termsFreqFiboTerm != null) {
                        freq = termsFreqFiboTerm.getTermFrequencies();
                        terms = termsFreqFiboTerm.getTerms();

                        noOfTerms = terms.length;

                        score = new float[noOfTerms];
                        //DefaultSimilarity simi = new DefaultSimilarity();
                        for (i = 0; i < noOfTerms; i++) {
                            int noofDocsContainTerm = re.docFreq(new Term("fiboterms", terms[i]));

                            float tf = simi.tf(freq[i]);
                            float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                            wordMap.put(terms[i], (tf * idf * weight));

                        }
                    }
                    break;
                case 3: //taxoterms
                    termsFreqTaxoTerm = re.getTermFreqVector(k, "taxoterms");
                    if (termsFreqTaxoTerm != null) {
                        freq = termsFreqTaxoTerm.getTermFrequencies();
                        terms = termsFreqTaxoTerm.getTerms();

                        noOfTerms = terms.length;

                        score = new float[noOfTerms];
                        //DefaultSimilarity simi = new DefaultSimilarity();
                        for (i = 0; i < noOfTerms; i++) {
                            int noofDocsContainTerm = re.docFreq(new Term("taxoterms", terms[i]));

                            float tf = simi.tf(freq[i]);
                            float idf = simi.idf(noofDocsContainTerm, noOfDocs);
                            wordMap.put(terms[i], (tf * idf * weight));
                        }
                    }
                    break;

                default:
                    //System.out.println("Invalid Entry!");
                }
            }

            scoreMap.put(aInt, wordMap);
        }

    } catch (IOException e) {
        // score = null;
        e.printStackTrace();
    }

    //Map<Integer,Float[]> scoreMap=new Map<Integer, Float[]>(); 

    return scoreMap;
}

From source file:org.ninit.models.bm25.BM25TermScorer.java

License:Apache License

public BM25TermScorer(IndexReader reader, TermQuery term, Similarity similarity) throws IOException {
    super(similarity);
    this.reader = reader;
    this.term = term;
    this.idf = this.getSimilarity().idf(reader.docFreq(term.getTerm()), reader.numDocs());
    this.norm = this.reader.norms(this.term.getTerm().field());
    this.av_length = BM25Parameters.getAverageLength(this.term.getTerm().field());
    this.b = BM25Parameters.getB();
    this.k1 = BM25Parameters.getK1();
    this.termDocs = this.reader.termDocs(this.term.getTerm());

}

From source file:org.opengrok.suggest.SuggesterUtils.java

License:Open Source License

private static double computeNormalizedDocumentFrequency(final IndexReader indexReader, final Term term)
        throws IOException {
    int documentFrequency = indexReader.docFreq(term);

    return ((double) documentFrequency) / indexReader.numDocs();
}