Example usage for org.apache.lucene.search.spell DirectSpellChecker setThresholdFrequency

List of usage examples for org.apache.lucene.search.spell DirectSpellChecker setThresholdFrequency

Introduction

On this page you can find example usages of org.apache.lucene.search.spell.DirectSpellChecker.setThresholdFrequency.

Prototype

public void setThresholdFrequency(float thresholdFrequency) 

Source Link

Document

Set the minimal threshold of documents a term must appear in for a match.

Usage

From source file:org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.java

License:Apache License

/**
 * Builds a fresh {@link DirectSpellChecker} configured from this object's settings.
 *
 * <p>The accuracy, comparator, string distance, max edits, max inspections,
 * max query frequency, min prefix, min query length and threshold frequency are
 * taken from the corresponding accessors; lower-casing of terms is switched off.
 *
 * @return the newly created and configured spell checker
 * @throws IllegalArgumentException if {@code sort()} is neither {@code SCORE}
 *         nor {@code FREQUENCY}
 */
public DirectSpellChecker createDirectSpellChecker() {
    final DirectSpellChecker checker = new DirectSpellChecker();
    checker.setAccuracy(accuracy());

    // Translate the configured sort mode into the comparator used to order suggestions.
    final Comparator<SuggestWord> suggestionOrder;
    switch (sort()) {
    case FREQUENCY:
        suggestionOrder = LUCENE_FREQUENCY;
        break;
    case SCORE:
        suggestionOrder = SCORE_COMPARATOR;
        break;
    default:
        throw new IllegalArgumentException("Illegal suggest sort: " + sort());
    }
    checker.setComparator(suggestionOrder);

    checker.setDistance(stringDistance());
    checker.setMaxEdits(maxEdits());
    checker.setMaxInspections(maxInspections());
    checker.setMaxQueryFrequency(maxTermFreq());
    checker.setMinPrefix(prefixLength());
    checker.setMinQueryLength(minWordLength());
    checker.setThresholdFrequency(minDocFreq());
    checker.setLowerCaseTerms(false);
    return checker;
}

From source file:org.elasticsearch.search.suggest.SuggestUtils.java

License:Apache License

/**
 * Creates a {@link DirectSpellChecker} configured from the given settings.
 *
 * <p>The accuracy, comparator, string distance, max edits, max inspections,
 * max query frequency, min prefix, min query length and threshold frequency are
 * read from {@code suggestion}; lower-casing of terms is switched off.
 *
 * @param suggestion the settings to copy onto the new spell checker
 * @return the newly created and configured spell checker
 * @throws ElasticsearchIllegalArgumentException if {@code suggestion.sort()} is
 *         neither {@code SCORE} nor {@code FREQUENCY}
 */
public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) {
    final DirectSpellChecker checker = new DirectSpellChecker();
    checker.setAccuracy(suggestion.accuracy());

    // Translate the configured sort mode into the comparator used to order suggestions.
    final Comparator<SuggestWord> suggestionOrder;
    switch (suggestion.sort()) {
    case FREQUENCY:
        suggestionOrder = LUCENE_FREQUENCY;
        break;
    case SCORE:
        suggestionOrder = SCORE_COMPARATOR;
        break;
    default:
        throw new ElasticsearchIllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
    }
    checker.setComparator(suggestionOrder);

    checker.setDistance(suggestion.stringDistance());
    checker.setMaxEdits(suggestion.maxEdits());
    checker.setMaxInspections(suggestion.maxInspections());
    checker.setMaxQueryFrequency(suggestion.maxTermFreq());
    checker.setMinPrefix(suggestion.prefixLength());
    checker.setMinQueryLength(suggestion.minWordLength());
    checker.setThresholdFrequency(suggestion.minDocFreq());
    checker.setLowerCaseTerms(false);
    return checker;
}

From source file:perf.CreateQueries.java

License:Apache License

/**
 * Picks the terms that produce the most spell-checker suggestions and writes
 * fuzzy and respell query lines for them.
 *
 * <p>Each entry of {@code topTerms} whose {@code term.length} is at least 5 is
 * passed to a {@link DirectSpellChecker} (threshold frequency {@code 1.0f}),
 * asking for up to 50 suggestions in {@code SUGGEST_MORE_POPULAR} mode. Terms
 * with fewer than 5 suggestions are skipped. The rest are scored as
 * {@code log(sum of suggestion freqs) * suggestion count} and offered to a
 * {@code MostFrequentTerms} queue created with {@code NUM_QUERIES}. For every
 * entry left in the queue, a {@code Fuzzy1}, {@code Fuzzy2} and {@code Respell}
 * line is written to {@code queriesOut}, which is flushed at the end.
 *
 * @param r          index reader the suggestions are looked up in
 * @param field      field name used to build each {@code Term}
 * @param topTerms   candidate terms to evaluate
 * @param queriesOut destination for the generated query lines
 * @throws IOException      declared for the spell-checker lookup and writer calls
 * @throws RuntimeException if fewer than {@code NUM_QUERIES} terms ended up in the queue
 */
private static void makeFuzzyAndRespellQueries(IndexReader r, String field, TermFreq[] topTerms,
        Writer queriesOut) throws IOException {

    System.out.println("\nFind top fuzzy/respell terms...");
    final DirectSpellChecker spellChecker = new DirectSpellChecker();
    spellChecker.setThresholdFrequency(1.0f);

    final MostFrequentTerms pq = new MostFrequentTerms(NUM_QUERIES);

    // TODO: use threads...?
    int count = 0;
    for (TermFreq tdf : topTerms) {
        // Progress report every 1000 candidate terms.
        if ((++count) % 1000 == 0) {
            System.out.println("  " + count + " of " + topTerms.length + "...");
        }
        // Skip short terms.
        if (tdf.term.length < 5) {
            continue;
        }
        // TODO: make my own fuzzy enum?
        long sumDF = 0;
        SuggestWord[] suggested = spellChecker.suggestSimilar(new Term(field, tdf.term), 50, r,
                SuggestMode.SUGGEST_MORE_POPULAR);
        // Skip terms that yield too few suggestions.
        if (suggested.length < 5) {
            continue;
        }
        for (SuggestWord suggest : suggested) {
            sumDF += suggest.freq;
        }

        // Strongly favor higher number of suggestions and gently favor higher sumDF:
        final long score = (long) (Math.log(sumDF) * suggested.length);

        final TermFreq newTF = new TermFreq(tdf.term, score);
        final TermFreq bumpedTF = pq.insertWithOverflow(newTF);

        // NOTE(review): presumably insertWithOverflow returns the inserted object itself
        // when it was rejected, so this logs only entries that were kept - confirm.
        if (bumpedTF != newTF) {
            System.out.println(
                    "  " + newTF.term.utf8ToString() + " score=" + score + " suggestCount=" + suggested.length);
        }
    }

    if (pq.size() < NUM_QUERIES) {
        throw new RuntimeException("index is too small: only " + pq.size() + " top fuzzy terms");
    }

    // Drain the queue, emitting three query lines per term.
    while (pq.size() > 0) {
        TermFreq tdf = pq.pop();
        System.out.println("  " + tdf.term.utf8ToString() + " freq=" + tdf.df);
        queriesOut.write("Fuzzy1: " + tdf.term.utf8ToString() + "~1\n");
        queriesOut.write("Fuzzy2: " + tdf.term.utf8ToString() + "~2\n");
        queriesOut.write("Respell: " + tdf.term.utf8ToString() + "\n");
    }
    queriesOut.flush();
}