Example usage for org.apache.lucene.search.spell DirectSpellChecker suggestSimilar

List of usage examples for org.apache.lucene.search.spell DirectSpellChecker suggestSimilar

Introduction

This page collects usage examples for the method org.apache.lucene.search.spell.DirectSpellChecker#suggestSimilar.

Prototype

public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode)
        throws IOException 

Source Link

Document

Calls the overload suggestSimilar(Term, int, IndexReader, SuggestMode, float) as suggestSimilar(term, numSug, ir, suggestMode, this.accuracy), i.e. with the accuracy currently configured on this spell checker.

Usage

From source file:de.blizzy.documentr.search.PageFinder.java

License:Open Source License

/**
 * Builds a "did you mean" style suggestion for a search text.
 *
 * <p>The text is tokenized with the analyzer of the {@code PageIndex.ALL_TEXT_SUGGESTIONS}
 * field, each non-blank token is run through a {@link DirectSpellChecker}, and every token
 * that has a more popular alternative is replaced in place. The rewritten text is then
 * searched; a suggestion is only returned when that search yields at least one hit.
 *
 * @param searchText the text the user searched for
 * @param authentication passed through unchanged to {@code findPages}
 * @param searcher searcher whose index reader supplies the spelling suggestions and which
 *        is used for the verification search
 * @return the suggestion (plain text, HTML with replaced words emphasized, hit count), or
 *         {@code null} when no word was replaced or the rewritten text finds nothing
 * @throws IOException if tokenizing or reading the index fails
 * @throws ParseException propagated from {@code findPages}
 * @throws TimeoutException propagated from {@code findPages}
 */
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        // addAttribute returns the attribute instance, which the stream updates on every token
        CharTermAttribute charTerm = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                words.add(new WordPosition(text, offset.startOffset(), offset.endOffset()));
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    // Replace back to front so that a replacement of different length does not shift the
    // offsets of words still to be processed (assumes tokens arrive in ascending offset order).
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    // Time-stamped markers tag the replaced words until the final HTML is produced
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            // Insert the raw word: the complete text is HTML-escaped exactly once below.
            // Escaping here as well would double-escape it (e.g. "&uuml;" -> "&amp;uuml;").
            suggestedSearchTextHtml.replace(start, end, startMarker + suggestedWord + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        // Only offer the suggestion if searching for it actually finds something
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            // Escape first, then turn the markers into emphasis tags so the tags stay intact
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestion, html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:de.blizzy.documentr.search.PageIndex.java

License:Open Source License

/**
 * Builds a "did you mean" style suggestion for a search text.
 *
 * <p>The text is tokenized with the analyzer of the {@code ALL_TEXT_SUGGESTIONS} field,
 * each non-blank token is run through a {@link DirectSpellChecker}, and every token that
 * has a more popular alternative is replaced in place. The rewritten text is then searched;
 * a suggestion is only returned when that search yields at least one hit.
 *
 * @param searchText the text the user searched for
 * @param authentication passed through unchanged to {@code findPages}
 * @param searcher searcher whose index reader supplies the spelling suggestions and which
 *        is used for the verification search
 * @return the suggestion (plain text, HTML with replaced words emphasized, hit count), or
 *         {@code null} when no word was replaced or the rewritten text finds nothing
 * @throws IOException if tokenizing or reading the index fails
 * @throws ParseException propagated from {@code findPages}
 * @throws TimeoutException propagated from {@code findPages}
 */
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        // addAttribute returns the attribute instance, which the stream updates on every token
        CharTermAttribute charTerm = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                words.add(new WordPosition(text, offset.startOffset(), offset.endOffset()));
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    // Replace back to front so that a replacement of different length does not shift the
    // offsets of words still to be processed (assumes tokens arrive in ascending offset order).
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    // Time-stamped markers tag the replaced words until the final HTML is produced
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            // Insert the raw word: the complete text is HTML-escaped exactly once below.
            // Escaping here as well would double-escape it (e.g. "&uuml;" -> "&amp;uuml;").
            suggestedSearchTextHtml.replace(start, end, startMarker + suggestedWord + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        // Only offer the suggestion if searching for it actually finds something
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            // Escape first, then turn the markers into emphasis tags so the tags stay intact
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestion, html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:org.codelibs.elasticsearch.search.suggest.term.TermSuggester.java

License:Apache License

/**
 * Runs the term suggester: asks a {@link DirectSpellChecker} for alternatives to every
 * token returned by {@code queryTerms} and collects them into one {@link TermSuggestion}.
 *
 * @param name name given to the resulting suggestion
 * @param suggestion context supplying size, shard size and spell checker settings
 * @param searcher searcher whose index reader is consulted for similar terms
 * @param spare scratch buffer handed on to {@code queryTerms}
 * @return a suggestion holding one entry per token, each with its suggested options
 * @throws IOException if reading the index fails
 */
@Override
public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher,
        CharsRefBuilder spare) throws IOException {
    final DirectSpellChecker checker = suggestion.getDirectSpellCheckerSettings().createDirectSpellChecker();
    final IndexReader indexReader = searcher.getIndexReader();
    final TermSuggestion result = new TermSuggestion(name, suggestion.getSize(),
            suggestion.getDirectSpellCheckerSettings().sort());

    for (Token current : queryTerms(suggestion, spare)) {
        // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef
        final SuggestWord[] candidates = checker.suggestSimilar(current.term, suggestion.getShardSize(),
                indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode());

        // The entry is keyed by the original term bytes and records the token's offset and length
        final Text entryKey = new Text(new BytesArray(current.term.bytes()));
        final int tokenLength = current.endOffset - current.startOffset;
        final TermSuggestion.Entry entry = new TermSuggestion.Entry(entryKey, current.startOffset, tokenLength);

        for (int i = 0; i < candidates.length; i++) {
            final SuggestWord candidate = candidates[i];
            entry.addOption(new TermSuggestion.Entry.Option(new Text(candidate.string), candidate.freq,
                    candidate.score));
        }
        result.addTerm(entry);
    }
    return result;
}

From source file:org.elasticsearch.search.suggest.term.TermSuggester.java

License:Apache License

/**
 * Runs the term suggester: asks a {@link DirectSpellChecker} for alternatives to every
 * token returned by {@code queryTerms} and collects them into one {@link TermSuggestion}.
 *
 * @param name name given to the resulting suggestion
 * @param suggestion context supplying size, shard size and spell checker settings
 * @param indexReader reader consulted for similar terms
 * @param spare scratch buffer handed on to {@code queryTerms}
 * @return a suggestion holding one entry per token, each with its suggested options
 * @throws IOException if reading the index fails
 */
@Override
public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexReader indexReader,
        CharsRef spare) throws IOException {
    final DirectSpellChecker checker = SuggestUtils
            .getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings());
    final TermSuggestion result = new TermSuggestion(name, suggestion.getSize(),
            suggestion.getDirectSpellCheckerSettings().sort());

    for (Token current : queryTerms(suggestion, spare)) {
        // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef
        final SuggestWord[] candidates = checker.suggestSimilar(current.term, suggestion.getShardSize(),
                indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode());

        // The entry is keyed by the original term bytes and records the token's offset and length
        final Text entryKey = new BytesText(new BytesArray(current.term.bytes()));
        final int tokenLength = current.endOffset - current.startOffset;
        final TermSuggestion.Entry entry = new TermSuggestion.Entry(entryKey, current.startOffset, tokenLength);

        for (int i = 0; i < candidates.length; i++) {
            final SuggestWord candidate = candidates[i];
            final Text optionText = new StringText(candidate.string);
            entry.addOption(new TermSuggestion.Entry.Option(optionText, candidate.freq, candidate.score));
        }
        result.addTerm(entry);
    }
    return result;
}

From source file:perf.CreateQueries.java

License:Apache License

/**
 * Picks the terms best suited for fuzzy/respell benchmarking and writes three query lines
 * ({@code Fuzzy1}, {@code Fuzzy2}, {@code Respell}) for each of them.
 *
 * <p>A candidate term is scored by how many spelling suggestions it yields and, to a lesser
 * degree, by the summed frequency of those suggestions; the {@code NUM_QUERIES} best-scoring
 * terms are kept.
 *
 * @param r reader of the index the suggestions are drawn from
 * @param field field the candidate terms belong to
 * @param topTerms candidate terms to evaluate
 * @param queriesOut destination for the generated query lines; flushed but not closed
 * @throws IOException if reading the index or writing the queries fails
 * @throws RuntimeException if fewer than {@code NUM_QUERIES} terms qualify
 */
private static void makeFuzzyAndRespellQueries(IndexReader r, String field, TermFreq[] topTerms,
        Writer queriesOut) throws IOException {

    System.out.println("\nFind top fuzzy/respell terms...");
    final DirectSpellChecker spellChecker = new DirectSpellChecker();
    spellChecker.setThresholdFrequency(1.0f);

    final MostFrequentTerms pq = new MostFrequentTerms(NUM_QUERIES);

    // TODO: use threads...?
    int count = 0;
    for (TermFreq tdf : topTerms) {
        // Progress report every 1000 candidates
        if ((++count) % 1000 == 0) {
            System.out.println("  " + count + " of " + topTerms.length + "...");
        }
        // Skip short terms
        if (tdf.term.length < 5) {
            continue;
        }
        // TODO: make my own fuzzy enum?
        final SuggestWord[] suggested = spellChecker.suggestSimilar(new Term(field, tdf.term), 50, r,
                SuggestMode.SUGGEST_MORE_POPULAR);
        // Skip terms with too few suggestions
        if (suggested.length < 5) {
            continue;
        }
        long sumDF = 0;
        for (SuggestWord suggest : suggested) {
            sumDF += suggest.freq;
        }

        // Strongly favor higher number of suggestions and gently favor higher sumDF:
        final long score = (long) (Math.log(sumDF) * suggested.length);

        final TermFreq newTF = new TermFreq(tdf.term, score);
        final TermFreq bumpedTF = pq.insertWithOverflow(newTF);

        // Getting something other than newTF back presumably means the new term was kept
        // in the queue - TODO confirm against MostFrequentTerms.
        if (bumpedTF != newTF) {
            System.out.println(
                    "  " + newTF.term.utf8ToString() + " score=" + score + " suggestCount=" + suggested.length);
        }
    }

    if (pq.size() < NUM_QUERIES) {
        throw new RuntimeException("index is too small: only " + pq.size() + " top fuzzy terms");
    }

    while (pq.size() > 0) {
        final TermFreq tdf = pq.pop();
        // Decode the term once and reuse it for the log line and all three queries
        final String termText = tdf.term.utf8ToString();
        System.out.println("  " + termText + " freq=" + tdf.df);
        queriesOut.write("Fuzzy1: " + termText + "~1\n");
        queriesOut.write("Fuzzy2: " + termText + "~2\n");
        queriesOut.write("Respell: " + termText + "\n");
    }
    queriesOut.flush();
}