List of usage examples for org.apache.lucene.search.spell DirectSpellChecker suggestSimilar
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode) throws IOException
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null;/*from w w w . ja v a 2 s . co m*/ try { tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Util.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); foundSuggestions = true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null;//from ww w.java 2 s . c om try { tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Closeables.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); foundSuggestions = true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:org.codelibs.elasticsearch.search.suggest.term.TermSuggester.java
License:Apache License
@Override public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException { DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings() .createDirectSpellChecker(); final IndexReader indexReader = searcher.getIndexReader(); TermSuggestion response = new TermSuggestion(name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()); List<Token> tokens = queryTerms(suggestion, spare); for (Token token : tokens) { // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar(token.term, suggestion.getShardSize(), indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode()); Text key = new Text(new BytesArray(token.term.bytes())); TermSuggestion.Entry resultEntry = new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset); for (SuggestWord suggestWord : suggestedWords) { Text word = new Text(suggestWord.string); resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score)); }/*from www . ja va 2 s. c o m*/ response.addTerm(resultEntry); } return response; }
From source file:org.elasticsearch.search.suggest.term.TermSuggester.java
License:Apache License
@Override public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexReader indexReader, CharsRef spare) throws IOException { DirectSpellChecker directSpellChecker = SuggestUtils .getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings()); TermSuggestion response = new TermSuggestion(name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()); List<Token> tokens = queryTerms(suggestion, spare); for (Token token : tokens) { // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar(token.term, suggestion.getShardSize(), indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode()); Text key = new BytesText(new BytesArray(token.term.bytes())); TermSuggestion.Entry resultEntry = new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset); for (SuggestWord suggestWord : suggestedWords) { Text word = new StringText(suggestWord.string); resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score)); }//from w ww .j av a 2s. com response.addTerm(resultEntry); } return response; }
From source file:perf.CreateQueries.java
License:Apache License
private static void makeFuzzyAndRespellQueries(IndexReader r, String field, TermFreq[] topTerms, Writer queriesOut) throws IOException { System.out.println("\nFind top fuzzy/respell terms..."); final DirectSpellChecker spellChecker = new DirectSpellChecker(); spellChecker.setThresholdFrequency(1.0f); final MostFrequentTerms pq = new MostFrequentTerms(NUM_QUERIES); // TODO: use threads...? int count = 0; for (TermFreq tdf : topTerms) { if ((++count) % 1000 == 0) { System.out.println(" " + count + " of " + topTerms.length + "..."); }// w ww.j a v a 2 s. co m if (tdf.term.length < 5) { continue; } // TODO: make my own fuzzy enum? long sumDF = 0; SuggestWord[] suggested = spellChecker.suggestSimilar(new Term(field, tdf.term), 50, r, SuggestMode.SUGGEST_MORE_POPULAR); if (suggested.length < 5) { continue; } for (SuggestWord suggest : suggested) { sumDF += suggest.freq; } // Strongly favor higher number of suggestions and gently favor higher sumDF: final long score = (long) (Math.log(sumDF) * suggested.length); final TermFreq newTF = new TermFreq(tdf.term, score); final TermFreq bumpedTF = pq.insertWithOverflow(newTF); if (bumpedTF != newTF) { System.out.println( " " + newTF.term.utf8ToString() + " score=" + score + " suggestCount=" + suggested.length); } } if (pq.size() < NUM_QUERIES) { throw new RuntimeException("index is too small: only " + pq.size() + " top fuzzy terms"); } int downTo = NUM_QUERIES; while (pq.size() > 0) { TermFreq tdf = pq.pop(); System.out.println(" " + tdf.term.utf8ToString() + " freq=" + tdf.df); queriesOut.write("Fuzzy1: " + tdf.term.utf8ToString() + "~1\n"); queriesOut.write("Fuzzy2: " + tdf.term.utf8ToString() + "~2\n"); queriesOut.write("Respell: " + tdf.term.utf8ToString() + "\n"); } queriesOut.flush(); }