List of usage examples for org.apache.lucene.search.spell DirectSpellChecker DirectSpellChecker
public DirectSpellChecker()
From source file:com.codeReading.core.opengrok.SearchHelper.java
License:Open Source License
/** * Create the searcher to use wrt. to currently set parameters and the given * projects. Does not produce any {@link #redirect} link. It also does * nothing if {@link #redirect} or {@link #errorMsg} have a * none-{@code null} value. <p> Parameters which should be populated/set at * this time: <ul> <li>{@link #builder}</li> <li>{@link #dataRoot}</li> * <li>{@link #order} (falls back to relevance if unset)</li> * <li>{@link #parallel} (default: false)</li> </ul> Populates/sets: <ul> * <li>{@link #query}</li> <li>{@link #searcher}</li> <li>{@link #sort}</li> * <li>{@link #projects}</li> <li>{@link #errorMsg} if an error occurs</li> * </ul>/*from ww w . jav a 2 s . c om*/ * * @param projects project to use query. If empty, a none-project opengrok * setup is assumed (i.e. DATA_ROOT/index will be used instead of possible * multiple DATA_ROOT/$project/index). * @return this instance */ public SearchHelper prepareExec(SortedSet<String> projects) { if (redirect != null || errorMsg != null) { return this; } // the Query created by the QueryBuilder try { indexDir = new File(dataRoot, "index"); query = builder.build(); if (projects == null) { errorMsg = "No project selected!"; return this; } this.projects = projects; if (projects.isEmpty()) { //no project setup FSDirectory dir = FSDirectory.open(indexDir); searcher = new IndexSearcher(DirectoryReader.open(dir)); } else if (projects.size() == 1) { // just 1 project selected FSDirectory dir = FSDirectory.open(new File(indexDir, projects.first())); searcher = new IndexSearcher(DirectoryReader.open(dir)); } else { //more projects IndexReader[] subreaders = new IndexReader[projects.size()]; int ii = 0; //TODO might need to rewrite to Project instead of // String , need changes in projects.jspf too for (String proj : projects) { FSDirectory dir = FSDirectory.open(new File(indexDir, proj)); subreaders[ii++] = DirectoryReader.open(dir); } MultiReader searchables = new MultiReader(subreaders, true); if (parallel) { int noThreads = 2 + (2 * Runtime.getRuntime().availableProcessors()); //TODO there might be a better way for counting this executor = Executors.newFixedThreadPool(noThreads); } searcher = parallel ? new IndexSearcher(searchables, executor) : new IndexSearcher(searchables); } // TODO check if below is somehow reusing sessions so we don't // requery again and again, I guess 2min timeout sessions could be // usefull, since you click on the next page within 2mins, if not, // then wait ;) switch (order) { case LASTMODIFIED: sort = new Sort(new SortField("date", SortField.Type.STRING, true)); break; case BY_PATH: sort = new Sort(new SortField("fullpath", SortField.Type.STRING)); break; default: sort = Sort.RELEVANCE; break; } checker = new DirectSpellChecker(); } catch (ParseException e) { errorMsg = PARSE_ERROR_MSG + e.getMessage(); } catch (FileNotFoundException e) { // errorMsg = "Index database(s) not found: " + e.getMessage(); errorMsg = "Index database(s) not found."; } catch (Exception e) { errorMsg = e.getMessage(); } return this; }
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null;// w ww . j ava 2s .c o m try { tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Util.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); foundSuggestions = true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null;//from w w w.j a v a 2 s. c om try { tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Closeables.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); foundSuggestions = true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:org.codelibs.elasticsearch.search.suggest.DirectSpellcheckerSettings.java
License:Apache License
public DirectSpellChecker createDirectSpellChecker() { DirectSpellChecker directSpellChecker = new DirectSpellChecker(); directSpellChecker.setAccuracy(accuracy()); Comparator<SuggestWord> comparator; switch (sort()) { case SCORE://w w w .j a va 2 s . c o m comparator = SCORE_COMPARATOR; break; case FREQUENCY: comparator = LUCENE_FREQUENCY; break; default: throw new IllegalArgumentException("Illegal suggest sort: " + sort()); } directSpellChecker.setComparator(comparator); directSpellChecker.setDistance(stringDistance()); directSpellChecker.setMaxEdits(maxEdits()); directSpellChecker.setMaxInspections(maxInspections()); directSpellChecker.setMaxQueryFrequency(maxTermFreq()); directSpellChecker.setMinPrefix(prefixLength()); directSpellChecker.setMinQueryLength(minWordLength()); directSpellChecker.setThresholdFrequency(minDocFreq()); directSpellChecker.setLowerCaseTerms(false); return directSpellChecker; }
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHeros() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override//from w w w .j a va 2 s.c o m protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8)); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); Correction[] corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>")); assertThat(result.cutoffScore, greaterThan(0d)); result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1); corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame")); assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE)); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the <em>god</em> jewel")); assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor the <em>god</em> jewel")); assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel")); assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel")); // Test some of the highlighting corner cases suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space).utf8ToString(), equalTo("xor teh god jewel")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr the god</em> jewel")); assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor <em>the god</em> jewel")); assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn the god</em> jewel")); assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xor teh <em>god</em> jewel")); // test synonyms Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t); try { SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer(Version.LUCENE_41)); ((SolrSynonymParser) parser).parse( new StringReader("usa => usa, america, american\nursa => usa, america, american")); filter = new SynonymFilter(filter, parser.build(), true); } catch (Exception e) { throw new RuntimeException(e); } return new TokenStreamComponents(t, filter); } }; spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>")); // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>")); }
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHerosMultiGenerator() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override//ww w.j ava2s . com protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); mapping.put("body_reverse", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new ReverseStringFilter(Version.LUCENE_41, new LowerCaseFilter(Version.LUCENE_41, t))); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8)); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10); DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse")); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse); Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); generator = new MultiCandidateGeneratorWrapper(5, forward, reverse); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("varr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); }
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHerosTrigram() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override/*from w w w .ja v a 2s . c o m*/ protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8)); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); WordScorer wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections; assertThat(corrections.length, equalTo(0)); // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape")); wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); // test synonyms Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t); try { SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer(Version.LUCENE_41)); ((SolrSynonymParser) parser).parse( new StringReader("usa => usa, america, american\nursa => usa, america, american")); filter = new SynonymFilter(filter, parser.build(), true); } catch (Exception e) { throw new RuntimeException(e); } return new TokenStreamComponents(t, filter); } }; spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.95); wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body")); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(2)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); }
From source file:org.elasticsearch.search.suggest.SuggestUtils.java
License:Apache License
public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) { DirectSpellChecker directSpellChecker = new DirectSpellChecker(); directSpellChecker.setAccuracy(suggestion.accuracy()); Comparator<SuggestWord> comparator; switch (suggestion.sort()) { case SCORE:/*w w w .j a va 2 s . c o m*/ comparator = SCORE_COMPARATOR; break; case FREQUENCY: comparator = LUCENE_FREQUENCY; break; default: throw new ElasticsearchIllegalArgumentException("Illegal suggest sort: " + suggestion.sort()); } directSpellChecker.setComparator(comparator); directSpellChecker.setDistance(suggestion.stringDistance()); directSpellChecker.setMaxEdits(suggestion.maxEdits()); directSpellChecker.setMaxInspections(suggestion.maxInspections()); directSpellChecker.setMaxQueryFrequency(suggestion.maxTermFreq()); directSpellChecker.setMinPrefix(suggestion.prefixLength()); directSpellChecker.setMinQueryLength(suggestion.minWordLength()); directSpellChecker.setThresholdFrequency(suggestion.minDocFreq()); directSpellChecker.setLowerCaseTerms(false); return directSpellChecker; }
From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHeros() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override/*from ww w.ja v a 2 s. c o m*/ protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"))); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); WordScorer wordScorer = new LaplaceScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ame")); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); // test synonyms Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t); try { SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer(Version.LUCENE_41)); ((SolrSynonymParser) parser) .add(new StringReader("usa => usa, america, american\nursa => usa, america, american")); filter = new SynonymFilter(filter, parser.build(), true); } catch (Exception e) { throw new RuntimeException(e); } return new TokenStreamComponents(t, filter); } }; spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); }
From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHerosMultiGenerator() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override// w w w . ja v a2s. c o m protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); mapping.put("body_reverse", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new ReverseStringFilter(Version.LUCENE_41, new LowerCaseFilter(Version.LUCENE_41, t))); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"))); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); LaplaceScorer wordScorer = new LaplaceScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10); DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse); Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); generator = new MultiCandidateGeneratorWrapper(5, forward, reverse); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("tarr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); }