List of usage examples for org.apache.lucene.analysis.shingle ShingleFilter setOutputUnigrams
public void setOutputUnigrams(boolean outputUnigrams)
From source file:com.NGramTokenBaseAnalyzer.java
/**
 * Wraps a token stream in a {@link ShingleFilter} sized by the
 * {@code NGramTokenBaseAnalyzer.min} and {@code NGramTokenBaseAnalyzer.max} settings.
 *
 * @param tok     the token stream the shingles are built from
 * @param unigram value handed to {@link ShingleFilter#setOutputUnigrams(boolean)}
 * @return the configured shingle filter
 */
public static ShingleFilter filter(TokenStream tok, boolean unigram) {
    final int smallest = NGramTokenBaseAnalyzer.min;
    final int largest = NGramTokenBaseAnalyzer.max;

    ShingleFilter shingles = new ShingleFilter(tok, smallest, largest);
    shingles.setOutputUnigrams(unigram);
    return shingles;
}
From source file:edu.isi.pfindr.learn.model.Shingles.java
License:Apache License
public static List<String> computeShingles(String data) { //System.out.println("I an here"); data = data.toLowerCase();// www . ja v a 2s . co m List<String> shingleList = new ArrayList<String>(); //System.out.println("DATA inside expandWithDictionaryForShingles "+ data); try { Tokenizer analyzer = new Tokenizer(Version.LUCENE_30); TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data)); ShingleFilter filter = new ShingleFilter(tokenStream, 4); filter.setOutputUnigrams(false); TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); //System.out.print("Printing the shingles "); while (filter.incrementToken()) { shingleList.add(termAtt.term().trim()); //.replaceAll("_", " ").replaceAll("\\s+", " ").trim()); //System.out.print(termAtt.term()+ "\t"); } } catch (Exception e) { e.printStackTrace(); } logger.info("Shingle List size returned: " + shingleList.size()); return shingleList; }
From source file:org.apache.nutch.scoring.similarity.util.LuceneTokenizer.java
License:Apache License
/**
 * Builds the n-gram (shingle) analysis chain for {@code content} and stores it in the
 * {@code tokenStream} field: standard tokenization, lower-casing, the result of
 * {@code applyStemmer(stemFilterType)}, then a {@link ShingleFilter} that emits shingles
 * only (unigram output is disabled).
 *
 * @param content the text to tokenize
 * @param mingram minimum shingle size passed to the {@link ShingleFilter}
 * @param maxgram maximum shingle size passed to the {@link ShingleFilter}
 * @return the shingle stream, which is also left in the {@code tokenStream} field
 */
private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
    Tokenizer source = new StandardTokenizer();
    source.setReader(new StringReader(content));

    // The field is assigned before applyStemmer runs; applyStemmer is defined elsewhere
    // and presumably wraps the current field value - keep this ordering.
    tokenStream = new LowerCaseFilter(source);
    tokenStream = applyStemmer(stemFilterType);

    ShingleFilter shingles = new ShingleFilter(tokenStream, mingram, maxgram);
    shingles.setOutputUnigrams(false);

    tokenStream = shingles;
    return tokenStream;
}
From source file:org.apache.solr.analysis.ShingleFilterFactory.java
License:Apache License
public ShingleFilter create(TokenStream input) { ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize); r.setOutputUnigrams(outputUnigrams); r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); r.setTokenSeparator(tokenSeparator); return r;//from w w w . j av a 2 s . c om }
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHeros() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override/* w w w.j a v a 2 s . co m*/ protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8)); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); Result result = 
suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); Correction[] corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>")); assertThat(result.cutoffScore, greaterThan(0d)); result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1); corrections = result.corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame")); assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE)); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the <em>god</em> jewel")); assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor the <em>god</em> jewel")); assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel")); assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), 
equalTo("<em>xorr</em> the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel")); // Test some of the highlighting corner cases suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(space).utf8ToString(), equalTo("xor teh god jewel")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr the god</em> jewel")); assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor <em>the god</em> jewel")); assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn the god</em> jewel")); assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xor teh <em>god</em> jewel")); // test synonyms Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); TokenFilter filter = 
new LowerCaseFilter(Version.LUCENE_41, t); try { SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer(Version.LUCENE_41)); ((SolrSynonymParser) parser).parse( new StringReader("usa => usa, america, american\nursa => usa, america, american")); filter = new SynonymFilter(filter, parser.build(), true); } catch (Exception e) { throw new RuntimeException(e); } return new TokenStreamComponents(t, filter); } }; spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>")); // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body")); corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections[0].join(new BytesRef(" 
")).utf8ToString(), equalTo("captain america")); assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>")); }
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHerosMultiGenerator() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override//from ww w . jav a 2 s.co m protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); mapping.put("body_reverse", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new ReverseStringFilter(Version.LUCENE_41, new LowerCaseFilter(Version.LUCENE_41, t))); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8)); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); LaplaceScorer wordScorer = new 
LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10); DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse")); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse); Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); generator = new MultiCandidateGeneratorWrapper(5, forward, reverse); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections; assertThat(corrections.length, equalTo(4)); 
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("varr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); }
From source file:org.elasticsearch.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHerosTrigram() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override//w w w . j av a 2 s. co m protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"), Charsets.UTF_8)); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); WordScorer wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 
5); Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections; assertThat(corrections.length, equalTo(0)); // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape")); wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 
100, 3).corrections; assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); // test synonyms Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t); try { SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer(Version.LUCENE_41)); ((SolrSynonymParser) parser).parse( new StringReader("usa => usa, america, american\nursa => usa, america, american")); filter = new SynonymFilter(filter, parser.build(), true); } catch (Exception e) { throw new RuntimeException(e); } return new TokenStreamComponents(t, filter); } }; spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.95); wordScorer = new LinearInterpoatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body")); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections; assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", 
wordScorer, 0, 3).corrections; assertThat(corrections.length, equalTo(2)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); }
From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHeros() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override/*from w w w . j a v a 2s. co m*/ protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"))); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); WordScorer wordScorer = new LaplaceScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); Correction[] corrections = suggester.getCorrections(wrapper, new 
BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ame")); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); // test synonyms Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); TokenFilter filter = new 
LowerCaseFilter(Version.LUCENE_41, t); try { SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer(Version.LUCENE_41)); ((SolrSynonymParser) parser) .add(new StringReader("usa => usa, america, american\nursa => usa, america, american")); filter = new SynonymFilter(filter, parser.build(), true); } catch (Exception e) { throw new RuntimeException(e); } return new TokenStreamComponents(t, filter); } }; spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.85); wordScorer = new LaplaceScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5f); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); }
From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHerosMultiGenerator() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override/*from w ww.ja va 2s. co m*/ protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); mapping.put("body_reverse", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new ReverseStringFilter(Version.LUCENE_41, new LowerCaseFilter(Version.LUCENE_41, t))); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"))); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); LaplaceScorer wordScorer = new LaplaceScorer(ir, 
"body_ngram", 0.95d, new BytesRef(" "), 0.5f); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10); DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper); CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse); Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); generator = new MultiCandidateGeneratorWrapper(5, forward, reverse); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" 
")).utf8ToString(), equalTo("zorr the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("tarr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); }
From source file:org.elasticsearch.test.unit.search.suggest.phrase.NoisyChannelSpellCheckerTests.java
License:Apache License
@Test public void testMarvelHerosTrigram() throws IOException { RAMDirectory dir = new RAMDirectory(); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put("body_ngram", new Analyzer() { @Override/*from w w w. j av a2 s .co m*/ protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); ShingleFilter tf = new ShingleFilter(t, 2, 3); tf.setOutputUnigrams(false); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, tf)); } }); mapping.put("body", new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); return new TokenStreamComponents(t, new LowerCaseFilter(Version.LUCENE_41, t)); } }); PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_41), mapping); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_41, wrapper); IndexWriter writer = new IndexWriter(dir, conf); BufferedReader reader = new BufferedReader(new InputStreamReader( NoisyChannelSpellCheckerTests.class.getResourceAsStream("/config/names.txt"))); String line = null; while ((line = reader.readLine()) != null) { Document doc = new Document(); doc.add(new Field("body", line, TextField.TYPE_NOT_STORED)); doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED)); writer.addDocument(doc); } DirectoryReader ir = DirectoryReader.open(writer, false); WordScorer wordScorer = new LinearInterpoatingScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker(); DirectSpellChecker spellchecker = new DirectSpellChecker(); spellchecker.setMinQueryLength(1); DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5); Correction[] corrections = 
suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace")); corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1); assertThat(corrections.length, equalTo(0)); // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape")); wordScorer = new LinearInterpoatingScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3); assertThat(corrections.length, equalTo(4)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel")); assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel")); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3); assertThat(corrections.length, equalTo(1)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), 
equalTo("xorr the god jewel")); // test synonyms Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer t = new StandardTokenizer(Version.LUCENE_41, reader); TokenFilter filter = new LowerCaseFilter(Version.LUCENE_41, t); try { SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer(Version.LUCENE_41)); ((SolrSynonymParser) parser) .add(new StringReader("usa => usa, america, american\nursa => usa, america, american")); filter = new SynonymFilter(filter, parser.build(), true); } catch (Exception e) { throw new RuntimeException(e); } return new TokenStreamComponents(t, filter); } }; spellchecker.setAccuracy(0.0f); spellchecker.setMinPrefix(1); spellchecker.setMinQueryLength(1); suggester = new NoisyChannelSpellChecker(0.95); wordScorer = new LinearInterpoatingScorer(ir, "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer); corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america")); wordScorer = new StupidBackoffScorer(ir, "body_ngram", 0.85d, new BytesRef(" "), 0.4); corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3); assertThat(corrections.length, equalTo(2)); assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel")); assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel")); }