List of usage examples for org.apache.lucene.analysis MockTokenizer MockTokenizer
public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase)
From source file:org.elasticsearch.analysis.common.UniqueTokenFilterTests.java
License:Apache License
public void testSimple() throws IOException { Analyzer analyzer = new Analyzer() { @Override//from ww w .j a va 2 s.c o m protected TokenStreamComponents createComponents(String fieldName) { Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(t, new UniqueTokenFilter(t)); } }; TokenStream test = analyzer.tokenStream("test", "this test with test"); test.reset(); CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("this")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("test")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("with")); assertThat(test.incrementToken(), equalTo(false)); }
From source file:org.elasticsearch.analysis.hunspell.TestStemming.java
License:Apache License
public void test() throws Exception { LineNumberReader reader = new LineNumberReader(IOUtils.getDecodingReader( getClass().getResourceAsStream("/stemming-data/" + language + ".txt"), StandardCharsets.UTF_8)); dictionaryStream = getClass().getResourceAsStream("/" + language + "/" + language + ".dic"); affixStream = getClass().getResourceAsStream("/" + language + "/" + language + ".aff"); final Dictionary dictionary = new Dictionary(affixStream, dictionaryStream); Analyzer analyzer = new Analyzer() { @Override/*from w w w . j av a2s . c om*/ protected TokenStreamComponents createComponents(String field) { MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false); HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, false); return new TokenStreamComponents(tokenizer, filter); } }; String line = null; while ((line = reader.readLine()) != null) { int comment = line.indexOf('#'); if (comment >= 0) { line = line.substring(0, comment); } line = line.trim(); if (line.isEmpty()) { continue; } String elements[] = line.split("\\s+"); if (elements.length != 2) { throw new RuntimeException("Illegal number of elements in line: " + reader.getLineNumber()); } String input = elements[0]; String outputs[] = elements[1].split(","); compareStems(analyzer, input, outputs, reader.getLineNumber()); } analyzer.close(); reader.close(); }
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License:Apache License
@Test public void testSuggestTokenFilter() throws Exception { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenStream.setReader(new StringReader("mykeyword")); BytesRef payload = new BytesRef("Surface keyword|friggin payload|10"); TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter( new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() { @Override/*from w w w . j a va2 s . c o m*/ public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { return suggester.toFiniteStrings(stream); } })); assertTokenStreamContents(suggestTokenStream, new String[] { "mykeyword" }, null, null, new String[] { "Surface keyword|friggin payload|10" }, new int[] { 1 }, null, null); }
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License:Apache License
@Test public void testSuggestTokenFilterWithSynonym() throws Exception { Builder builder = new SynonymMap.Builder(true); builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true); Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader("mykeyword")); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); BytesRef payload = new BytesRef("Surface keyword|friggin payload|10"); TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter( new CompletionTokenStream(filter, payload, new CompletionTokenStream.ToFiniteStrings() { @Override//from ww w .ja v a2 s.c o m public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { return suggester.toFiniteStrings(stream); } })); assertTokenStreamContents(suggestTokenStream, new String[] { "mysynonym", "mykeyword" }, null, null, new String[] { "Surface keyword|friggin payload|10", "Surface keyword|friggin payload|10" }, new int[] { 2, 0 }, null, null); }
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License:Apache License
@Test public void testValidNumberOfExpansions() throws IOException { Builder builder = new SynonymMap.Builder(true); for (int i = 0; i < 256; i++) { builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true); }//from ww w . j a va 2 s .c o m StringBuilder valueBuilder = new StringBuilder(); for (int i = 0; i < 8; i++) { valueBuilder.append(i + 1); valueBuilder.append(" "); } MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader(valueBuilder.toString())); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); TokenStream suggestTokenStream = new CompletionTokenStream(filter, new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() { @Override public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { Set<IntsRef> finiteStrings = suggester.toFiniteStrings(stream); return finiteStrings; } }); suggestTokenStream.reset(); ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class); PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class); int maxPos = 0; int count = 0; while (suggestTokenStream.incrementToken()) { count++; assertNotNull(attr.getBytesRef()); assertTrue(attr.getBytesRef().length > 0); maxPos += posAttr.getPositionIncrement(); } suggestTokenStream.close(); assertEquals(count, 256); assertEquals(count, maxPos); }
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License:Apache License
@Test(expected = IllegalArgumentException.class) public void testInValidNumberOfExpansions() throws IOException { Builder builder = new SynonymMap.Builder(true); for (int i = 0; i < 256; i++) { builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true); }// ww w.ja v a2s . c o m StringBuilder valueBuilder = new StringBuilder(); for (int i = 0; i < 9; i++) { // 9 -> expands to 512 valueBuilder.append(i + 1); valueBuilder.append(" "); } MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader(valueBuilder.toString())); SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true); TokenStream suggestTokenStream = new CompletionTokenStream(filter, new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() { @Override public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { Set<IntsRef> finiteStrings = suggester.toFiniteStrings(stream); return finiteStrings; } }); suggestTokenStream.reset(); suggestTokenStream.incrementToken(); suggestTokenStream.close(); }
From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License:Apache License
@Test public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); tokenizer.setReader(new StringReader("mykeyword")); BytesRef payload = new BytesRef("Surface keyword|friggin payload|10"); TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter( new CompletionTokenStream(tokenizer, payload, new CompletionTokenStream.ToFiniteStrings() { @Override//from ww w.java 2 s . c o m public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { return suggester.toFiniteStrings(stream); } })); TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class); assertNotNull(termAtt.getBytesRef()); suggestTokenStream.reset(); while (suggestTokenStream.incrementToken()) { assertThat(termAtt.getBytesRef().utf8ToString(), equalTo("mykeyword")); } suggestTokenStream.end(); suggestTokenStream.close(); }
From source file:org.elasticsearch.test.MockKeywordPlugin.java
License:Apache License
@Override public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() { return singletonMap("keyword", (indexSettings, environment, name, settings) -> { class Factory implements TokenizerFactory { @Override/*from w w w . j av a 2 s .co m*/ public Tokenizer create() { return new MockTokenizer(MockTokenizer.KEYWORD, false); } } return new Factory(); }); }
From source file:org.owasp.dependencycheck.data.lucene.AlphaNumericFilterTest.java
License:Apache License
public AlphaNumericFilterTest() { analyzer = new Analyzer() { @Override//from ww w .jav a 2 s .c om protected Analyzer.TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new Analyzer.TokenStreamComponents(source, new AlphaNumericFilter(source)); } }; }
From source file:org.tallison.lucene.queryparser.spans.TestAdvancedAnalyzers.java
License:Apache License
    /**
     * One-time suite setup: builds the seven analyzers the tests share, then indexes
     * ten small documents into four fields (FIELD2/FIELD4 use pre-built token streams
     * from the uppercase-vowel analyzers). Order matters: the writer is configured
     * with {@code baseAnalyzer}, and {@code reader}/{@code searcher} are taken from
     * the writer before it is closed.
     */
    @BeforeClass
    public static void beforeClass() throws Exception {
        // Keyword + lowercase analyzer used for multi-term (wildcard/prefix) parsing.
        lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);
        // WordDelimiter filter configuration: generate, catenate and preserve everything.
        Map<String, String> attrs = new HashMap<>();
        attrs.put("generateWordParts", "1");
        attrs.put("generateNumberParts", "1");
        attrs.put("catenateWords", "1");
        attrs.put("catenateNumbers", "1");
        attrs.put("catenateAll", "1");
        attrs.put("splitOnCaseChange", "1");
        attrs.put("preserveOriginal", "1");
        complexAnalyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(TestAdvancedAnalyzers.class))
                .withTokenizer("whitespace").addTokenFilter("worddelimiter", attrs).addTokenFilter("kstem")
                .addTokenFilter("removeduplicates").build();
        // Synonym analyzer: simple tokenizer -> non-whitespace filter -> synonym filter.
        synAnalyzer = new Analyzer() {
            @Override
            public TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
                TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
                filter = new MockSynFilter(filter);
                return new TokenStreamComponents(tokenizer, filter);
            }

            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                return new MockNonWhitespaceFilter(new MockSynFilter(in));
            }
        };
        // Baseline analyzer: same chain as synAnalyzer but without synonym expansion.
        baseAnalyzer = new Analyzer() {
            @Override
            public TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
                TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            }

            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                return new MockNonWhitespaceFilter(new LowerCaseFilter(in));
            }
        };
        // Uppercase-vowel analyzer used to pre-tokenize FIELD2 below.
        ucVowelAnalyzer = new Analyzer() {
            @Override
            public TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
                TokenFilter filter = new MockUCVowelFilter(tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            }

            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                return new MockUCVowelFilter(new LowerCaseFilter(in));
            }
        };
        // Multi-term variant: KEYWORD tokenizer so the whole input is one token.
        ucVowelMTAnalyzer = new Analyzer() {
            @Override
            public TokenStream normalize(String fieldName, TokenStream in) {
                return new MockUCVowelFilter(new LowerCaseFilter(in));
            }

            @Override
            public TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, true);
                TokenFilter filter = new MockUCVowelFilter(tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            }
        };
        // Throwaway copy of ucVowelAnalyzer, used only to pre-tokenize FIELD4 below.
        Analyzer tmpUCVowelAnalyzer = new Analyzer() {
            @Override
            public TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
                TokenFilter filter = new MockUCVowelFilter(tokenizer);
                return new TokenStreamComponents(tokenizer, filter);
            }

            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                return new MockUCVowelFilter(new LowerCaseFilter(in));
            }
        };
        directory = newDirectory();
        RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(baseAnalyzer)
                .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
        String[] docs = new String[] { "abc_def", "lmnop", "abc one", "abc two", "qrs one", "qrs two", "tuv one",
                "tuv two", "qrs tuv", "qrs_tuv" };
        for (int i = 0; i < docs.length; i++) {
            Document doc = new Document();
            // FIELD1/FIELD3 are analyzed normally; FIELD2/FIELD4 get explicit
            // pre-built token streams from the uppercase-vowel analyzers.
            doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
            TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
            tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
            doc.add(tf);
            doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));
            TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
            tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
            doc.add(tf4);
            writer.addDocument(doc);
        }
        // Snapshot the reader/searcher before closing the writer.
        reader = writer.getReader();
        searcher = newSearcher(reader);
        writer.close();
    }