Example usage for org.apache.lucene.analysis MockTokenizer MockTokenizer

List of usage examples for org.apache.lucene.analysis MockTokenizer MockTokenizer

Introduction

On this page you can find example usage of org.apache.lucene.analysis MockTokenizer MockTokenizer.

Prototype

public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) 

Source Link

Usage

From source file:org.elasticsearch.analysis.common.UniqueTokenFilterTests.java

License:Apache License

/**
 * Verifies that {@code UniqueTokenFilter} drops duplicate tokens from a
 * whitespace-tokenized stream: "this test with test" must yield
 * "this", "test", "with", with the repeated "test" removed.
 *
 * @throws IOException if the token stream fails
 */
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };

    // try-with-resources releases the stream even when an assertion fails;
    // end() after exhaustion is part of the TokenStream consumer contract.
    try (TokenStream test = analyzer.tokenStream("test", "this test with test")) {
        test.reset();
        CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("this"));

        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("test"));

        assertThat(test.incrementToken(), equalTo(true));
        assertThat(termAttribute.toString(), equalTo("with"));

        // the duplicate "test" token must have been filtered out
        assertThat(test.incrementToken(), equalTo(false));
        test.end();
    }
    analyzer.close();
}

From source file:org.elasticsearch.analysis.hunspell.TestStemming.java

License:Apache License

/**
 * Data-driven stemming test: reads "input expected1,expected2,..." pairs from
 * a per-language resource file and checks each against a Hunspell-stemming
 * analyzer built from the language's .dic/.aff resources.
 *
 * @throws Exception on I/O failure or malformed test data
 */
public void test() throws Exception {
    dictionaryStream = getClass().getResourceAsStream("/" + language + "/" + language + ".dic");
    affixStream = getClass().getResourceAsStream("/" + language + "/" + language + ".aff");
    final Dictionary dictionary = new Dictionary(affixStream, dictionaryStream);
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String field) {
            // KEYWORD tokenizer: the whole input is a single token for stemming.
            MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
            HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, false);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    // try-with-resources / finally so the reader and analyzer are closed even
    // when a line is malformed or a comparison throws (the original leaked both).
    try (LineNumberReader reader = new LineNumberReader(IOUtils.getDecodingReader(
            getClass().getResourceAsStream("/stemming-data/" + language + ".txt"), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // strip trailing '#' comments, then skip blank lines
            int comment = line.indexOf('#');
            if (comment >= 0) {
                line = line.substring(0, comment);
            }
            line = line.trim();
            if (line.isEmpty()) {
                continue;
            }
            String elements[] = line.split("\\s+");
            if (elements.length != 2) {
                throw new RuntimeException("Illegal number of elements in line: " + reader.getLineNumber());
            }
            String input = elements[0];
            String outputs[] = elements[1].split(",");
            compareStems(analyzer, input, outputs, reader.getLineNumber());
        }
    } finally {
        analyzer.close();
    }
}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java

License:Apache License

/**
 * A single whitespace-tokenized keyword must come through the completion
 * token stream unchanged, carrying its surface payload.
 */
@Test
public void testSuggestTokenFilter() throws Exception {
    final String keyword = "mykeyword";
    final BytesRef surfacePayload = new BytesRef("Surface keyword|friggin payload|10");

    Tokenizer whitespaceTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    whitespaceTokenizer.setReader(new StringReader(keyword));

    CompletionTokenStream.ToFiniteStrings expander = new CompletionTokenStream.ToFiniteStrings() {
        @Override
        public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
            return suggester.toFiniteStrings(stream);
        }
    };
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(whitespaceTokenizer, surfacePayload, expander));

    assertTokenStreamContents(suggestTokenStream, new String[] { keyword }, null, null,
            new String[] { "Surface keyword|friggin payload|10" }, new int[] { 1 }, null, null);
}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java

License:Apache License

/**
 * With a synonym registered for the keyword, the completion stream must emit
 * both the synonym and the original token, each carrying the same payload.
 */
@Test
public void testSuggestTokenFilterWithSynonym() throws Exception {
    Builder synonyms = new SynonymMap.Builder(true);
    synonyms.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);

    Tokenizer whitespaceTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    whitespaceTokenizer.setReader(new StringReader("mykeyword"));
    SynonymFilter synonymFilter = new SynonymFilter(whitespaceTokenizer, synonyms.build(), true);

    BytesRef surfacePayload = new BytesRef("Surface keyword|friggin payload|10");
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(synonymFilter, surfacePayload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(stream);
                }
            }));

    // synonym comes first; second token is at the same position (increment 0)
    assertTokenStreamContents(suggestTokenStream, new String[] { "mysynonym", "mykeyword" }, null, null,
            new String[] { "Surface keyword|friggin payload|10", "Surface keyword|friggin payload|10" },
            new int[] { 2, 0 }, null, null);
}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java

License:Apache License

/**
 * Eight input tokens, each with exactly one synonym, expand to 2^8 = 256
 * finite strings — within the allowed limit. Verifies the emitted count and
 * that position increments are consistent with it.
 *
 * @throws IOException if the token stream fails
 */
@Test
public void testValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(stream);
                }
            });

    int maxPos = 0;
    int count = 0;
    // try/finally so the stream is closed even when an assertion fails;
    // end() after exhaustion is part of the TokenStream consumer contract
    // and was missing before close() in the original.
    try {
        suggestTokenStream.reset();
        ByteTermAttribute attr = suggestTokenStream.addAttribute(ByteTermAttribute.class);
        PositionIncrementAttribute posAttr = suggestTokenStream.addAttribute(PositionIncrementAttribute.class);
        while (suggestTokenStream.incrementToken()) {
            count++;
            assertNotNull(attr.getBytesRef());
            assertTrue(attr.getBytesRef().length > 0);
            maxPos += posAttr.getPositionIncrement();
        }
        suggestTokenStream.end();
    } finally {
        suggestTokenStream.close();
    }
    // assertEquals takes (expected, actual); the original had them swapped,
    // which produces misleading failure messages.
    assertEquals(256, count);
    assertEquals(count, maxPos);
}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java

License:Apache License

/**
 * Nine input tokens, each with one synonym, would expand to 2^9 = 512 finite
 * strings, exceeding the allowed maximum — the stream must reject this with
 * an {@code IllegalArgumentException}.
 *
 * @throws IOException if the token stream fails for another reason
 */
@Test(expected = IllegalArgumentException.class)
public void testInValidNumberOfExpansions() throws IOException {
    Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 9; i++) { // 9 -> expands to 512
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

    TokenStream suggestTokenStream = new CompletionTokenStream(filter,
            new BytesRef("Surface keyword|friggin payload|10"), new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(stream);
                }
            });

    // try/finally: the expected exception propagates from incrementToken(),
    // so the original's unconditional close() after it could never run.
    try {
        suggestTokenStream.reset();
        suggestTokenStream.incrementToken(); // expected to throw IllegalArgumentException
    } finally {
        suggestTokenStream.close();
    }
}

From source file:org.elasticsearch.search.suggest.CompletionTokenStreamTests.java

License:Apache License

/**
 * The completion stream must expose its term bytes through
 * {@code TermToBytesRefAttribute}, delegating to the wrapped input stream.
 */
@Test
public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
    final BytesRef surfacePayload = new BytesRef("Surface keyword|friggin payload|10");

    Tokenizer input = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    input.setReader(new StringReader("mykeyword"));

    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(input, surfacePayload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(stream);
                }
            }));

    // the attribute must already be present (with a BytesRef) before reset()
    TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(termAtt.getBytesRef());

    suggestTokenStream.reset();
    while (suggestTokenStream.incrementToken()) {
        assertThat(termAtt.getBytesRef().utf8ToString(), equalTo("mykeyword"));
    }
    suggestTokenStream.end();
    suggestTokenStream.close();
}

From source file:org.elasticsearch.test.MockKeywordPlugin.java

License:Apache License

/**
 * Registers a "keyword" tokenizer whose factory produces a fresh
 * keyword-mode {@code MockTokenizer} on every {@code create()} call.
 */
@Override
public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
    return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
        // local class: one factory instance per provider invocation
        class KeywordTokenizerFactory implements TokenizerFactory {
            @Override
            public Tokenizer create() {
                return new MockTokenizer(MockTokenizer.KEYWORD, false);
            }
        }
        return new KeywordTokenizerFactory();
    });
}

From source file:org.owasp.dependencycheck.data.lucene.AlphaNumericFilterTest.java

License:Apache License

/**
 * Builds the analyzer under test: a whitespace tokenizer feeding an
 * {@code AlphaNumericFilter}.
 */
public AlphaNumericFilterTest() {
    analyzer = new Analyzer() {
        @Override
        protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer whitespace = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new Analyzer.TokenStreamComponents(whitespace, new AlphaNumericFilter(whitespace));
        }
    };
}

From source file:org.tallison.lucene.queryparser.spans.TestAdvancedAnalyzers.java

License:Apache License

// One-time suite setup: builds the analyzers used by the query-parser tests,
// then indexes a small fixed corpus into a RandomIndexWriter-backed directory.
// NOTE(review): lcMultiTermAnalyzer, complexAnalyzer, synAnalyzer, baseAnalyzer,
// ucVowelAnalyzer, ucVowelMTAnalyzer, directory, reader and searcher appear to
// be static fields of the enclosing class — declarations are outside this view.
@BeforeClass
public static void beforeClass() throws Exception {
    lcMultiTermAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, true);

    // WordDelimiterFilter configuration: generate and catenate everything,
    // split on case changes, and keep the original token as well.
    Map<String, String> attrs = new HashMap<>();
    attrs.put("generateWordParts", "1");
    attrs.put("generateNumberParts", "1");
    attrs.put("catenateWords", "1");
    attrs.put("catenateNumbers", "1");
    attrs.put("catenateAll", "1");
    attrs.put("splitOnCaseChange", "1");
    attrs.put("preserveOriginal", "1");
    complexAnalyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(TestAdvancedAnalyzers.class))
            .withTokenizer("whitespace").addTokenFilter("worddelimiter", attrs).addTokenFilter("kstem")
            .addTokenFilter("removeduplicates").build();

    // Analyzer with synonym expansion at both index/query time and normalize time.
    synAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {

            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);

            filter = new MockSynFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new MockSynFilter(in));
        }

    };

    // Plain analyzer without synonyms; used as the index writer's analyzer below.
    baseAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockNonWhitespaceFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockNonWhitespaceFilter(new LowerCaseFilter(in));
        }

    };

    // Uppercases vowels via MockUCVowelFilter on a SIMPLE tokenizer.
    ucVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };

    // Same vowel filter but over a KEYWORD tokenizer (whole input = one token);
    // note normalize() is public here, unlike the analyzers above.
    ucVowelMTAnalyzer = new Analyzer() {
        @Override
        public TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };

    // Local duplicate of ucVowelAnalyzer, used only to build FIELD4's token
    // stream during indexing below.
    Analyzer tmpUCVowelAnalyzer = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            TokenFilter filter = new MockUCVowelFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }

        @Override
        protected TokenStream normalize(String fieldName, TokenStream in) {
            return new MockUCVowelFilter(new LowerCaseFilter(in));
        }
    };
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, newIndexWriterConfig(baseAnalyzer)
            .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)).setMergePolicy(newLogMergePolicy()));
    String[] docs = new String[] { "abc_def", "lmnop", "abc one", "abc two", "qrs one", "qrs two", "tuv one",
            "tuv two", "qrs tuv", "qrs_tuv" };
    // Index each doc into four fields: FIELD1/FIELD3 analyzed by baseAnalyzer
    // (the writer's analyzer), FIELD2/FIELD4 with explicitly attached
    // vowel-uppercased token streams.
    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newTextField(FIELD1, docs[i], Field.Store.YES));
        TextField tf = new TextField(FIELD2, docs[i], Field.Store.YES);
        tf.setTokenStream(ucVowelAnalyzer.tokenStream(FIELD2, docs[i]));
        doc.add(tf);
        doc.add(newTextField(FIELD3, docs[i], Field.Store.YES));

        TextField tf4 = new TextField(FIELD4, docs[i], Field.Store.YES);
        tf4.setTokenStream(tmpUCVowelAnalyzer.tokenStream(FIELD4, docs[i]));
        doc.add(tf4);
        writer.addDocument(doc);
    }
    reader = writer.getReader();
    searcher = newSearcher(reader);
    writer.close();
}