Example usage for org.apache.lucene.analysis.core KeywordTokenizer KeywordTokenizer

List of usage examples for org.apache.lucene.analysis.core KeywordTokenizer KeywordTokenizer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.core KeywordTokenizer KeywordTokenizer.

Prototype

public KeywordTokenizer() 

Source Link

Usage

From source file: at.itbh.bev.index.AddressLineExactMatchAnalyzer.java

License: Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // The whole field value becomes a single token, which is lower-cased and
    // then run through the project's non-alpha-character replace pattern
    // (replacement is the empty string, applied to all matches).
    final Tokenizer keywordSource = new KeywordTokenizer();
    TokenStream chain = new PatternReplaceFilter(
            new LowerCaseFilter(keywordSource),
            RegexPatternCollection.nonAlphaCharPattern, "", true);
    return new TokenStreamComponents(keywordSource, chain);
}

From source file: at.itbh.bev.index.HouseIdAnalyzer.java

License: Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Single-token pipeline: lower-case, then three pattern replacements
    // (remove via houseIdExactRemovePattern, substitute "/" via replacePattern,
    // collapse via uniquifyNonWordCharPattern/"$1"), finally expanded into
    // leading edge n-grams of length 1 through 4.
    final Tokenizer keyword = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(keyword);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    return new TokenStreamComponents(keyword, new EdgeNGramTokenFilter(chain, 1, 4));
}

From source file: at.itbh.bev.index.HouseIdExactMatchAnalyzer.java

License: Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Same normalization as the house-id pipeline, but without the trailing
    // edge-n-gram expansion: one lower-cased token with the exact-remove,
    // "/"-substitution, and "$1" uniquify replacements applied in order.
    final Tokenizer keyword = new KeywordTokenizer();
    TokenStream chain = new LowerCaseFilter(keyword);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.houseIdExactRemovePattern, "", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.replacePattern, "/", true);
    chain = new PatternReplaceFilter(chain, RegexPatternCollection.uniquifyNonWordCharPattern, "$1", true);
    return new TokenStreamComponents(keyword, chain);
}

From source file: at.itbh.bev.index.PostalCodeAnalyzer.java

License: Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Treat the input as one token, lower-case it, and emit leading edge
    // n-grams of length 3 and 4 (prefix matching on postal codes).
    final Tokenizer keyword = new KeywordTokenizer();
    TokenStream stream = new EdgeNGramTokenFilter(new LowerCaseFilter(keyword), 3, 4);
    return new TokenStreamComponents(keyword, stream);
}

From source file: at.itbh.bev.index.TextAnalyzer.java

License: Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Single-token pipeline: lower-case, apply the address-line stemming and
    // non-alpha replace patterns (both delete their matches), encode with the
    // Cologne phonetic algorithm (original token kept, inject=true), and
    // finally emit n-grams of length 2 through 6.
    final Tokenizer keyword = new KeywordTokenizer();
    TokenStream stream = new LowerCaseFilter(keyword);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.addressLineStemmingPattern, "", true);
    stream = new PatternReplaceFilter(stream, RegexPatternCollection.nonAlphaCharPattern, "", true);
    stream = new PhoneticFilter(stream, new ColognePhonetic(), true);
    return new TokenStreamComponents(keyword, new NGramTokenFilter(stream, 2, 6));
}

From source file: edu.illinois.cs.cogcomp.bigdata.lucene.WikiURLAnalyzer.java

License: Open Source License

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    // Whole-input token normalized in four steps: standard filtering, the
    // project's CharacterFilter, ASCII folding of accented characters, and
    // lower-casing.
    final Tokenizer keywordSource = new KeywordTokenizer();
    TokenStream normalized =
            new LowerCaseFilter(
                    new ASCIIFoldingFilter(
                            new CharacterFilter(
                                    new StandardFilter(keywordSource))));
    return new TokenStreamComponents(keywordSource, normalized);
}

From source file: org.apache.jena.query.text.analyzer.ConfigurableAnalyzer.java

License: Apache License

/**
 * Creates a fresh tokenizer instance for the given configured name.
 *
 * @param tokenizerName one of the four supported tokenizer simple names
 * @return a new tokenizer of the requested type
 * @throws TextIndexException if the name is not a supported tokenizer
 */
private Tokenizer getTokenizer(String tokenizerName) {
    switch (tokenizerName) {
    case "StandardTokenizer":
        return new StandardTokenizer();
    case "WhitespaceTokenizer":
        return new WhitespaceTokenizer();
    case "LetterTokenizer":
        return new LetterTokenizer();
    case "KeywordTokenizer":
        return new KeywordTokenizer();
    default:
        throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
    }
}

From source file: org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer.java

License: Apache License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // The entire field value becomes a single token, normalized to lower case.
    final KeywordTokenizer keyword = new KeywordTokenizer();
    return new TokenStreamComponents(keyword, new LowerCaseFilter(keyword));
}

From source file: org.elasticsearch.index.analysis.RSLPTokenFilterTests.java

License: Apache License

@Test
public void testRSLPRules() throws Exception {
    // Build an analysis service whose "myStemmer" filter is the br_rslp
    // (Brazilian Portuguese RSLP) stemmer under the current index version.
    Index index = new Index("test");
    Settings settings = Settings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", createTempDir()).put("index.analysis.filter.myStemmer.type", "br_rslp").build();
    AnalysisService analysisService = createAnalysisService(index, settings);

    TokenFilterFactory filterFactory = analysisService.tokenFilter("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();

    // buildWordList maps each input word to its expected stemmed form.
    Map<String, String> words = buildWordList();

    for (String word : words.keySet()) {
        tokenizer.setReader(new StringReader(word));
        // try-with-resources guarantees the stream is closed even when one of
        // the assertions fails (the original leaked it on assertion failure).
        try (TokenStream ts = filterFactory.create(tokenizer)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term.toString(), equalTo(words.get(word)));
        }
    }
}

From source file: org.elasticsearch.index.analysis.SimpleIcuCollationTokenFilterTests.java

License: Apache License

/**
 * Asserts that the two strings compare as indicated by {@code comparison}
 * after each is passed through the collation filter produced by the factory.
 */
private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison)
        throws IOException {
    assertCollation(collationStream(factory, string1), collationStream(factory, string2), comparison);
}

/** Wraps the text in a single-token keyword stream and applies the factory's filter. */
private TokenStream collationStream(TokenFilterFactory factory, String text) {
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(text));
    return factory.create(tokenizer);
}