Example usage for org.apache.lucene.analysis.icu.segmentation ICUTokenizer ICUTokenizer

List of usage examples for org.apache.lucene.analysis.icu.segmentation ICUTokenizer ICUTokenizer

Introduction

On this page you can find example usages of org.apache.lucene.analysis.icu.segmentation ICUTokenizer ICUTokenizer.

Prototype

public ICUTokenizer(ICUTokenizerConfig config) 

Source Link

Document

Construct a new ICUTokenizer that breaks text into words from the given Reader, using a tailored BreakIterator configuration.

Usage

From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java

License:Apache License

@Override
public TokenStream tokenStream(String field, Reader reader) {
    // Segment text with ICU word-boundary rules, then apply ICU folding
    // (case folding plus accent/diacritic normalization) in one chain.
    return new ICUFoldingFilter(new ICUTokenizer(reader));
}

From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java

License:Apache License

@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // NOTE(review): despite the name, a fresh tokenizer chain is built on
    // every call — no per-field stream reuse actually happens here.
    TokenStream tokenized = new ICUTokenizer(reader);
    return new ICUFoldingFilter(tokenized);
}

From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java

License:Apache License

@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Analysis chain: ICU word segmentation -> dictionary filter ->
    // lowercasing -> stop-word removal -> Porter stemming -> ICU folding.
    TokenStream stream = new ICUTokenizer(reader);
    stream = new DictionaryFilter(stream);
    // FIX: lowercase BEFORE stopping and stemming. PorterStemFilter's
    // documented contract expects already-lowercased input, and the
    // English stop-word set is lowercase; the original order stemmed
    // capitalized tokens incorrectly.
    stream = new LowerCaseFilter(Version.LUCENE_31, stream);
    stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    stream = new PorterStemFilter(stream);
    // ICU folding normalizes accents/diacritics on the final stream.
    return new ICUFoldingFilter(stream);
}

From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java

License:Apache License

@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // NOTE(review): despite the name, this builds a fresh chain on every
    // call — no per-field stream reuse actually happens here.
    // Analysis chain: ICU word segmentation -> dictionary filter ->
    // lowercasing -> stop-word removal -> Porter stemming -> ICU folding.
    TokenStream stream = new ICUTokenizer(reader);
    stream = new DictionaryFilter(stream);
    // FIX: lowercase BEFORE stopping and stemming. PorterStemFilter's
    // documented contract expects already-lowercased input, and the
    // English stop-word set is lowercase; the original order stemmed
    // capitalized tokens incorrectly.
    stream = new LowerCaseFilter(Version.LUCENE_31, stream);
    stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    stream = new PorterStemFilter(stream);
    // ICU folding normalizes accents/diacritics on the final stream.
    return new ICUFoldingFilter(stream);
}

From source file:org.apache.solr.analysis.ICUTokenizerFactory.java

License:Apache License

@Override
public Tokenizer create(Reader input) {
    // Factory hook: hand Solr an ICU word-boundary tokenizer over the input.
    Tokenizer icuTokenizer = new ICUTokenizer(input);
    return icuTokenizer;
}

From source file:org.voyanttools.trombone.lucene.analysis.LexicalAnalyzer.java

License:Open Source License

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Select a tokenizer chain based on the field, the requested
    // tokenization strategy, and the document language.
    final boolean lexicalField = fieldName.equals(TokenType.lexical.name());

    if (lexicalField && "wordBoundaries".equals(parameters.getParameterValue("tokenization", ""))) {
        // Letter-run tokenization with built-in lowercasing.
        return new TokenStreamComponents(new LowerCaseTokenizer());
    }
    if (lexicalField && "whitespace".equals(parameters.getParameterValue("tokenization", ""))) {
        // Split on Unicode whitespace only; no case folding.
        return new TokenStreamComponents(new UnicodeWhitespaceTokenizer());
    }
    if (lexicalField && lang.startsWith("zh")) {
        // Chinese: statistical (HMM-based) word segmentation.
        Tokenizer zhTokenizer = new HMMChineseTokenizer();
        return new TokenStreamComponents(zhTokenizer, zhTokenizer);
    }
    if (lexicalField && lang.equals("bo")) {
        // Tibetan: ICU tokenizer with a tailored break-iterator config.
        Tokenizer boTokenizer = new ICUTokenizer(new TromboneICUTokenizerConfig(true, true, lang));
        return new TokenStreamComponents(boTokenizer, new LowerCaseFilter(boTokenizer));
    }
    if (lang.equals("grc") || lang.equals("el")) {
        // Ancient ("grc") or Modern ("el") Greek: custom Greek filter.
        Tokenizer greekTokenizer = new ICUTokenizer();
        return new TokenStreamComponents(greekTokenizer, new GreekCustomFilter(greekTokenizer));
    }
    // Default: ICU word segmentation followed by lowercasing.
    Tokenizer defaultTokenizer = new ICUTokenizer();
    return new TokenStreamComponents(defaultTokenizer, new LowerCaseFilter(defaultTokenizer));
}