Example usage for org.apache.lucene.analysis.icu ICUFoldingFilter ICUFoldingFilter

List of usage examples for org.apache.lucene.analysis.icu ICUFoldingFilter ICUFoldingFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.icu ICUFoldingFilter ICUFoldingFilter.

Prototype

public ICUFoldingFilter(TokenStream input) 

Source Link

Document

Create a new ICUFoldingFilter on the specified input.

Usage

From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java

License:Apache License

@Override
public TokenStream tokenStream(String field, Reader reader) {
    // Tokenize the input with ICU, then apply ICU folding to the token stream.
    return new ICUFoldingFilter(new ICUTokenizer(reader));
}

From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java

License:Apache License

@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // NOTE(review): despite the name, a fresh chain is built on every call —
    // nothing is cached or reused here; confirm that is intended.
    final TokenStream tokenized = new ICUTokenizer(reader);
    return new ICUFoldingFilter(tokenized);
}

From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java

License:Apache License

@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Analysis chain: ICU tokenization -> dictionary filtering -> lower-casing
    // -> stop-word removal -> Porter stemming -> ICU folding.
    //
    // Fix: LowerCaseFilter now runs BEFORE StopFilter and PorterStemFilter.
    // Lucene's PorterStemFilter documents that its input must already be
    // lower-cased, and StopAnalyzer.ENGLISH_STOP_WORDS_SET holds lower-case
    // terms, so the previous order (stop/stem first, lower-case last) missed
    // capitalized stop words and mis-stemmed capitalized tokens.
    // Dead commented-out experiments were removed.
    TokenStream stream = new ICUTokenizer(reader);
    stream = new DictionaryFilter(stream);
    stream = new LowerCaseFilter(Version.LUCENE_31, stream);
    stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    stream = new PorterStemFilter(stream);
    return new ICUFoldingFilter(stream);
}

From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java

License:Apache License

@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    // Analysis chain: ICU tokenization -> dictionary filtering -> lower-casing
    // -> stop-word removal -> Porter stemming -> ICU folding.
    //
    // Fix: LowerCaseFilter now runs BEFORE StopFilter and PorterStemFilter.
    // Lucene's PorterStemFilter documents that its input must already be
    // lower-cased, and StopAnalyzer.ENGLISH_STOP_WORDS_SET holds lower-case
    // terms, so the previous order (stop/stem first, lower-case last) missed
    // capitalized stop words and mis-stemmed capitalized tokens.
    // Dead commented-out experiments were removed.
    //
    // NOTE(review): no stream is actually reused here; each call builds a new
    // chain — confirm that is acceptable for this analyzer.
    TokenStream stream = new ICUTokenizer(reader);
    stream = new DictionaryFilter(stream);
    stream = new LowerCaseFilter(Version.LUCENE_31, stream);
    stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    stream = new PorterStemFilter(stream);
    return new ICUFoldingFilter(stream);
}

From source file:edu.ur.lucene.analysis.StandardWithACIIFoldingFilter.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    // Standard tokenization, then standard filtering, lower-casing, stop-word
    // removal, and finally ICU folding.
    final StandardTokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    tokenizer.setMaxTokenLength(maxTokenLength);
    tokenizer.setReplaceInvalidAcronym(replaceInvalidAcronym);

    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    chain = new LowerCaseFilter(matchVersion, chain);
    chain = new StopFilter(matchVersion, chain, stopwords);
    chain = new ICUFoldingFilter(chain);

    return new TokenStreamComponents(tokenizer, chain) {
        @Override
        protected boolean reset(final Reader reader) throws IOException {
            // Re-apply the enclosing analyzer's current max token length in case
            // it changed after this component was built.
            tokenizer.setMaxTokenLength(StandardWithACIIFoldingFilter.this.maxTokenLength);
            return super.reset(reader);
        }
    };
}

From source file:io.aos.elasticsearch.analysis.icu.IcuFoldingTokenFilterFactory.java

License:Apache License

@Override
public TokenStream create(TokenStream tokenStream) {
    // ICUFoldingFilter is implemented as an ICUNormalizer2Filter, but it lacks
    // a constructor that accepts a UnicodeSet filter — so when a filter is
    // configured we assemble the filtered normalizer ourselves.
    if (unicodeSetFilter == null) {
        return new ICUFoldingFilter(tokenStream);
    }

    // NOTE(review): the utr30 normalizer is rebuilt from the bundled resource
    // on every call — consider caching it if create() is invoked frequently.
    final Normalizer2 utr30 = Normalizer2.getInstance(
            ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
            "utr30", Normalizer2.Mode.COMPOSE);
    final UnicodeSet filterSet = new UnicodeSet(unicodeSetFilter);
    filterSet.freeze();
    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(
            tokenStream, new FilteredNormalizer2(utr30, filterSet));
}

From source file:org.apache.solr.analysis.ICUFoldingFilterFactory.java

License:Apache License

@Override
public TokenStream create(TokenStream input) {
    // Wrap the incoming stream with ICU folding.
    final TokenStream folded = new ICUFoldingFilter(input);
    return folded;
}

From source file:org.elasticsearch.indices.analysis.IcuIndicesAnalysis.java

License:Apache License

@Inject
public IcuIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Registers the pre-built ICU analysis components under well-known names
    // so indices can reference them without per-index configuration.

    // "icu_tokenizer": ICU-based tokenizer.
    indicesAnalysisService.tokenizerFactories().put("icu_tokenizer",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "icu_tokenizer";
                }

                @Override
                public Tokenizer create() {
                    return new ICUTokenizer();
                }
            }));

    // "icu_normalizer" (token filter): NFKC case-folded normalization
    // ("nfkc_cf", compose mode).
    indicesAnalysisService.tokenFilterFactories().put("icu_normalizer",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_normalizer";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream,
                            Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
                }
            }));

    // "icu_folding": ICU folding filter with its default (unfiltered) setup.
    indicesAnalysisService.tokenFilterFactories().put("icu_folding",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_folding";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new ICUFoldingFilter(tokenStream);
                }
            }));

    // "icu_collation": collation keys using the JVM's default-locale Collator.
    indicesAnalysisService.tokenFilterFactories().put("icu_collation",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_collation";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
                }
            }));

    // "icu_transform": transliteration filter; the "Null" transliterator is an
    // identity transform, serving as the pre-built default.
    indicesAnalysisService.tokenFilterFactories().put("icu_transform",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_transform";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new ICUTransformFilter(tokenStream,
                            Transliterator.getInstance("Null", Transliterator.FORWARD));
                }
            }));

    // "icu_normalizer" (char filter): character-level ICU normalization applied
    // before tokenization.
    indicesAnalysisService.charFilterFactories().put("icu_normalizer",
            new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
                @Override
                public String name() {
                    return "icu_normalizer";
                }

                @Override
                public Reader create(Reader reader) {
                    return new ICUNormalizer2CharFilter(reader);
                }
            }));
}

From source file:org.xbib.elasticsearch.index.analysis.icu.IcuFoldingTokenFilterFactory.java

License:Open Source License

@Override
public TokenStream create(TokenStream tokenStream) {
    // Without a configured UnicodeSet filter, plain ICU folding suffices.
    if (unicodeSetFilter == null) {
        return new ICUFoldingFilter(tokenStream);
    }

    // ICUFoldingFilter is implemented as an ICUNormalizer2Filter but offers no
    // constructor taking a UnicodeSet filter, so the filtered utr30 normalizer
    // is assembled here by hand.
    final Normalizer2 base = Normalizer2.getInstance(
            ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
            "utr30", Normalizer2.Mode.COMPOSE);
    final UnicodeSet set = new UnicodeSet(unicodeSetFilter);
    set.freeze();
    final Normalizer2 filtered = new FilteredNormalizer2(base, set);
    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered);
}