List of usage examples for the org.apache.lucene.analysis.icu.ICUFoldingFilter constructor
public ICUFoldingFilter(TokenStream input)
From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java
License:Apache License
/**
 * Builds the analysis chain for a field: ICU tokenization followed by ICU
 * folding (case folding, accent removal, compatibility normalization).
 */
@Override
public TokenStream tokenStream(String field, Reader reader) {
    return new ICUFoldingFilter(new ICUTokenizer(reader));
}
From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java
License:Apache License
/**
 * Reusable variant of {@link #tokenStream}: ICU tokenizer wrapped in an ICU
 * folding filter. Note: a fresh chain is built on every call, so nothing is
 * actually reused here.
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    return new ICUFoldingFilter(new ICUTokenizer(reader));
}
From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java
License:Apache License
@Override public TokenStream tokenStream(String fieldName, Reader reader) { // TokenStream stream = stdAnalyzer.tokenStream(fieldName, reader); // stream = new LowerCaseTokenizer(Version.LUCENE_31, reader); // stream = new PorterStemFilter(stream); // stream = new POSFilter(stream, new String[]{"NN.*"}); // return stream; // ICUTokenizer icut = new ICUTokenizer(reader); // TokenStream stream = new LowerCaseFilter(Version.LUCENE_31, icut); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); // stream = new StopFilter(Version.LUCENE_31, stream, // StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new PorterStemFilter(stream); // return new ICUFoldingFilter(stream); TokenStream stream = new ICUTokenizer(reader); stream = new DictionaryFilter(stream); stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true); stream = new PorterStemFilter(stream); stream = new LowerCaseFilter(Version.LUCENE_31, stream); return new ICUFoldingFilter(stream); // TokenStream stream = engAnalyzer.tokenStream(fieldName, reader); // stream = new LowerCaseFilter(Version.LUCENE_31, stream); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); //// stream = new StopFilter(Version.LUCENE_31, stream, //// StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new EnglishPossessiveFilter(stream); // stream = new PorterStemFilter(stream); // return stream; }
From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java
License:Apache License
@Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { // TokenStream stream = stdAnalyzer.reusableTokenStream(fieldName, // reader);/*w w w.jav a 2 s . co m*/ // stream = new LowerCaseTokenizer(Version.LUCENE_31, reader); // stream = new PorterStemFilter(stream); // stream = new POSFilter(stream, new String[]{"NN.*"}); // return stream; // ICUTokenizer icut = new ICUTokenizer(reader); // TokenStream stream = new LowerCaseFilter(Version.LUCENE_31, icut); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); // stream = new StopFilter(Version.LUCENE_31, stream, // StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new PorterStemFilter(stream); // return new ICUFoldingFilter(stream); TokenStream stream = new ICUTokenizer(reader); stream = new DictionaryFilter(stream); stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true); stream = new PorterStemFilter(stream); stream = new LowerCaseFilter(Version.LUCENE_31, stream); return new ICUFoldingFilter(stream); // TokenStream stream = engAnalyzer.tokenStream(fieldName, reader); // stream = new LowerCaseFilter(Version.LUCENE_31, stream); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); //// stream = new StopFilter(Version.LUCENE_31, stream, //// StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new EnglishPossessiveFilter(stream); // stream = new PorterStemFilter(stream); // return stream; }
From source file:edu.ur.lucene.analysis.StandardWithACIIFoldingFilter.java
License:Apache License
/**
 * Builds the analysis components: a StandardTokenizer (with the configured
 * max token length and acronym handling) feeding StandardFilter,
 * lower-casing, stop-word removal, and finally ICU folding.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
    src.setMaxTokenLength(maxTokenLength);
    src.setReplaceInvalidAcronym(replaceInvalidAcronym);
    TokenStream tok = new StandardFilter(matchVersion, src);
    tok = new LowerCaseFilter(matchVersion, tok);
    tok = new StopFilter(matchVersion, tok, stopwords);
    tok = new ICUFoldingFilter(tok);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected boolean reset(final Reader reader) throws IOException {
            // Re-apply the (possibly updated) max token length before reuse.
            src.setMaxTokenLength(StandardWithACIIFoldingFilter.this.maxTokenLength);
            return super.reset(reader);
        }
    };
}
From source file:io.aos.elasticsearch.analysis.icu.IcuFoldingTokenFilterFactory.java
License:Apache License
@Override public TokenStream create(TokenStream tokenStream) { // The ICUFoldingFilter is in fact implemented as a ICUNormalizer2Filter. // ICUFoldingFilter lacks a constructor for adding filtering so we implemement it here if (unicodeSetFilter != null) { Normalizer2 base = Normalizer2.getInstance(ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), "utr30", Normalizer2.Mode.COMPOSE); UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter); unicodeSet.freeze();//from w ww . j av a 2 s .c o m Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet); return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered); } else { return new ICUFoldingFilter(tokenStream); } }
From source file:org.apache.solr.analysis.ICUFoldingFilterFactory.java
License:Apache License
/** Wraps the incoming stream in an ICU folding filter. */
@Override
public TokenStream create(TokenStream input) {
    TokenStream folded = new ICUFoldingFilter(input);
    return folded;
}
From source file:org.elasticsearch.indices.analysis.IcuIndicesAnalysis.java
License:Apache License
/**
 * Registers the pre-built ICU tokenizer, token filters (normalizer, folding,
 * collation, transform) and char filter with the indices analysis service so
 * they are available to every index without explicit configuration.
 */
@Inject
public IcuIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    indicesAnalysisService.tokenizerFactories().put("icu_tokenizer",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "icu_tokenizer";
                }

                @Override
                public Tokenizer create() {
                    return new ICUTokenizer();
                }
            }));

    indicesAnalysisService.tokenFilterFactories().put("icu_normalizer",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_normalizer";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    // NFKC case-folding normalization using ICU's bundled data.
                    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream,
                            Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
                }
            }));

    indicesAnalysisService.tokenFilterFactories().put("icu_folding",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_folding";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new ICUFoldingFilter(tokenStream);
                }
            }));

    indicesAnalysisService.tokenFilterFactories().put("icu_collation",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_collation";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    // NOTE(review): Collator.getInstance() uses the JVM's default
                    // locale — confirm this is intended; it makes index contents
                    // depend on the environment the node runs in.
                    return new ICUCollationKeyFilter(tokenStream, Collator.getInstance());
                }
            }));

    indicesAnalysisService.tokenFilterFactories().put("icu_transform",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "icu_transform";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    // "Null" transliterator: identity transform as the default.
                    return new ICUTransformFilter(tokenStream,
                            Transliterator.getInstance("Null", Transliterator.FORWARD));
                }
            }));

    indicesAnalysisService.charFilterFactories().put("icu_normalizer",
            new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() {
                @Override
                public String name() {
                    return "icu_normalizer";
                }

                @Override
                public Reader create(Reader reader) {
                    return new ICUNormalizer2CharFilter(reader);
                }
            }));
}
From source file:org.xbib.elasticsearch.index.analysis.icu.IcuFoldingTokenFilterFactory.java
License:Open Source License
@Override public TokenStream create(TokenStream tokenStream) { if (unicodeSetFilter != null) { // The ICUFoldingFilter is in fact implemented as a ICUNormalizer2Filter. // ICUFoldingFilter lacks a constructor for adding filtering so we implemement it here Normalizer2 base = Normalizer2.getInstance(ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"), "utr30", Normalizer2.Mode.COMPOSE); UnicodeSet unicodeSet = new UnicodeSet(unicodeSetFilter); unicodeSet.freeze();/*from w w w . ja v a 2 s . com*/ Normalizer2 filtered = new FilteredNormalizer2(base, unicodeSet); return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, filtered); } else { return new ICUFoldingFilter(tokenStream); } }