List of usage examples for the org.apache.lucene.analysis.icu.segmentation.ICUTokenizer constructor
public ICUTokenizer(ICUTokenizerConfig config)
From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java
License:Apache License
/**
 * Builds the analysis chain for a field: the reader is tokenized by an
 * {@link ICUTokenizer} and the tokens are passed through an
 * {@link ICUFoldingFilter}.
 *
 * @param field  name of the field being analyzed (not used by this chain)
 * @param reader source of the text to analyze
 * @return the folding filter wrapped around the ICU tokenizer
 */
@Override
public TokenStream tokenStream(String field, Reader reader) {
    return new ICUFoldingFilter(new ICUTokenizer(reader));
}
From source file:edu.indiana.d2i.htrc.io.HTRCTermAnalyzer.java
License:Apache License
/**
 * Returns the analysis chain for a field. Nothing is cached here: a new
 * {@link ICUTokenizer} and {@link ICUFoldingFilter} are created on every call.
 *
 * @param fieldName name of the field being analyzed (not used by this chain)
 * @param reader    source of the text to analyze
 * @return the folding filter wrapped around the ICU tokenizer
 * @throws IOException declared by the overridden method; not thrown by this body directly
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    final ICUTokenizer icuTokens = new ICUTokenizer(reader);
    return new ICUFoldingFilter(icuTokens);
}
From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java
License:Apache License
@Override public TokenStream tokenStream(String fieldName, Reader reader) { // TokenStream stream = stdAnalyzer.tokenStream(fieldName, reader); // stream = new LowerCaseTokenizer(Version.LUCENE_31, reader); // stream = new PorterStemFilter(stream); // stream = new POSFilter(stream, new String[]{"NN.*"}); // return stream; // ICUTokenizer icut = new ICUTokenizer(reader); // TokenStream stream = new LowerCaseFilter(Version.LUCENE_31, icut); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); // stream = new StopFilter(Version.LUCENE_31, stream, // StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new PorterStemFilter(stream); // return new ICUFoldingFilter(stream); TokenStream stream = new ICUTokenizer(reader); stream = new DictionaryFilter(stream); stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true); stream = new PorterStemFilter(stream); stream = new LowerCaseFilter(Version.LUCENE_31, stream); return new ICUFoldingFilter(stream); // TokenStream stream = engAnalyzer.tokenStream(fieldName, reader); // stream = new LowerCaseFilter(Version.LUCENE_31, stream); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); //// stream = new StopFilter(Version.LUCENE_31, stream, //// StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new EnglishPossessiveFilter(stream); // stream = new PorterStemFilter(stream); // return stream; }
From source file:edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzer.java
License:Apache License
@Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { // TokenStream stream = stdAnalyzer.reusableTokenStream(fieldName, // reader);//from w ww. ja v a2 s .c om // stream = new LowerCaseTokenizer(Version.LUCENE_31, reader); // stream = new PorterStemFilter(stream); // stream = new POSFilter(stream, new String[]{"NN.*"}); // return stream; // ICUTokenizer icut = new ICUTokenizer(reader); // TokenStream stream = new LowerCaseFilter(Version.LUCENE_31, icut); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); // stream = new StopFilter(Version.LUCENE_31, stream, // StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new PorterStemFilter(stream); // return new ICUFoldingFilter(stream); TokenStream stream = new ICUTokenizer(reader); stream = new DictionaryFilter(stream); stream = new StopFilter(Version.LUCENE_31, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true); stream = new PorterStemFilter(stream); stream = new LowerCaseFilter(Version.LUCENE_31, stream); return new ICUFoldingFilter(stream); // TokenStream stream = engAnalyzer.tokenStream(fieldName, reader); // stream = new LowerCaseFilter(Version.LUCENE_31, stream); // stream = new RegexpFilter(stream, new String[]{"^[a-z]+$"}); //// stream = new StopFilter(Version.LUCENE_31, stream, //// StopAnalyzer.ENGLISH_STOP_WORDS_SET, false); // stream = new EnglishPossessiveFilter(stream); // stream = new PorterStemFilter(stream); // return stream; }
From source file:org.apache.solr.analysis.ICUTokenizerFactory.java
License:Apache License
/**
 * Creates a new {@link ICUTokenizer} that reads from the given input.
 *
 * @param input source of the text to tokenize
 * @return a freshly constructed ICU tokenizer over {@code input}
 */
@Override
public Tokenizer create(Reader input) {
    final Tokenizer icuTokenizer = new ICUTokenizer(input);
    return icuTokenizer;
}
From source file:org.voyanttools.trombone.lucene.analysis.LexicalAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String fieldName) { if (fieldName.equals(TokenType.lexical.name()) && parameters.getParameterValue("tokenization", "").equals("wordBoundaries")) { Tokenizer tokenizer = new LowerCaseTokenizer(); return new TokenStreamComponents(tokenizer); } else if (fieldName.equals(TokenType.lexical.name()) && parameters.getParameterValue("tokenization", "").equals("whitespace")) { Tokenizer tokenizer = new UnicodeWhitespaceTokenizer(); return new TokenStreamComponents(tokenizer); } else if (lang.startsWith("zh") && fieldName.equals(TokenType.lexical.name())) { // Chinese Tokenizer tokenizer = new HMMChineseTokenizer(); return new TokenStreamComponents(tokenizer, tokenizer); } else if (lang.equals("bo") && fieldName.equals(TokenType.lexical.name())) { // Tibetan Tokenizer tokenizer = new ICUTokenizer(new TromboneICUTokenizerConfig(true, true, lang)); TokenStream stream = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); } else if (lang.equals("grc") /* Ancient Greek */ || lang.equals("el") /* Modern Greek */) { Tokenizer tokenizer = new ICUTokenizer(); TokenStream stream = new GreekCustomFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); } else { // default case Tokenizer tokenizer = new ICUTokenizer(); TokenStream stream = new LowerCaseFilter(tokenizer); return new TokenStreamComponents(tokenizer, stream); }/*from w w w . j av a 2 s . c o m*/ }