List of usage examples for org.apache.lucene.analysis.standard.StandardTokenizer
Constructor: public StandardTokenizer()
From source file: MyStandardAnalyzer.java
License: Apache License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(maxTokenLength);
    TokenStream tok = new StandardFilter(src);
    // tok = new LowerCaseFilter(tok);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok) {
        @Override
        protected void setReader(final Reader reader) throws IOException {
            src.setMaxTokenLength(MyStandardAnalyzer.this.maxTokenLength);
            super.setReader(reader);
        }
    };
}
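All of the analyzers on this page are driven the same way once constructed. The following is a minimal sketch of consuming the token stream by hand; it assumes MyStandardAnalyzer has a no-argument constructor, and the field name "body" and the sample text are placeholders.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Minimal sketch: print each token the analyzer emits for a sample string.
// Assumes MyStandardAnalyzer has a no-argument constructor; "body" and the
// sample text are placeholders.
try (Analyzer analyzer = new MyStandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", "The Quick Brown Fox")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                  // required before the first incrementToken()
    while (ts.incrementToken()) {
        System.out.println(term.toString());
    }
    ts.end();                    // finalizes state such as the final offset
}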
From source file: com.github.msarhan.lucene.ArabicRootExtractorAnalyzer.java
License: Open Source License
/**
 * Creates the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
 *         {@link ArabicRootExtractorStemFilter} and {@link SetKeywordMarkerFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source;
    if (getVersion().onOrAfter(Version.LATEST)) {
        source = new StandardTokenizer();
    } else {
        source = new StandardTokenizer40();
    }
    TokenStream result = new LowerCaseFilter(source);
    if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {
        result = new DecimalDigitFilter(result);
    }
    // The order here is important: the stopword list is not normalized!
    result = new StopFilter(result, stopwords);
    result = new ArabicRootExtractorStemFilter(result);
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    return new TokenStreamComponents(source, result);
}
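The tokenizer choice above hinges on the version set on the analyzer: only Version.LATEST selects the current StandardTokenizer, anything older falls back to StandardTokenizer40 and, below 5.4, also skips the DecimalDigitFilter. A hedged sketch of pinning the older behavior, assuming the class has a no-argument constructor and a Lucene release that still ships the deprecated version constants:

import org.apache.lucene.util.Version;

// Sketch: Analyzer.setVersion(...) is what getVersion() reads in the code above.
// Assumes ArabicRootExtractorAnalyzer has a no-argument constructor.
ArabicRootExtractorAnalyzer analyzer = new ArabicRootExtractorAnalyzer();
analyzer.setVersion(Version.LUCENE_5_3_0); // selects StandardTokenizer40 and skips DecimalDigitFilter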
From source file: com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.NgramAnalyzer.java
License: Open Source License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new StandardTokenizer();
    ShingleFilter sf = new ShingleFilter(
            new StopFilter(new LowerCaseFilter(new StandardFilter(tokenizer)), STOP_WORDS_SET));
    sf.setMaxShingleSize(gap);
    // sf.setFillerToken("");
    // sf.setOutputUnigrams(false);
    return new TokenStreamComponents(tokenizer, sf);
}
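For reference, here is a standalone sketch (not from the source file) of what a shingle pipeline like this emits, with the stop filter omitted for clarity:

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Standalone sketch: shingles up to size 2 (the role played by 'gap' above).
Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("Quick Brown Fox"));
ShingleFilter sf = new ShingleFilter(new LowerCaseFilter(tokenizer));
sf.setMaxShingleSize(2);
CharTermAttribute term = sf.addAttribute(CharTermAttribute.class);
sf.reset();
while (sf.incrementToken()) {
    System.out.println(term);
}
sf.end();
sf.close();
// Prints unigrams plus bigrams, since outputUnigrams defaults to true:
//   quick
//   quick brown
//   brown
//   brown fox
//   fox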
From source file: com.romeikat.datamessie.core.base.util.TokenizerAnalyzer.java
License: Open Source License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    final TokenStream result = new StandardFilter(source);
    return new TokenStreamComponents(source, result);
}
From source file: com.romeikat.datamessie.core.processing.service.fulltext.query.FullTextIndexingAnalyzer.java
License: Open Source License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    // Convert to lower case
    result = new LowerCaseFilter(result);
    // Normalize German special characters
    result = new GermanNormalizationFilter(result);
    return new TokenStreamComponents(source, result);
}
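A standalone sketch (not from the source file) of the normalization step in isolation; the expected outputs follow the German2 snowball heuristics (ß to ss, umlauts folded) and are worth verifying against the Lucene version in use:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Standalone sketch: lowercase, then fold German special characters.
Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("Fußball Müller"));
TokenStream ts = new GermanNormalizationFilter(new LowerCaseFilter(tokenizer));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term); // expected: "fussball", then "muller"
}
ts.end();
ts.close();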
From source file: com.romeikat.datamessie.core.processing.service.stemming.text.EnglishAnalyzer.java
License: Open Source License
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes
 * all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a
 *         {@link StandardTokenizer} filtered with {@link StandardFilter}, {@link NumberFilter},
 *         {@link EnglishPossessiveFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided, and
 *         {@link SnowballFilter}
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    // Remove terms that do not contain any alphabetic character
    result = new NumberFilter(result);
    // Remove possessives (trailing 's)
    result = new EnglishPossessiveFilter(result);
    // Converting to lower case is not necessary as this is done before stemming
    // result = new LowerCaseFilter(result);
    // Remove stopwords
    result = new StopFilter(result, stopwords);
    // Mark keywords
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    // Stem. Note: the source really does pass a German2Stemmer here, apparently
    // carried over from the sibling GermanAnalyzer; an English Snowball stemmer
    // (or the PorterStemFilter below) would be the expected choice.
    result = new SnowballFilter(result, new German2Stemmer());
    // Alternatives to the SnowballFilter:
    // result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file: com.romeikat.datamessie.core.processing.service.stemming.text.GermanAnalyzer.java
License: Open Source License
/**
 * Creates the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to
 * tokenize all the text in the provided {@link Reader}.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    // Remove terms that do not contain any alphabetic character
    result = new NumberFilter(result);
    // Converting to lower case is not necessary as this is done before stemming
    // result = new LowerCaseFilter(result);
    // Remove stopwords
    result = new StopFilter(result, stopwords);
    // Mark keywords
    if (!exclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, exclusionSet);
    }
    // Normalize German special characters
    result = new KeywordAwareGermanNormalizationFilter(result);
    // Stem
    result = new SnowballFilter(result, new German2Stemmer());
    // Alternatives to the SnowballFilter:
    // result = new GermanStemFilter(result);
    // result = new GermanLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file: de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzer.java
License: Open Source License
/**
 * This configuration must match the configuration used for the index!
 *
 * @param fieldName Document field
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new EnglishPossessiveFilter(result);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
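To see what the final Porter step does, a standalone sketch (not from the source file); Porter is a pure suffix stripper, so outputs such as "easili" are expected and need not be dictionary words:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Standalone sketch: PorterStemFilter expects lowercased input, which the
// chain above guarantees by placing LowerCaseFilter before it.
Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("Running runs easily"));
TokenStream ts = new PorterStemFilter(new LowerCaseFilter(tokenizer));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term); // expected: "run", "run", "easili"
}
ts.end();
ts.close();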
From source file: de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzer.java
License: Open Source License
/**
 * This configuration must match the configuration used for the index!
 *
 * @param fieldName Document field
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ElisionFilter(result, this.elisions);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new FrenchLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}
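The ElisionFilter strips French contractions such as l' before stemming. A standalone sketch with a hypothetical article set standing in for this.elisions, which the source file does not show:

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ElisionFilter;

// Standalone sketch; the article set below is a hypothetical stand-in.
CharArraySet articles = new CharArraySet(Arrays.asList("l", "d"), true);
Tokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("l'avion"));
TokenStream ts = new ElisionFilter(tokenizer, articles);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term); // expected: "avion" (StandardTokenizer keeps l'avion as one token)
}
ts.end();
ts.close();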
From source file: de.unihildesheim.iw.lucene.analyzer.GermanAnalyzer.java
License: Open Source License
/**
 * This configuration must match the configuration used for the index!
 *
 * @param fieldName Document field
 * @return Token stream
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, getStopwordSet());
    result = new GermanNormalizationFilter(result);
    result = new GermanLightStemFilter(result);
    return new TokenStreamComponents(source, result);
}