Usage examples for org.apache.lucene.analysis.en.EnglishPossessiveFilter
Constructor: public EnglishPossessiveFilter(TokenStream input)
From source file:com.hourglassapps.cpi_ii.stem.snowball.lucene.SnowballAnalyzer.java
License:Apache License
/** Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link SnowballFilter} */ @Override//from w w w . jav a 2s. c o m public TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, tokenizer); // remove the possessive 's for english stemmers if (matchVersion.onOrAfter(Version.LUCENE_3_1) && (name.equals("English") || name.equals("Porter") || name.equals("Lovins"))) result = new EnglishPossessiveFilter(result); // Use a special lowercase filter for turkish, the stemmer expects it. if (matchVersion.onOrAfter(Version.LUCENE_3_1) && name.equals("Turkish")) result = new TurkishLowerCaseFilter(result); else result = new LowerCaseFilter(matchVersion, result); if (stopSet != null) result = new StopFilter(matchVersion, result, stopSet); result = new SnowballFilter(result, name); return new TokenStreamComponents(tokenizer, result); }
From source file:com.mozilla.grouperfish.lucene.analysis.en.EnglishAnalyzer.java
License:Apache License
/** * Creates a/*from w w w . j a va2s. c om*/ * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link LowerCaseFilter}, * {@link StopFilter} , {@link KeywordMarkerFilter} if a stem * exclusion set is provided and {@link PorterStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for // us. if (matchVersion.onOrAfter(Version.LUCENE_31)) { result = new EnglishPossessiveFilter(result); } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (stem) { if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet); result = new PorterStemFilter(result); result = new StopFilter(matchVersion, result, stopwords); } return new TokenStreamComponents(source, result); }
From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
/**
 * Builds the n-gram English chain: {@link StandardTokenizer},
 * {@link StandardFilter}, {@link EnglishPossessiveFilter} (3.1+),
 * {@link LowerCaseFilter}, then a {@code ShingleAllStopFilter} configured
 * with the n-gram bounds and stopwords, optionally followed by
 * {@link KeywordMarkerFilter} and {@link PorterStemFilter} when stemming.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer src = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new StandardFilter(matchVersion, src);
    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
        chain = new EnglishPossessiveFilter(chain);
    }
    chain = new LowerCaseFilter(matchVersion, chain);
    // Shingle (word n-gram) generation with stopword handling built in.
    final ShingleAllStopFilter shingles =
            new ShingleAllStopFilter(chain, minNGram, maxNGram, stopwords);
    shingles.setOutputUnigrams(outputUnigrams);
    if (!outputUnigrams) {
        shingles.setOutputUnigramsIfNoShingles(false);
    }
    chain = shingles;
    if (stem) {
        if (!stemExclusionSet.isEmpty()) {
            chain = new KeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new PorterStemFilter(chain);
    }
    return new TokenStreamComponents(src, chain);
}
From source file:com.romeikat.datamessie.core.processing.service.stemming.text.EnglishAnalyzer.java
License:Open Source License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which tokenizes all * the text in the provided {@link Reader}. * * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from an * {@link StandardTokenizer} filtered with {@link StandardFilter}, * {@link EnglishPossessiveFilter}, {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and * {@link PorterStemFilter}./* w ww .j a v a 2s.c om*/ */ @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer source; source = new StandardTokenizer(); TokenStream result = new StandardFilter(source); // Remove terms that do not contain any alphabetic character result = new NumberFilter(result); // Remove possessives (trailing 's) result = new EnglishPossessiveFilter(result); // Converting to lower case is not necessary as this is done before stemming // result = new LowerCaseFilter(result); // Remove stopwords result = new StopFilter(result, stopwords); // Mark keywords if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } // Stem result = new SnowballFilter(result, new German2Stemmer()); // Alternatives to the SnowballFilter: // result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzer.java
License:Open Source License
/**
 * Builds the analysis chain; this configuration must match the one used when
 * the index was created.
 *
 * @param fieldName document field
 * @return token stream components: {@link StandardTokenizer} filtered by
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter}, and
 *         {@link PorterStemFilter}
 */
@SuppressWarnings("resource")
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer src = new StandardTokenizer();
    TokenStream chain = new StandardFilter(src);
    chain = new EnglishPossessiveFilter(chain);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, getStopwordSet());
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(src, chain);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java
License:Open Source License
/**
 * Builds an ASCII-folding English chain: {@link StandardTokenizer} filtered by
 * {@link StandardFilter}, {@link ASCIIFoldingFilter},
 * {@link EnglishPossessiveFilter}, {@link WordDelimiterFilter} (alpha parts),
 * {@link LowerCaseFilter}, the default English {@link StopFilter}, and
 * {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer src = new StandardTokenizer();
    TokenStream chain = new StandardFilter(src);
    // Fold accented characters to their ASCII equivalents.
    chain = new ASCIIFoldingFilter(chain);
    chain = new EnglishPossessiveFilter(chain);
    chain = new WordDelimiterFilter(chain, WordDelimiterFilter.ALPHA, null);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, EnglishAnalyzer.getDefaultStopSet());
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(src, chain);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.MinimalAnalyzer.java
License:Open Source License
/**
 * Builds a minimal chain: {@link StandardTokenizer} filtered by
 * {@link StandardFilter}, {@link ASCIIFoldingFilter}, {@link LowerCaseFilter},
 * {@link EnglishPossessiveFilter}, {@link StopFilter},
 * {@link WordDelimiterFilter} (alpha parts), and {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer src = new StandardTokenizer();
    TokenStream chain = new StandardFilter(src);
    chain = new ASCIIFoldingFilter(chain);
    chain = new LowerCaseFilter(chain);
    // Note: possessive stripping runs after lowercasing in this analyzer.
    chain = new EnglishPossessiveFilter(chain);
    chain = new StopFilter(chain, stopwords);
    chain = new WordDelimiterFilter(chain, WordDelimiterFilter.ALPHA, null);
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(src, chain);
}
From source file:org.apache.solr.analysis.EnglishPossessiveFilterFactory.java
License:Apache License
/** Wraps the given stream in an {@link EnglishPossessiveFilter}. */
public TokenStream create(TokenStream input) {
    return new EnglishPossessiveFilter(input);
}
From source file:org.codelibs.elasticsearch.index.analysis.SnowballAnalyzer.java
License:Apache License
/** Constructs a {StandardTokenizer} filtered by a {@link StandardFilter}, a {LowerCaseFilter}, a {StopFilter}, and a {SnowballFilter} */// w w w . ja v a 2s .c o m @Override public TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer = new StandardTokenizer(); TokenStream result = tokenizer; // remove the possessive 's for english stemmers if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) { result = new EnglishPossessiveFilter(result); } // Use a special lowercase filter for turkish, the stemmer expects it. if (name.equals("Turkish")) { result = new TurkishLowerCaseFilter(result); } else { result = new LowerCaseFilter(result); } if (stopSet != null) { result = new StopFilter(result, stopSet); } result = new SnowballFilter(result, name); return new TokenStreamComponents(tokenizer, result); }
From source file:org.elasticsearch.analysis.common.SnowballAnalyzer.java
License:Apache License
/** Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link SnowballFilter} */ @Override/*from w ww . j a v a 2s .c o m*/ public TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer = new StandardTokenizer(); TokenStream result = tokenizer; // remove the possessive 's for english stemmers if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) result = new EnglishPossessiveFilter(result); // Use a special lowercase filter for turkish, the stemmer expects it. if (name.equals("Turkish")) result = new TurkishLowerCaseFilter(result); else result = new LowerCaseFilter(result); if (stopSet != null) result = new StopFilter(result, stopSet); result = new SnowballFilter(result, name); return new TokenStreamComponents(tokenizer, result); }