List of usage examples for org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
public ASCIIFoldingFilter(TokenStream input)
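ASCIIFoldingFilter converts alphabetic, numeric, and symbolic Unicode characters that are not in the first 127 ASCII characters (the "Basic Latin" block) into their ASCII equivalents, where such equivalents exist. Before the per-file examples, here is a minimal self-contained sketch of the filter in isolation. It is not taken from any of the source files below, and it assumes a Lucene 5.x-or-later API in which Tokenizer constructors no longer take a Reader:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ASCIIFoldingDemo {
    public static void main(String[] args) throws IOException {
        // tokenize on whitespace, then fold accented characters to ASCII
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("Déjà vu at the café"));
        try (TokenStream stream = new ASCIIFoldingFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // Deja, vu, at, the, cafe
            }
            stream.end();
        }
    }
}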
From source file:ModifiedRomanianAnalyzer.java
License:Apache License
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link KeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    result = new ASCIIFoldingFilter(result);
    // if (!stemExclusionSet.isEmpty())
    //     result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new RomanianStemmer());
    return new TokenStreamComponents(source, result);
}
From source file:analyzers.FormalAnalyzer.java
License:Apache License
/**
 * Define how tokens are processed.
 *
 * @param fieldName required input
 * @param reader    reader for document
 */
@Override
protected Analyzer.TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(reader);
    TokenStream chain = tokenizer;
    if (!tokenOpts.disableAllFilters) {
        // the chain of token filters...
        chain = new StandardFilter(chain);
        // discard tokens based on their type attribute
        chain = new StandardTagFilter(chain, tokenOpts);
        // convert tokens to lowercase
        chain = new LowerCaseFilter(chain);
        // replace accented chars with non-accented ASCII equivalents
        chain = new ASCIIFoldingFilter(chain);
        // remove stop words (must come after lowercasing)
        chain = new StopFilter(chain, stopWordSet);
        // remove 's
        chain = new EnglishPossessiveFilter(Version.LATEST, chain);
        // spelling correction
        if (!spellingHashtable.isEmpty())
            chain = new SpellingCorrectionFilter(chain, spellingHashtable);
        if (!tokenOpts.disableStemming) {
            // Krovetz stemmer (smarter than the Porter stemmer)
            chain = new KStemFilter(chain);
        }
    }
    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.EnAnalyzer.java
License:Apache License
/**
 * @param reader
 * @return
 */
@Override
protected TokenStreamComponents createComponents(Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;
    //result = new SynonymFilter(result, synonyms, true);
    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.info("No stopwordsfile provided, no stopword removal");
    }
    //result = new LowerCaseFilter(Version.LUCENE_46, result);
    result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result);
    //result = new PorterStemFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    // ShingleFilter sf = new ShingleFilter(result, 2, 3);
    // sf.setFillerToken(null);
    // result = sf;
    TokenStreamComponents comp = new TokenStreamComponents(t, result);
    return comp;
}
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.NlAnalyzer.java
License:Apache License
@Override
protected TokenStreamComponents createComponents(Reader reader) {
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;
    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.info("No stopwordsfile provided, no stopword removal");
    }
    result = new SnowballFilter(result, new DutchStemmer());
    TokenStreamComponents comp = new TokenStreamComponents(t, result);
    return comp;
}
From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.EnAnalyzer.java
License:Apache License
/**
 * @param fieldName
 * @param reader
 * @return
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;
    //result = new SynonymFilter(result, synonyms, true);
    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.warn("No stopwordsfile provided, no stopword removal");
    }
    //result = new LowerCaseFilter(Version.LUCENE_46, result);
    result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result);
    //result = new PorterStemFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    ShingleFilter sf = new ShingleFilter(result, 2, 3);
    sf.setFillerToken(null);
    return new TokenStreamComponents(t, sf);
}
From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.lucene.analyzers.EnAnalyzer.java
License:Apache License
/**
 * @param fieldName
 * @param reader
 * @return
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;
    //result = new SynonymFilter(result, synonyms, true);
    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.info("No stopwordsfile provided, no stopword removal");
    }
    //result = new LowerCaseFilter(Version.LUCENE_46, result);
    result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result);
    //result = new PorterStemFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    // ShingleFilter sf = new ShingleFilter(result, 2, 3);
    // sf.setFillerToken(null);
    // result = sf;
    TokenStreamComponents comp = new TokenStreamComponents(t, result);
    return comp;
}
From source file:com.b2international.index.analyzer.ComponentTermAnalyzer.java
License:Apache License
private TokenFilter createFilterChain(final Tokenizer source) {
    return new ASCIIFoldingFilter(source);
}
From source file:com.nutrisystem.orange.java.text.EnglishTextAnalyzer.java
License:Apache License
/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link KeywordMarkerFilter} if a stem exclusion set is provided,
 *         and {@link PorterStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new ClassicTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // prior to this we get the classic behavior, StandardFilter does it for us
    result = new EnglishPossessiveFilter(matchVersion, result);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    result = new NumberFilter(result);
    result = new KStemFilter(result);
    result = new PorterStemFilter(result);
    //result = new ShingleFilter(result, 3);
    return new TokenStreamComponents(source, result);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.ASCIIEnglishAnalyzer.java
License:Open Source License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    // fold accented characters to their ASCII equivalents
    result = new ASCIIFoldingFilter(result);
    // remove trailing 's
    result = new EnglishPossessiveFilter(result);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    // lowercase, remove default English stop words, then Porter-stem
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.CharacterShingleAnalyzer.java
License:Open Source License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new CharacterShingleTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(result);
    // emit shingles (token n-grams) of up to three tokens
    result = new ShingleFilter(result, 3);
    // result = new WordDelimiterFilter(result, WordDelimiterFilter.DIGIT, null);
    return new TokenStreamComponents(source, result);
}
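The createComponents overrides above are invoked by the Analyzer base class rather than called directly; client code obtains the assembled filter chain through Analyzer.tokenStream. The following sketch shows that consuming side, where MyAnalyzer is a hypothetical stand-in for any of the analyzers listed above:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerDriver {
    public static void main(String[] args) throws IOException {
        // MyAnalyzer is hypothetical; substitute any Analyzer subclass above
        try (Analyzer analyzer = new MyAnalyzer();
             TokenStream stream = analyzer.tokenStream("body", "Crème brûlée recipes")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // folded, lowercased, stemmed terms
            }
            stream.end();
        }
    }
}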