List of usage examples for the org.apache.lucene.analysis.snowball.SnowballFilter constructor
public SnowballFilter(TokenStream in, String name)
From source file:ModifiedRomanianAnalyzer.java
License:Apache License
/** * Creates a/*from w w w . ja va 2 s .co m*/ * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} * , {@link KeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); result = new ASCIIFoldingFilter(result); // if(!stemExclusionSet.isEmpty()) // result = new KeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new RomanianStemmer()); return new TokenStreamComponents(source, result); }
From source file:RomanianAnalyzer.java
License:Apache License
/** * Creates a/*from w w w . j av a2 s . c o m*/ * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} * , {@link KeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new OldRomanianStemmer()); result = new ASCIIFoldingFilter(result); return new TokenStreamComponents(source, result); }
From source file:analysis.FtpFilePathAnalyzer.java
License:Apache License
/** * Constructs a {@link StandardTokenizer} filtered by a * {@link StandardFilter}, a {@link LowerCaseFilter} and a * {@link StopFilter}.// w w w.j a v a2s .com */ public TokenStream tokenStream(String fieldName, Reader reader) { CharFilter filter = new LowercaseCharFilter(reader); filter = new MappingCharFilter(RECOVERY_MAP, filter); StandardTokenizer tokenStream = new StandardTokenizer(Version.LUCENE_30, filter); tokenStream.setMaxTokenLength(maxTokenLength); TokenStream result = new StandardFilter(tokenStream); // result = new LowerCaseFilter(result); result = getStopFilter(result); result = new SnowballFilter(result, STEMMER); return result; }
From source file:analysis.FtpFilePathAnalyzer.java
License:Apache License
@SuppressWarnings("deprecation") public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(); setPreviousTokenStream(streams); CharFilter filter = new LowercaseCharFilter(reader); filter = new MappingCharFilter(RECOVERY_MAP, filter); streams.tokenStream = new StandardTokenizer(Version.LUCENE_30, filter); streams.filteredTokenStream = new StandardFilter(streams.tokenStream); // streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream); streams.filteredTokenStream = getStopFilter(streams.filteredTokenStream); streams.filteredTokenStream = new SnowballFilter(streams.filteredTokenStream, STEMMER); } else {/*from w w w. ja v a 2s .c o m*/ CharFilter filter = new LowercaseCharFilter(reader); filter = new MappingCharFilter(RECOVERY_MAP, filter); streams.tokenStream.reset(filter); } streams.tokenStream.setMaxTokenLength(maxTokenLength); streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym); return streams.filteredTokenStream; }
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.EnAnalyzer.java
License:Apache License
/** * * @param reader/*from w w w .jav a2s . c o m*/ * @return */ @Override protected TokenStreamComponents createComponents(Reader reader) { //reader = new HTMLStripCharFilter(reader); Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader); TokenStream result = t; //result = new SynonymFilter(result, synonyms, true); result = new StandardFilter(Config.LUCENE_VERSION, result); result = new LowerCaseFilter(Config.LUCENE_VERSION, result); result = new TrimFilter(Config.LUCENE_VERSION, result); result = new ASCIIFoldingFilter(result); if (stopwords != null) { result = new StopFilter(Config.LUCENE_VERSION, result, stopwords); } else { logger.info("No stopwordsfile provided, no stopword removal"); } //result = new LowerCaseFilter(Version.LUCENE_46, result); result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result); //result = new PorterStemFilter(result); result = new SnowballFilter(result, new EnglishStemmer()); // ShingleFilter sf = new ShingleFilter(result, 2, 3); // sf.setFillerToken(null); // result = sf; TokenStreamComponents comp = new TokenStreamComponents(t, result); return comp; }
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.NlAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(Reader reader) { Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader); TokenStream result = t;/* www .j a v a2s.c o m*/ result = new StandardFilter(Config.LUCENE_VERSION, result); result = new LowerCaseFilter(Config.LUCENE_VERSION, result); result = new TrimFilter(Config.LUCENE_VERSION, result); result = new ASCIIFoldingFilter(result); if (stopwords != null) { result = new StopFilter(Config.LUCENE_VERSION, result, stopwords); } else { logger.info("No stopwordsfile provided, no stopword removal"); } result = new SnowballFilter(result, new DutchStemmer()); TokenStreamComponents comp = new TokenStreamComponents(t, result); return comp; }
From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.EnAnalyzer.java
License:Apache License
/** * * @param fieldName/*ww w .j a v a2 s . c om*/ * @param reader * @return */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { //reader = new HTMLStripCharFilter(reader); Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader); TokenStream result = t; //result = new SynonymFilter(result, synonyms, true); result = new StandardFilter(Config.LUCENE_VERSION, result); result = new LowerCaseFilter(Config.LUCENE_VERSION, result); result = new TrimFilter(Config.LUCENE_VERSION, result); result = new ASCIIFoldingFilter(result); if (stopwords != null) { result = new StopFilter(Config.LUCENE_VERSION, result, stopwords); } else { logger.warn("No stopwordsfile provided, no stopword removal"); } //result = new LowerCaseFilter(Version.LUCENE_46, result); result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result); //result = new PorterStemFilter(result); result = new SnowballFilter(result, new EnglishStemmer()); ShingleFilter sf = new ShingleFilter(result, 2, 3); sf.setFillerToken(null); return new TokenStreamComponents(t, sf); }
From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.lucene.analyzers.EnAnalyzer.java
License:Apache License
/** * * @param fieldName/*from w w w . j a va 2s.c o m*/ * @param reader * @return */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { //reader = new HTMLStripCharFilter(reader); Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader); TokenStream result = t; //result = new SynonymFilter(result, synonyms, true); result = new StandardFilter(Config.LUCENE_VERSION, result); result = new LowerCaseFilter(Config.LUCENE_VERSION, result); result = new TrimFilter(Config.LUCENE_VERSION, result); result = new ASCIIFoldingFilter(result); if (stopwords != null) { result = new StopFilter(Config.LUCENE_VERSION, result, stopwords); } else { logger.info("No stopwordsfile provided, no stopword removal"); } //result = new LowerCaseFilter(Version.LUCENE_46, result); result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result); //result = new PorterStemFilter(result); result = new SnowballFilter(result, new EnglishStemmer()); // ShingleFilter sf = new ShingleFilter(result, 2, 3); // sf.setFillerToken(null); // result = sf; TokenStreamComponents comp = new TokenStreamComponents(t, result); return comp; }
From source file:com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java
License:Apache License
/**
 * Builds the per-field token stream: HSearch tokenizer, lower-casing, optional
 * synonym expansion (from the pipe-separated concept map), optional stopword
 * removal, optional ASCII folding and an optional English Snowball stemmer.
 *
 * @param field  name of the field being analyzed (not used by this chain)
 * @param reader source of the text to tokenize
 * @return the assembled filter chain
 * @throws IllegalStateException if building the synonym map fails
 */
@Override
public TokenStream tokenStream(String field, Reader reader) {
    TokenStream ts = new HSearchTokenizer(Version.LUCENE_36, reader);
    ts = new LowerCaseFilter(Version.LUCENE_36, ts);
    try {
        if (null != conceptWithPipeSeparatedSynonums) {
            // dedup=true: collapse duplicate synonym rules.
            SynonymMap.Builder builder = new SynonymMap.Builder(true);
            List<String> synonyms = new ArrayList<String>();
            for (String concept : conceptWithPipeSeparatedSynonums.keySet()) {
                synonyms.clear();
                LineReaderUtil.fastSplit(synonyms, conceptWithPipeSeparatedSynonums.get(concept),
                        this.conceptWordSeparator);
                for (String syn : synonyms) {
                    if (null == syn || syn.length() == 0)
                        continue; // skip empty fragments from the split
                    // Map each synonym to its canonical concept term.
                    builder.add(new CharsRef(syn), new CharsRef(concept), false);
                }
            }
            if (conceptWithPipeSeparatedSynonums.size() > 0) {
                SynonymMap smap = builder.build(); // may throw IOException
                if (null != smap)
                    ts = new SynonymFilter(ts, smap, true);
            }
        }
        if (isStopFilterEnabled && null != stopwords && stopwords.size() > 0) {
            ts = new StopFilter(Version.LUCENE_36, ts, stopwords);
        }
        if (isAccentFilterEnabled)
            ts = new ASCIIFoldingFilter(ts);
        if (isSnoballStemEnabled)
            ts = new SnowballFilter(ts, new EnglishStemmer());
        return ts;
    } catch (IOException ex) {
        // Fix: the original printed the stack trace and threw a bare
        // NullPointerException(ex.toString()), discarding the cause chain.
        // Wrap in an unchecked exception that preserves the cause instead.
        throw new IllegalStateException("Failed to build synonym map for analyzer chain", ex);
    }
}
From source file:com.duroty.lucene.analysis.AnalyzerISOLatin1.java
License:Open Source License
/** * DOCUMENT ME!/*from w w w. ja v a2s.co m*/ * * @param fieldName DOCUMENT ME! * @param reader DOCUMENT ME! * * @return DOCUMENT ME! */ public final TokenStream tokenStream(String fieldName, Reader reader) { // The token stream that will be returned. TokenStream result; // Builds the chain... /*result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result);*/ result = new RdLowerCaseTokenizer(reader); if (stopTable != null) { result = new StopFilter(result, stopTable); } else { } result = new ISOLatin1AccentFilter(result); result = new SnowballFilter(result, "English"); result = new SnowballFilter(result, "Spanish"); //result = new SnowballFilter(result, "French"); //result = new SnowballFilter(result, "Italian"); return result; }