List of usage examples for org.apache.lucene.analysis.snowball.SnowballFilter (constructor)
public SnowballFilter(TokenStream in, String name)
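Two constructor forms appear in the examples below: the one above, which resolves the stemmer by name from org.tartarus.snowball.ext (e.g. "English" selects EnglishStemmer), and SnowballFilter(TokenStream, SnowballProgram), which takes a stemmer instance directly. A minimal sketch of both, assuming the Lucene 5.x-style analysis API used throughout this page (the surrounding analyzer is illustrative):

Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        // SnowballFilter expects lowercased input, so lowercase first
        TokenStream result = new LowerCaseFilter(source);
        // by name: resolves org.tartarus.snowball.ext.EnglishStemmer
        result = new SnowballFilter(result, "English");
        // equivalent, passing the stemmer instance directly:
        // result = new SnowballFilter(result, new EnglishStemmer());
        return new TokenStreamComponents(source, result);
    }
};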
From source file:org.elasticsearch.index.analysis.SnowballAnalyzer.java
License:Apache License
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link SnowballFilter}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer;
    if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) {
        tokenizer = new StandardTokenizer();
    } else {
        tokenizer = new StandardTokenizer40();
    }
    TokenStream result = tokenizer;
    // remove the possessive 's for English stemmers
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins"))
        result = new EnglishPossessiveFilter(result);
    // use a special lowercase filter for Turkish; the stemmer expects it
    if (name.equals("Turkish"))
        result = new TurkishLowerCaseFilter(result);
    else
        result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(result, stopSet);
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
From source file:org.elasticsearch.index.analysis.StemmerTokenFilterFactory.java
License:Apache License
@Override
public TokenStream create(TokenStream tokenStream) {
    if ("arabic".equalsIgnoreCase(language)) return new ArabicStemFilter(tokenStream);
    if ("armenian".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new ArmenianStemmer());
    if ("basque".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new BasqueStemmer());
    if ("brazilian".equalsIgnoreCase(language)) return new BrazilianStemFilter(tokenStream);
    if ("bulgarian".equalsIgnoreCase(language)) return new BulgarianStemFilter(tokenStream);
    if ("catalan".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new CatalanStemmer());
    if ("czech".equalsIgnoreCase(language)) return new CzechStemFilter(tokenStream);
    if ("danish".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new DanishStemmer());
    if ("dutch".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new DutchStemmer());
    if ("english".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new EnglishStemmer());
    if ("finnish".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new FinnishStemmer());
    if ("french".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new FrenchStemmer());
    if ("german".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new GermanStemmer());
    if ("german2".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new German2Stemmer());
    if ("hungarian".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new HungarianStemmer());
    if ("italian".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new ItalianStemmer());
    if ("kp".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new KpStemmer());
    if ("kstem".equalsIgnoreCase(language)) return new KStemFilter(tokenStream);
    if ("lovins".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new LovinsStemmer());
    if ("latvian".equalsIgnoreCase(language)) return new LatvianStemFilter(tokenStream);
    if ("norwegian".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new NorwegianStemmer());
    if ("minimal_norwegian".equalsIgnoreCase(language) || "minimalNorwegian".equals(language)) return new NorwegianMinimalStemFilter(tokenStream);
    if ("porter".equalsIgnoreCase(language)) return new PorterStemFilter(tokenStream);
    if ("porter2".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new PorterStemmer());
    if ("portuguese".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new PortugueseStemmer());
    if ("romanian".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new RomanianStemmer());
    if ("russian".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new RussianStemmer());
    if ("spanish".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new SpanishStemmer());
    if ("swedish".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new SwedishStemmer());
    if ("turkish".equalsIgnoreCase(language)) return new SnowballFilter(tokenStream, new TurkishStemmer());
    if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) return new EnglishMinimalStemFilter(tokenStream);
    if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) return new EnglishPossessiveFilter(version, tokenStream);
    if ("light_finish".equalsIgnoreCase(language) || "lightFinish".equalsIgnoreCase(language)) return new FinnishLightStemFilter(tokenStream); // misspelling kept for backward compatibility
    if ("light_finnish".equalsIgnoreCase(language) || "lightFinnish".equalsIgnoreCase(language)) return new FinnishLightStemFilter(tokenStream);
    if ("light_french".equalsIgnoreCase(language) || "lightFrench".equalsIgnoreCase(language)) return new FrenchLightStemFilter(tokenStream);
    if ("minimal_french".equalsIgnoreCase(language) || "minimalFrench".equalsIgnoreCase(language)) return new FrenchMinimalStemFilter(tokenStream);
    if ("light_german".equalsIgnoreCase(language) || "lightGerman".equalsIgnoreCase(language)) return new GermanLightStemFilter(tokenStream);
    if ("minimal_german".equalsIgnoreCase(language) || "minimalGerman".equalsIgnoreCase(language)) return new GermanMinimalStemFilter(tokenStream);
    if ("hindi".equalsIgnoreCase(language)) return new HindiStemFilter(tokenStream);
    if ("light_hungarian".equalsIgnoreCase(language) || "lightHungarian".equalsIgnoreCase(language)) return new HungarianLightStemFilter(tokenStream);
    if ("indonesian".equalsIgnoreCase(language)) return new IndonesianStemFilter(tokenStream);
    if ("light_italian".equalsIgnoreCase(language) || "lightItalian".equalsIgnoreCase(language)) return new ItalianLightStemFilter(tokenStream);
    if ("light_portuguese".equalsIgnoreCase(language) || "lightPortuguese".equalsIgnoreCase(language)) return new PortugueseLightStemFilter(tokenStream);
    if ("minimal_portuguese".equalsIgnoreCase(language) || "minimalPortuguese".equalsIgnoreCase(language)) return new PortugueseMinimalStemFilter(tokenStream);
    if ("portuguese".equalsIgnoreCase(language)) return new PortugueseStemFilter(tokenStream); // unreachable: "portuguese" already matched above
    if ("light_russian".equalsIgnoreCase(language) || "lightRussian".equalsIgnoreCase(language)) return new RussianLightStemFilter(tokenStream);
    if ("light_spanish".equalsIgnoreCase(language) || "lightSpanish".equalsIgnoreCase(language)) return new SpanishLightStemFilter(tokenStream);
    if ("light_swedish".equalsIgnoreCase(language) || "lightSwedish".equalsIgnoreCase(language)) return new SwedishLightStemFilter(tokenStream);
    if ("greek".equalsIgnoreCase(language)) return new GreekStemFilter(tokenStream);
    // fall back to resolving the language value as a Snowball stemmer name
    return new SnowballFilter(tokenStream, language);
}
From source file:org.lambda3.indra.core.IndraAnalyzer.java
License:Open Source License
private TokenStream getStemmerFilter(String lang, int times, TokenStream stream) {
    SnowballProgram stemmer = getStemmer(lang);
    if (stemmer != null && times > 0) {
        for (int i = 0; i < times; i++) {
            stream = new SnowballFilter(stream, stemmer);
        }
    }
    return stream;
}
From source file:org.lexevs.dao.index.indexer.LuceneLoaderCode.java
License:Open Source License
public static PerFieldAnalyzerWrapper getAnaylzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    // add a literal analyzer -- keep all special characters
    analyzerPerField.put(LITERAL_PROPERTY_VALUE_FIELD, literalAnalyzer);
    analyzerPerField.put(LITERAL_AND_REVERSE_PROPERTY_VALUE_FIELD, literalAnalyzer);

    // treat as string fields by analyzing with the KeywordAnalyzer
    analyzerPerField.put(UNIQUE_ID, new KeywordAnalyzer());
    analyzerPerField.put(ENTITY_TYPE, new KeywordAnalyzer());
    analyzerPerField.put("isPreferred", new KeywordAnalyzer());
    analyzerPerField.put(SQLTableConstants.TBLCOL_ENTITYCODENAMESPACE, new KeywordAnalyzer());

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, false);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(DOUBLE_METAPHONE_PROPERTY_VALUE_FIELD, temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(NORM_PROPERTY_VALUE_FIELD, temp);
        } catch (NoClassDefFoundError e) {
            //
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(STEMMING_PROPERTY_VALUE_FIELD, temp);
    }

    final CharArraySet dividerList = new CharArraySet(10, true);
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));

    Analyzer qualifierAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String arg0) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new LowerCaseFilter(source);
            Pattern pattern = Pattern.compile("\\-|\\;|\\(|\\)|\\{|\\}|\\[|\\]|\\<|\\>|\\||(\\<\\:\\>)");
            filter = new PatternReplaceFilter(filter, pattern, " ", true);
            return new TokenStreamComponents(source, filter);
        }
    };

    analyzerPerField.put("sources", sa);
    analyzerPerField.put("usageContexts", sa);
    analyzerPerField.put("qualifiers", qualifierAnalyzer);

    // no stop words, default character removal set
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), analyzerPerField);
    return analyzer;
}
From source file:org.lexevs.dao.index.metadata.BaseMetaDataLoader.java
License:Open Source License
public static Analyzer getMetadataAnalyzer() {
    Map<String, Analyzer> analyzerPerField = new HashMap<>();

    if (doubleMetaphoneEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new DoubleMetaphoneFilter(filter, 4, true);
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(doubleMetaphonePrefix_ + "propertyValue", temp);
    }

    if (normEnabled_) {
        try {
            Analyzer temp = new StandardAnalyzer(CharArraySet.EMPTY_SET);
            analyzerPerField.put(normPrefix_ + "propertyValue", temp);
        } catch (NoClassDefFoundError e) {
            // norm is not available
            normEnabled_ = false;
        }
    }

    if (stemmingEnabled_) {
        Analyzer temp = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
                source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
                TokenStream filter = new StandardFilter(source);
                filter = new LowerCaseFilter(filter);
                filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
                filter = new SnowballFilter(filter, "English");
                return new TokenStreamComponents(source, filter);
            }
        };
        analyzerPerField.put(stemmingPrefix_ + "propertyValue", temp);
    }

    // these fields just get simple analyzing
    List<String> dividerList = new ArrayList<String>();
    dividerList.add(STRING_TOKENIZER_TOKEN);
    Analyzer sa = new StandardAnalyzer(new CharArraySet(dividerList, true));
    analyzerPerField.put("parentContainers", sa);

    // no stop words, default character removal set
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), analyzerPerField);
    return analyzer;
}
From source file:org.lexevs.dao.indexer.lucene.analyzers.SnowballAnalyzerTest.java
License:Open Source License
@Test
public void testDontKeepOrigional() throws Exception {
    Analyzer temp = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final StandardTokenizer source = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
            TokenStream filter = new StandardFilter(source);
            filter = new LowerCaseFilter(filter);
            filter = new StopFilter(filter, StandardAnalyzer.STOP_WORDS_SET);
            filter = new SnowballFilter(filter, "English");
            return new TokenStreamComponents(source, filter);
        }
    };

    String input = "The trees have Leaves!";
    String[] output = { "tree", "have", "leav" };
    BaseTokenStreamTestCase.assertAnalyzesTo(temp, input, output);
}
From source file:org.meresco.lucene.analysis.MerescoDutchStemmingAnalyzer.java
License:Open Source License
@Override
public TokenStream post_analyzer(String fieldName, TokenStream tok) {
    if (stemmingFields != null && stemmingFields.indexOf(fieldName) == -1)
        return tok;
    tok = new KeywordRepeatFilter(tok);                // repeat every word as term and as keyword
    tok = new SnowballFilter(tok, new DutchStemmer()); // ignores keywords
    tok = new RemoveDuplicatesTokenFilter(tok);        // removes one if keyword and term are still the same
    return tok;
}
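The keyword-repeat idiom above indexes both the original token and its stem. A minimal self-contained sketch of the same chain (the surrounding analyzer is illustrative, not Meresco code):

Analyzer stemAndKeepOriginal = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream result = new LowerCaseFilter(source);
        result = new KeywordRepeatFilter(result);                // emit each token twice, one copy keyword-marked
        result = new SnowballFilter(result, new DutchStemmer()); // stems only the non-keyword copy
        result = new RemoveDuplicatesTokenFilter(result);        // drop the duplicate when stemming changed nothing
        return new TokenStreamComponents(source, result);
    }
};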
From source file:org.silverpeas.core.index.indexing.model.WAAnalyzer.java
License:Open Source License
/**
 * Returns a token stream built on top of the given reader.
 */
@Override
protected TokenStreamComponents createComponents(final String s) {
    final Tokenizer source = new StandardTokenizer();
    // remove 's and . from tokens
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    // remove some non-significant terms
    result = new StopFilter(result, FrenchAnalyzer.getDefaultStopSet());
    // remove [cdjlmnst-qu]' from tokens
    result = new ElisionFilter(result, FrenchAnalyzer.DEFAULT_ARTICLES);
    if (snowballUsed) {
        // Important! Strings given to the Snowball filter must still contain accents,
        // so accents are removed only after the stemmer has done its job
        // (ignoring singular/plural, male/female and conjugated forms)
        result = new SnowballFilter(result, stemmer);
    }
    // remove accents
    result = new ASCIIFoldingFilter(result);
    return new TokenStreamComponents(source, result);
}
From source file:org.silverpeas.search.indexEngine.model.WAAnalyzer.java
License:Open Source License
/**
 * Returns a token stream built on top of the given reader.
 *
 * @param reader the content to analyze
 * @return the token stream
 */
public TokenStream tokenStream(Reader reader) {
    TokenStream result = new SilverTokenizer(reader);
    // remove 's and . from tokens
    result = new StandardFilter(Version.LUCENE_36, result);
    result = new LowerCaseFilter(Version.LUCENE_36, result);
    // remove some non-significant terms, according to the language
    result = new StopFilter(Version.LUCENE_36, result, stopWords);
    // remove [cdjlmnst-qu]' from tokens
    result = new ElisionFilter(Version.LUCENE_36, result);
    if (snowballUsed) {
        // Important! Strings given to the Snowball filter must still contain accents,
        // so accents are removed only after the stemmer has done its job
        // (ignoring singular/plural, male/female and conjugated forms)
        result = new SnowballFilter(result, stemmer);
    }
    // remove accents
    result = new ASCIIFoldingFilter(result);
    return result;
}
From source file:org.splevo.vpm.analyzer.semantic.lucene.Stemming.java
License:Open Source License
/**
 * Wrap the current stream with the configured stemming option.
 *
 * @param stream   The token stream to wrap.
 * @param stemming The stemming option selected.
 * @return The wrapped stream, or the stream itself in case none is configured.
 */
public static TokenStream wrapStemmingFilter(TokenStream stream, Stemming stemming) {
    switch (stemming) {
    case SNOWBALL_PORTER:
        return new SnowballFilter(stream, new EnglishStemmer());
    case PORTER:
        return new PorterStemFilter(stream);
    case KSTEM:
        return new KStemFilter(stream);
    case SSTEMMER:
        return new EnglishMinimalStemFilter(stream);
    case PLING:
        return new PlingStemmingFilter(stream);
    case NONE:
    default:
        return stream;
    }
}
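A call site for this helper might look like the following (the upstream token chain is illustrative; the enum constants are those from the switch above):

TokenStream tokens = new LowerCaseFilter(new StandardTokenizer());
tokens = Stemming.wrapStemmingFilter(tokens, Stemming.SNOWBALL_PORTER);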