Usage examples for org.apache.lucene.analysis.en.PorterStemFilter
Constructor: public PorterStemFilter(TokenStream in)
From source file:EnglishAnalyzerConfigurable.java
License:Apache License
/** * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * //from www. ja v a 2 s .c om * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} , * {@link SetKeywordMarkerFilter} if a stem exclusion set is provided * and {@link PorterStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); // prior to this we get the classic behavior, standardfilter does it for us. result = new EnglishPossessiveFilter(matchVersion, result); if (this.doLowerCase) result = new LowerCaseFilter(matchVersion, result); if (this.doStopwordRemoval) result = new StopFilter(matchVersion, result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); if (this.stemmer == StemmerType.PORTER) result = new PorterStemFilter(result); else if (this.stemmer == StemmerType.KSTEM) result = new KStemFilter(result); return new TokenStreamComponents(source, result); }
From source file:aos.lucene.analysis.positional.PositionalPorterStopAnalyzer.java
License:Apache License
/**
 * Produces a lower-cased, stop-filtered, Porter-stemmed token stream,
 * preserving position increments across removed stop words.
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(reader);
    StopFilter stops = new StopFilter(true, tokenizer, stopWords);
    // Keep gaps where stop words were dropped so downstream consumers
    // still see the original token positions.
    stops.setEnablePositionIncrements(true);
    return new PorterStemFilter(stops);
}
From source file:br.pucminas.ri.jsearch.utils.PorterStemAnalyzer.java
License:Open Source License
@Override protected TokenStreamComponents createComponents(String string) { Reader reader = new StringReader(string); LowerCaseTokenizer source = new LowerCaseTokenizer(); source.setReader(reader);// w w w . ja va 2 s . c om StopFilter filter = new StopFilter(source, stopWords); PorterStemFilter stem = new PorterStemFilter(filter); return new TokenStreamComponents(source, stem); }
From source file:cc.twittertools.index.TweetAnalyzer.java
License:Apache License
@Override protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) { Tokenizer source = new WhitespaceTokenizer(matchVersion, reader); TokenStream filter = new LowerCaseEntityPreservingFilter(source); if (stemming) { // Porter stemmer ignores words which are marked as keywords filter = new PorterStemFilter(filter); }/* w w w .j a v a 2 s. c o m*/ return new TokenStreamComponents(source, filter); }
From source file:ci6226.CustormizedAnalyzer.java
/**
 * Builds the analysis chain: standard tokenization, lower-casing,
 * stop-word removal, then Porter stemming.
 *
 * <p>Fix: the stop filter now runs BEFORE the stemmer. The original
 * chain stemmed first, so tokens such as "this" reached the stop filter
 * already altered to "thi" and were never matched by
 * {@code ENGLISH_STOP_WORDS_SET}; stop words leaked into the index.
 *
 * @param arg0   field name (unused)
 * @param reader source of the text to analyze
 * @return the assembled {@link TokenStreamComponents}
 */
@Override
protected CustormizedAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    TokenStream source = new StandardTokenizer(Version.LUCENE_47, reader);
    TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source);
    // Remove stop words on the un-stemmed, lower-cased tokens so the
    // stop set matches them exactly.
    filter = new StopFilter(Version.LUCENE_47, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    filter = new PorterStemFilter(filter);
    return new Analyzer.TokenStreamComponents((Tokenizer) source, filter);
}
From source file:ci6226.myAnalyzer.java
@Override protected myAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) { TokenStream source = new WhitespaceTokenizer(Version.LUCENE_47, reader); //TokenStream source = new LetterFilter(Version.LUCENE_47, reader); TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source); filter = new PorterStemFilter(filter); //TokenStream filter = new StopFilter(Version.LUCENE_47, source, StopAnalyzer.ENGLISH_STOP_WORDS_SET); //ilter = new StandardFilter(Version.LUCENE_47, source); //TokenStream filter = new LowerCaseFilter(Version.LUCENE_47, source); filter = new StopFilter(Version.LUCENE_47, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET); return new TokenStreamComponents((Tokenizer) source, filter); }
From source file:ci6226.StemmingAnalyzer.java
/**
 * Minimal stemming chain: whitespace tokenization, lower-casing, then
 * Porter stemming.
 */
@Override
protected StemmingAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_47, reader);
    TokenStream chain = new LowerCaseFilter(Version.LUCENE_47, tokenizer);
    chain = new PorterStemFilter(chain);
    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}
From source file:cn.tung.javacn.pinyin.SimpleChineseAnalyzer.java
License:Apache License
/**
 * Chinese analysis chain: HMM-based tokenization, Porter stemming, then
 * optional stop-word removal (only when a stop set was supplied).
 */
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new HMMChineseTokenizer(reader);
    TokenStream chain = new PorterStemFilter(tokenizer);
    if (!stopWords.isEmpty()) {
        chain = new StopFilter(chain, stopWords);
    }
    return new TokenStreamComponents(tokenizer, chain);
}
From source file:com.mathworks.xzheng.analysis.positional.PositionalPorterStopAnalyzer.java
License:Apache License
/** * Creates a new {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} instance for this analyzer. * * @param fieldName the name of the fields content passed to the * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} sink as a reader * @param reader the reader passed to the {@link org.apache.lucene.analysis.Tokenizer} constructor * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} for this analyzer. *//*from w ww . j a va2s.c o m*/ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer source = new LowerCaseTokenizer(Version.LUCENE_46, reader); TokenStream stopFilter = new StopFilter(Version.LUCENE_46, source, new CharArraySet(Version.LUCENE_46, stopWords, true)); //stopFilter.setEnablePositionIncrements(true); stopFilter = new PorterStemFilter(stopFilter); return new TokenStreamComponents(source, stopFilter); }
From source file:com.nec.scg.senseRanking.CountTextSimilarity.java
public Map<String, Float> CountTF_IDF(String str, Analyzer a) { Map<String, Float> termVector = new TreeMap<String, Float>(); try {/*w w w . j ava 2s . com*/ TokenStream stream = a.tokenStream("content", new StringReader(str)); PorterStemFilter filter = new PorterStemFilter(stream); CharTermAttribute cta = filter.addAttribute(CharTermAttribute.class); filter.reset(); String strcat = null; int wordCount = 0; while (filter.incrementToken()) { strcat = cta.toString(); // System.out.print("["+strcat+"]"); if (!termVector.containsKey(strcat)) { termVector.put(strcat, 1f); wordCount++; } else { termVector.put(strcat, termVector.get(strcat) + 1); wordCount++; } } for (String ter : termVector.keySet()) { int hits = searchIndexforIDF(ter) + 1; float idf = (float) (Math.log(AllArticle * 1.0 / hits) + 1.0); float tf = termVector.get(ter) / wordCount; termVector.put(ter, tf * idf); } filter.end(); stream.end(); filter.close(); stream.close(); } catch (IOException e) { e.printStackTrace(); } return termVector; }