Example usage for org.apache.lucene.analysis.en PorterStemFilter PorterStemFilter

List of usage examples for org.apache.lucene.analysis.en PorterStemFilter PorterStemFilter

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.en PorterStemFilter PorterStemFilter.

Prototype

public PorterStemFilter(TokenStream in) 

Source Link

Usage

From source file:EnglishAnalyzerConfigurable.java

License:Apache License

/**
 * Creates a {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from an {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
 *         {@link LowerCaseFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
 *         and the configured stemming filter ({@link PorterStemFilter} or KStem).
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    // Prior to this we get the classic behavior; StandardFilter does it for us.

    result = new EnglishPossessiveFilter(matchVersion, result);

    if (this.doLowerCase) {
        result = new LowerCaseFilter(matchVersion, result);
    }

    if (this.doStopwordRemoval) {
        result = new StopFilter(matchVersion, result, stopwords);
    }

    // Terms in the exclusion set are marked as keywords so the stemmers skip them.
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }

    if (this.stemmer == StemmerType.PORTER) {
        result = new PorterStemFilter(result);
    } else if (this.stemmer == StemmerType.KSTEM) {
        result = new KStemFilter(result);
    }

    return new TokenStreamComponents(source, result);
}

From source file:aos.lucene.analysis.positional.PositionalPorterStopAnalyzer.java

License:Apache License

/**
 * Builds a lower-cased, stop-filtered, Porter-stemmed token stream.
 * Position increments are enabled so gaps left by removed stop words
 * are reflected in token positions.
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    StopFilter stopped = new StopFilter(true, new LowerCaseTokenizer(reader), stopWords);
    stopped.setEnablePositionIncrements(true);
    return new PorterStemFilter(stopped);
}

From source file:br.pucminas.ri.jsearch.utils.PorterStemAnalyzer.java

License:Open Source License

/**
 * Builds the analysis chain for this analyzer:
 * lower-case tokenizer -> stop filter -> Porter stemmer.
 */
@Override
protected TokenStreamComponents createComponents(String string) {
    LowerCaseTokenizer tokenizer = new LowerCaseTokenizer();
    tokenizer.setReader(new StringReader(string));
    TokenStream chain = new StopFilter(tokenizer, stopWords);
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:cc.twittertools.index.TweetAnalyzer.java

License:Apache License

/**
 * Builds a whitespace-tokenized, entity-preserving lower-case chain,
 * optionally followed by a Porter stemmer when stemming is enabled.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final Tokenizer tokenizer = new WhitespaceTokenizer(matchVersion, reader);
    TokenStream chain = new LowerCaseEntityPreservingFilter(tokenizer);
    if (stemming) {
        // The Porter stemmer ignores tokens that are marked as keywords.
        chain = new PorterStemFilter(chain);
    }
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:ci6226.CustormizedAnalyzer.java

/**
 * Builds the chain: standard tokenizer -> lower-case -> Porter stemmer -> stop filter.
 * Note that stop-word removal happens after stemming here.
 */
@Override
protected CustormizedAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declaring the tokenizer with its concrete supertype avoids the cast below.
    Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_47, reader);
    TokenStream chain = new LowerCaseFilter(Version.LUCENE_47, tokenizer);
    chain = new PorterStemFilter(chain);
    chain = new StopFilter(Version.LUCENE_47, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}

From source file:ci6226.myAnalyzer.java

/**
 * Builds the chain: whitespace tokenizer -> lower-case -> Porter stemmer -> stop filter.
 */
@Override
protected myAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    // Declaring the tokenizer as Tokenizer avoids the downcast at the return.
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_47, reader);
    TokenStream chain = new LowerCaseFilter(Version.LUCENE_47, tokenizer);
    chain = new PorterStemFilter(chain);
    // NOTE(review): stop words are removed AFTER stemming, so the English stop
    // set is matched against stemmed tokens — confirm this is intentional.
    chain = new StopFilter(Version.LUCENE_47, chain, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:ci6226.StemmingAnalyzer.java

/**
 * Builds the chain: whitespace tokenizer -> lower-case -> Porter stemmer.
 */
@Override
protected StemmingAnalyzer.TokenStreamComponents createComponents(String arg0, Reader reader) {
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_47, reader);
    TokenStream chain = new PorterStemFilter(new LowerCaseFilter(Version.LUCENE_47, tokenizer));
    return new Analyzer.TokenStreamComponents(tokenizer, chain);
}

From source file:cn.tung.javacn.pinyin.SimpleChineseAnalyzer.java

License:Apache License

/**
 * Builds the chain: HMM Chinese tokenizer -> Porter stemmer, followed by a
 * stop filter only when stop words are configured.
 */
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new HMMChineseTokenizer(reader);
    TokenStream chain = new PorterStemFilter(tokenizer);
    if (!stopWords.isEmpty()) {
        chain = new StopFilter(chain, stopWords);
    }
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:com.mathworks.xzheng.analysis.positional.PositionalPorterStopAnalyzer.java

License:Apache License

/**
 * Creates a new {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * instance for this analyzer.
 *
 * @param fieldName the name of the fields content passed to the
 *                  {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} sink as a reader
 * @param reader    the reader passed to the {@link org.apache.lucene.analysis.Tokenizer} constructor
 * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} for this analyzer.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_46, reader);
    // Case-insensitive stop set built from the analyzer's configured stop words.
    CharArraySet stopSet = new CharArraySet(Version.LUCENE_46, stopWords, true);
    TokenStream chain = new StopFilter(Version.LUCENE_46, tokenizer, stopSet);
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}

From source file:com.nec.scg.senseRanking.CountTextSimilarity.java

/**
 * Computes a TF-IDF term vector for {@code str} using the given analyzer,
 * Porter-stemming each token before counting.
 *
 * @param str the text to analyze
 * @param a   the analyzer producing the base token stream
 * @return map from stemmed term to its TF-IDF weight; partial or empty on I/O failure
 */
public Map<String, Float> CountTF_IDF(String str, Analyzer a) {
    Map<String, Float> termVector = new TreeMap<String, Float>();

    // try-with-resources guarantees both streams are closed even when
    // reset()/incrementToken() throws — the original leaked them on exception.
    try (TokenStream stream = a.tokenStream("content", new StringReader(str));
            PorterStemFilter filter = new PorterStemFilter(stream)) {
        CharTermAttribute cta = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        int wordCount = 0;
        while (filter.incrementToken()) {
            String term = cta.toString();
            // Raw frequency count; create the entry on first sight.
            Float prev = termVector.get(term);
            termVector.put(term, prev == null ? 1f : prev + 1);
            wordCount++;
        }
        for (String term : termVector.keySet()) {
            // +1 smoothing keeps the IDF denominator non-zero when a term is unseen.
            int hits = searchIndexforIDF(term) + 1;
            float idf = (float) (Math.log(AllArticle * 1.0 / hits) + 1.0);
            float tf = termVector.get(term) / wordCount;
            // Overwriting the value of an existing key is safe during keySet iteration.
            termVector.put(term, tf * idf);
        }
        filter.end();
        stream.end();
    } catch (IOException e) {
        // Best-effort: report and return whatever was accumulated so far.
        e.printStackTrace();
    }
    return termVector;
}