Example usage for org.apache.lucene.analysis.snowball.SnowballFilter#SnowballFilter(TokenStream, String)

Introduction

This page collects usage examples for the org.apache.lucene.analysis.snowball.SnowballFilter constructor.

Prototype

public SnowballFilter(TokenStream in, String name) 

Documentation

Construct the named stemming filter.
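
Before the full examples, a minimal sketch of this constructor in isolation. The snippet is illustrative, not taken from the sources below, and assumes a Lucene 4.x classpath (package names differ slightly in 3.x). The name argument is the simple class name of a Snowball stemmer, e.g. "English" for org.tartarus.snowball.ext.EnglishStemmer:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SnowballByNameSketch {
    public static void main(String[] args) throws Exception {
        // Tokenize some text, then stem each token with the named Snowball stemmer.
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_46, new StringReader("running runners ran"));
        ts = new SnowballFilter(ts, "English");

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // stemmed terms, e.g. "run", "runner", "ran"
        }
        ts.end();
        ts.close();
    }
}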

Usage

From source file:ModifiedRomanianAnalyzer.java

License:Apache License

/**
 * Creates a
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 * 
 * @return A
 *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from an {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
 *         , {@link KeywordMarkerFilter} if a stem exclusion set is
 *         provided and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    result = new ASCIIFoldingFilter(result);
    //  if(!stemExclusionSet.isEmpty())
    //  result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new RomanianStemmer());
    return new TokenStreamComponents(source, result);
}
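
A hypothetical driver for an analyzer like the one above (the no-arg constructor and field name are assumptions; the API shown is Lucene 4.x):

// Assumes the imports from the sketch above plus org.apache.lucene.analysis.Analyzer.
Analyzer analyzer = new ModifiedRomanianAnalyzer();
TokenStream ts = analyzer.tokenStream("body", new StringReader("casele frumoase"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term.toString()); // lowercased, stop-filtered, ASCII-folded, stemmed terms
}
ts.end();
ts.close();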

From source file:RomanianAnalyzer.java

License:Apache License

/**
 * Creates a
 * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 * 
 * @return A
 *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 *         built from an {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
 *         , {@link KeywordMarkerFilter} if a stem exclusion set is
 *         provided and {@link SnowballFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
        result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new OldRomanianStemmer());

    result = new ASCIIFoldingFilter(result);
    return new TokenStreamComponents(source, result);
}

From source file:analysis.FtpFilePathAnalyzer.java

License:Apache License

/**
 * Constructs a {@link StandardTokenizer} filtered by a
 * {@link StandardFilter}, a {@link LowerCaseFilter} and a
 * {@link StopFilter}.
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    CharFilter filter = new LowercaseCharFilter(reader);
    filter = new MappingCharFilter(RECOVERY_MAP, filter);
    StandardTokenizer tokenStream = new StandardTokenizer(Version.LUCENE_30, filter);
    tokenStream.setMaxTokenLength(maxTokenLength);
    TokenStream result = new StandardFilter(tokenStream);
    //   result = new LowerCaseFilter(result);
    result = getStopFilter(result);
    result = new SnowballFilter(result, STEMMER);
    return result;
}
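
The tokenStream method above rebuilds the entire filter chain on every call. The same class also overrides reusableTokenStream (next example) to cache the chain per thread, which was the standard optimization before Lucene handled reuse internally.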

From source file:analysis.FtpFilePathAnalyzer.java

License:Apache License

@SuppressWarnings("deprecation")
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        CharFilter filter = new LowercaseCharFilter(reader);
        filter = new MappingCharFilter(RECOVERY_MAP, filter);
        streams.tokenStream = new StandardTokenizer(Version.LUCENE_30, filter);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        //       streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = getStopFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new SnowballFilter(streams.filteredTokenStream, STEMMER);
    } else {
        CharFilter filter = new LowercaseCharFilter(reader);
        filter = new MappingCharFilter(RECOVERY_MAP, filter);
        streams.tokenStream.reset(filter);
    }
    streams.tokenStream.setMaxTokenLength(maxTokenLength);

    streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);

    return streams.filteredTokenStream;
}
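
Note: the SavedStreams/getPreviousTokenStream pattern above is the Lucene 2.9/3.x idiom for per-thread token stream reuse. Later releases fold this reuse into Analyzer itself (via the TokenStreamComponents returned from createComponents), so reusableTokenStream is deprecated in 3.x and gone in 4.x.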

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.EnAnalyzer.java

License:Apache License

/**
 *
 * @param reader
 * @return
 */
@Override
protected TokenStreamComponents createComponents(Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;

    //result = new SynonymFilter(result, synonyms, true);
    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.info("No stopwordsfile provided, no stopword removal");
    }
    //result = new LowerCaseFilter(Version.LUCENE_46, result);
    result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result);
    //result = new PorterStemFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    //        ShingleFilter sf = new ShingleFilter(result, 2, 3);
    //        sf.setFillerToken(null);
    //        result = sf;
    TokenStreamComponents comp = new TokenStreamComponents(t, result);
    return comp;
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.lucene.analyzers.NlAnalyzer.java

License:Apache License

@Override
protected TokenStreamComponents createComponents(Reader reader) {
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;

    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.info("No stopwordsfile provided, no stopword removal");
    }

    result = new SnowballFilter(result, new DutchStemmer());

    TokenStreamComponents comp = new TokenStreamComponents(t, result);
    return comp;
}

From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.EnAnalyzer.java

License:Apache License

/**
 *
 * @param fieldName
 * @param reader
 * @return
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;

    //result = new SynonymFilter(result, synonyms, true);
    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.warn("No stopwordsfile provided, no stopword removal");
    }
    //result = new LowerCaseFilter(Version.LUCENE_46, result);
    result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result);
    //result = new PorterStemFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    ShingleFilter sf = new ShingleFilter(result, 2, 3);
    sf.setFillerToken(null);
    return new TokenStreamComponents(t, sf);

}
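
Unlike the previous EnAnalyzer, this variant wraps the stemmed stream in a ShingleFilter that emits 2- and 3-token shingles; setFillerToken(null) suppresses the filler token otherwise inserted at positions where stopwords were removed.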

From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.lucene.analyzers.EnAnalyzer.java

License:Apache License

/**
 *
 * @param fieldName
 * @param reader
 * @return
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    //reader = new HTMLStripCharFilter(reader);
    Tokenizer t = new StandardTokenizer(Config.LUCENE_VERSION, reader);
    TokenStream result = t;

    //result = new SynonymFilter(result, synonyms, true);
    result = new StandardFilter(Config.LUCENE_VERSION, result);
    result = new LowerCaseFilter(Config.LUCENE_VERSION, result);
    result = new TrimFilter(Config.LUCENE_VERSION, result);
    result = new ASCIIFoldingFilter(result);
    if (stopwords != null) {
        result = new StopFilter(Config.LUCENE_VERSION, result, stopwords);
    } else {
        logger.info("No stopwordsfile provided, no stopword removal");
    }
    //result = new LowerCaseFilter(Version.LUCENE_46, result);
    result = new EnglishPossessiveFilter(Config.LUCENE_VERSION, result);
    //result = new PorterStemFilter(result);
    result = new SnowballFilter(result, new EnglishStemmer());
    //        ShingleFilter sf = new ShingleFilter(result, 2, 3);
    //        sf.setFillerToken(null);
    //        result = sf;
    TokenStreamComponents comp = new TokenStreamComponents(t, result);
    return comp;
}

From source file:com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java

License:Apache License

@Override
public TokenStream tokenStream(String field, Reader reader) {

    TokenStream ts = new HSearchTokenizer(Version.LUCENE_36, reader);
    ts = new LowerCaseFilter(Version.LUCENE_36, ts);

    SynonymMap smap = null;
    try {
        if (null != conceptWithPipeSeparatedSynonums) {
            SynonymMap.Builder sb = new SynonymMap.Builder(true);
            List<String> tempList = new ArrayList<String>();

            for (String concept : conceptWithPipeSeparatedSynonums.keySet()) {
                tempList.clear();
                LineReaderUtil.fastSplit(tempList, conceptWithPipeSeparatedSynonums.get(concept),
                        this.conceptWordSeparator);
                for (String syn : tempList) {
                    int synLen = (null == syn) ? 0 : syn.length();
                    if (synLen == 0)
                        continue;
                    sb.add(new CharsRef(syn), new CharsRef(concept), false);
                }
            }
            if (conceptWithPipeSeparatedSynonums.size() > 0) {
                smap = sb.build();
                if (null != smap)
                    ts = new SynonymFilter(ts, smap, true);
            }
        }

        if (isStopFilterEnabled) {
            int stopwordsT = (null == stopwords) ? 0 : stopwords.size();
            if (stopwordsT > 0) {
                ts = new StopFilter(Version.LUCENE_36, ts, stopwords);
            }
        }

        if (isAccentFilterEnabled)
            ts = new ASCIIFoldingFilter(ts);
        if (isSnoballStemEnabled)
            ts = new SnowballFilter(ts, new EnglishStemmer());

        return ts;

    } catch (IOException ex) {
        ex.printStackTrace(System.err);
        throw new NullPointerException(ex.toString());
    }
}
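
Here SynonymMap.Builder(true) requests de-duplication of synonym rules, and the final boolean argument to SynonymFilter enables case-insensitive matching; both flags carry these meanings in Lucene 3.6's org.apache.lucene.analysis.synonym package.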

From source file:com.duroty.lucene.analysis.AnalyzerISOLatin1.java

License:Open Source License

/**
 * DOCUMENT ME!
 *
 * @param fieldName DOCUMENT ME!
 * @param reader DOCUMENT ME!
 *
 * @return DOCUMENT ME!
 */
public final TokenStream tokenStream(String fieldName, Reader reader) {
    // The token stream that will be returned.
    TokenStream result;

    // Builds the chain...
    /*result = new StandardTokenizer(reader);
    result = new StandardFilter(result);
    result = new LowerCaseFilter(result);*/
    result = new RdLowerCaseTokenizer(reader);

    if (stopTable != null) {
        result = new StopFilter(result, stopTable);
    } else {
    }

    result = new ISOLatin1AccentFilter(result);

    result = new SnowballFilter(result, "English");
    result = new SnowballFilter(result, "Spanish");

    //result = new SnowballFilter(result, "French");
    //result = new SnowballFilter(result, "Italian");
    return result;
}
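
Note the two chained name-based SnowballFilter calls: every token passes through the English stemmer and then the Spanish one. Double-stemming like this is an unusual, lossy choice aimed at mixed-language text; the single-stemmer chains in the earlier examples are the conventional setup.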