Example usage for weka.core.stopwords StopwordsHandler isStopword

List of usage examples for weka.core.stopwords StopwordsHandler isStopword

Introduction

On this page you can find example usage for weka.core.stopwords StopwordsHandler isStopword.

Prototype

public boolean isStopword(String word);

Source Link

Document

Returns true if the given string is a stop word.

Usage

From source file: affective.core.Utils.java

License: Open Source License

/** Matches tokens treated as URLs: anything starting with "http", "ww." or "www.". */
private static final java.util.regex.Pattern URL_TOKEN_PATTERN =
        java.util.regex.Pattern.compile("http.*|ww\\..*|www\\..*");

/** Matches tokens treated as user mentions: anything starting with '@'. */
private static final java.util.regex.Pattern USER_MENTION_PATTERN =
        java.util.regex.Pattern.compile("@.*");

/** Matches a run of two or more identical lowercase ASCII letters. */
private static final java.util.regex.Pattern REPEATED_LETTER_PATTERN =
        java.util.regex.Pattern.compile("([a-z])\\1+");

/**
 * Tokenizes a String and returns the tokens that are not stopwords, after
 * optional normalization and stemming.
 *
 * <p>Processing order: the whole content is optionally lowercased, then runs of
 * repeated lowercase ASCII letters are optionally collapsed to two occurrences,
 * then the content is split by the tokenizer. Each token is checked against the
 * stopwords handler <em>before</em> URL/user standardization and stemming, so
 * the handler sees the raw token. Surviving tokens are optionally standardized
 * and finally passed through the stemmer.
 *
 * @param content the content
 * @param toLowerCase true for lowercasing the content (locale-independent)
 * @param standarizeUrlsUsers true for standardizing URLs to
 *        {@code http://www.url.com} and user mentions to {@code @user}
 * @param reduceRepeatedLetters true for reducing repeated letters; only
 *        lowercase ASCII letters are affected, so uppercase runs are left
 *        untouched unless {@code toLowerCase} is also true
 * @param tokenizer the tokenizer
 * @param stemmer the stemmer
 * @param stop the stopwords handler
 * @return a new list with the processed tokens, in tokenizer order
 */
public static List<String> tokenize(String content, boolean toLowerCase, boolean standarizeUrlsUsers,
        boolean reduceRepeatedLetters, Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) {

    if (toLowerCase) {
        // Locale.ROOT keeps the mapping stable regardless of the JVM default locale
        // (e.g. under a Turkish locale "I" would otherwise become a dotless "ı",
        // which the ASCII-only patterns below would no longer recognize).
        content = content.toLowerCase(java.util.Locale.ROOT);
    }

    // If a letter appears two or more times in a row, the run is replaced by
    // exactly two occurrences of it (e.g. "coooool" -> "cool").
    if (reduceRepeatedLetters) {
        content = REPEATED_LETTER_PATTERN.matcher(content).replaceAll("$1$1");
    }

    List<String> tokens = new ArrayList<String>();

    tokenizer.tokenize(content);
    while (tokenizer.hasMoreElements()) {
        String token = tokenizer.nextElement();

        // Stopword filtering is done on the raw token, before any rewriting.
        if (stop.isStopword(token)) {
            continue;
        }

        if (standarizeUrlsUsers) {
            if (URL_TOKEN_PATTERN.matcher(token).matches()) {
                // Replace URLs with a generic URL
                token = "http://www.url.com";
            } else if (USER_MENTION_PATTERN.matcher(token).matches()) {
                // Replace user mentions with a generic user
                token = "@user";
            }
        }

        tokens.add(stemmer.stem(token));
    }

    return tokens;

}