Example usage for weka.core.tokenizers Tokenizer tokenize

List of usage examples for weka.core.tokenizers Tokenizer tokenize

Introduction

On this page you can find example usage for weka.core.tokenizers Tokenizer tokenize.

Prototype

public abstract void tokenize(String s);

Source Link

Document

Sets the string to tokenize.

Usage

From source file: affective.core.Utils.java

License: Open Source License

/**
 * Tokenizes a String/* ww  w .  ja v a2 s  .co m*/
 * @param content the content
 * @param toLowerCase true for lowercasing the content
 * @param standarizeUrlsUsers true for standarizing urls and users
 * @param reduceRepeatedLetters true for reduing repeated letters
 * @param tokenizer the tokenizer
 * @param stemmer the stemmer
 * @param stop the stopwords handler
 * @return a list of tokens
 */
static public List<String> tokenize(String content, boolean toLowerCase, boolean standarizeUrlsUsers,
        boolean reduceRepeatedLetters, Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) {

    if (toLowerCase)
        content = content.toLowerCase();

    // if a letters appears two or more times it is replaced by only two
    // occurrences of it
    if (reduceRepeatedLetters)
        content = content.replaceAll("([a-z])\\1+", "$1$1");

    List<String> tokens = new ArrayList<String>();

    tokenizer.tokenize(content);
    for (; tokenizer.hasMoreElements();) {
        String token = tokenizer.nextElement();
        if (!stop.isStopword(token)) {

            if (standarizeUrlsUsers) {
                // Replace URLs to a generic URL
                if (token.matches("http.*|ww\\..*|www\\..*")) {
                    token = "http://www.url.com";
                }
                // Replaces user mentions to a generic user
                else if (token.matches("@.*")) {
                    token = "@user";
                }

            }

            tokens.add(stemmer.stem(token));
        }
    }

    return tokens;

}