Example usage for weka.core.tokenizers Tokenizer hasMoreElements

List of usage examples for weka.core.tokenizers Tokenizer hasMoreElements

Introduction

On this page you can find example usage for the method weka.core.tokenizers.Tokenizer.hasMoreElements().

Prototype

@Override
public abstract boolean hasMoreElements();

Source Link

Document

Tests if this enumeration contains more elements.

Usage

From source file: Utils.java (class affective.core.Utils)

License:Open Source License

/**
 * Tokenizes a String/*from   w  ww.  j ava  2s .  c om*/
 * @param content the content
 * @param toLowerCase true for lowercasing the content
 * @param standarizeUrlsUsers true for standarizing urls and users
 * @param reduceRepeatedLetters true for reduing repeated letters
 * @param tokenizer the tokenizer
 * @param stemmer the stemmer
 * @param stop the stopwords handler
 * @return a list of tokens
 */
static public List<String> tokenize(String content, boolean toLowerCase, boolean standarizeUrlsUsers,
        boolean reduceRepeatedLetters, Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) {

    if (toLowerCase)
        content = content.toLowerCase();

    // if a letters appears two or more times it is replaced by only two
    // occurrences of it
    if (reduceRepeatedLetters)
        content = content.replaceAll("([a-z])\\1+", "$1$1");

    List<String> tokens = new ArrayList<String>();

    tokenizer.tokenize(content);
    for (; tokenizer.hasMoreElements();) {
        String token = tokenizer.nextElement();
        if (!stop.isStopword(token)) {

            if (standarizeUrlsUsers) {
                // Replace URLs to a generic URL
                if (token.matches("http.*|ww\\..*|www\\..*")) {
                    token = "http://www.url.com";
                }
                // Replaces user mentions to a generic user
                else if (token.matches("@.*")) {
                    token = "@user";
                }

            }

            tokens.add(stemmer.stem(token));
        }
    }

    return tokens;

}