Usage examples for weka.core.stopwords.StopwordsHandler.isStopword
/** Signature of the method whose usage is shown below; presumably returns true when {@code word} is a stopword -- TODO confirm against the declaring interface, which is not visible here. */
public boolean isStopword(String word);
From source file: affective.core.Utils.java
License: Open Source License
/** * Tokenizes a String//from w w w . j a v a 2 s . c o m * @param content the content * @param toLowerCase true for lowercasing the content * @param standarizeUrlsUsers true for standarizing urls and users * @param reduceRepeatedLetters true for reduing repeated letters * @param tokenizer the tokenizer * @param stemmer the stemmer * @param stop the stopwords handler * @return a list of tokens */ static public List<String> tokenize(String content, boolean toLowerCase, boolean standarizeUrlsUsers, boolean reduceRepeatedLetters, Tokenizer tokenizer, Stemmer stemmer, StopwordsHandler stop) { if (toLowerCase) content = content.toLowerCase(); // if a letters appears two or more times it is replaced by only two // occurrences of it if (reduceRepeatedLetters) content = content.replaceAll("([a-z])\\1+", "$1$1"); List<String> tokens = new ArrayList<String>(); tokenizer.tokenize(content); for (; tokenizer.hasMoreElements();) { String token = tokenizer.nextElement(); if (!stop.isStopword(token)) { if (standarizeUrlsUsers) { // Replace URLs to a generic URL if (token.matches("http.*|ww\\..*|www\\..*")) { token = "http://www.url.com"; } // Replaces user mentions to a generic user else if (token.matches("@.*")) { token = "@user"; } } tokens.add(stemmer.stem(token)); } } return tokens; }