Usage examples for the edu.stanford.nlp.process.PTBTokenizer.factory() method
public static TokenizerFactory<Word> factory()
From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java
License:Open Source License
/**
 * Constructor with minimum parameters. It only tokenizes the given String
 * without removing stopwords, name handles etc.
 *
 * @param config A Config object.
 * @param text   The text to be tokenized.
 */
public Tokenizer(Config config, String text) {
    this.config = config;
    // Tokenize with the Penn Treebank tokenizer and keep the raw string form of every token.
    TokenizerFactory<Word> factory = PTBTokenizer.factory();
    factory.getTokenizer(new StringReader(text))
            .tokenize()
            .forEach(word -> cleanTokens.add(word.toString()));
}
From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java
License:Open Source License
/** * Public constructor. It tokenizes a given String and separates hashtags, * name handles, URLs and stopwords and stores them into different lists. * @param config A Config object.//from w ww.j av a2 s.co m * @param text The text to be tokenized. * @param sw A StopWords handle. */ public Tokenizer(Config config, String text, StopWords sw) { TokenizerFactory<Word> tf = PTBTokenizer.factory(); List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize(); this.config = config; numberOfTokens = tokens.size(); tokens.stream().map((word) -> word.toString()).forEach((token) -> { if (isHashtag(token)) { hashtags.add(token); cleanTokensAndHashtags.add(token.replace("#", "")); //Remove '#' } else if (isNameHandle(token)) { nameHandles.add(token.replace("@", "")); //Remove '@' character } else if (isURL(token)) { urls.add(token); } else if (sw.isStopWord(token)) { //Common stopwords stopWords.add(token); } else if (isCommonSymbol(token)) { //Common symbolsAndNonPrintableChars not caught before symbolsAndNonPrintableChars.add(token); } else if (sw .isNonPrintableCharacter("\\u" + Integer.toHexString(token.toCharArray()[0]).substring(1))) { //Non printable characters symbolsAndNonPrintableChars.add(token); } else { cleanTokens.add(token); cleanTokensAndHashtags.add(token); } }); }