Usage examples for the edu.stanford.nlp.process.PTBTokenizer.factory() method
public static TokenizerFactory<Word> factory()
From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java
License:Open Source License
/**
 * Constructor with minimum parameters. It only tokenizes the given String
 * without removing stopwords, name handles etc.
 *
 * @param config A Config object.
 * @param text   The text to be tokenized.
 */
public Tokenizer(Config config, String text) {
    this.config = config;
    // Tokenize with the Penn Treebank tokenizer and keep the raw string form of every token.
    TokenizerFactory<Word> factory = PTBTokenizer.factory();
    factory.getTokenizer(new StringReader(text))
            .tokenize()
            .forEach(word -> cleanTokens.add(word.toString()));
}
From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java
License:Open Source License
/** * Public constructor. It tokenizes a given String and separates hashtags, * name handles, URLs and stopwords and stores them into different lists. * @param config A Config object.//from w ww.j av a2 s.co m * @param text The text to be tokenized. * @param sw A StopWords handle. */ public Tokenizer(Config config, String text, StopWords sw) { TokenizerFactory<Word> tf = PTBTokenizer.factory(); List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize(); this.config = config; numberOfTokens = tokens.size(); tokens.stream().map((word) -> word.toString()).forEach((token) -> { if (isHashtag(token)) { hashtags.add(token); cleanTokensAndHashtags.add(token.replace("#", "")); //Remove '#' } else if (isNameHandle(token)) { nameHandles.add(token.replace("@", "")); //Remove '@' character } else if (isURL(token)) { urls.add(token); } else if (sw.isStopWord(token)) { //Common stopwords stopWords.add(token); } else if (isCommonSymbol(token)) { //Common symbolsAndNonPrintableChars not caught before symbolsAndNonPrintableChars.add(token); } else if (sw .isNonPrintableCharacter("\\u" + Integer.toHexString(token.toCharArray()[0]).substring(1))) { //Non printable characters symbolsAndNonPrintableChars.add(token); } else { cleanTokens.add(token); cleanTokensAndHashtags.add(token); } }); }