Example usage for edu.stanford.nlp.process PTBTokenizer factory

List of usage examples for edu.stanford.nlp.process PTBTokenizer factory

Introduction

On this page you can find example usages of the factory() method of edu.stanford.nlp.process.PTBTokenizer.

Prototype

public static TokenizerFactory<Word> factory() 

Source Link

Document

This is a historical static factory method; the tokenizers it creates return Word tokens.

Usage

From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java

License:Open Source License

/**
 * Constructor with minimum parameters. It only tokenizes the given String
 * with a tokenizer obtained from {@code PTBTokenizer.factory()} and stores
 * the string form of every token in {@code cleanTokens}, without removing
 * stopwords, name handles etc.
 *
 * @param config A Config object.
 * @param text The text to be tokenized.
 */
public Tokenizer(Config config, String text) {
    this.config = config;
    TokenizerFactory<Word> tf = PTBTokenizer.factory();
    List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize();
    for (Word token : tokens) {
        // Every token is kept verbatim; no filtering happens in this constructor.
        cleanTokens.add(token.toString());
    }
}

From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java

License:Open Source License

/**
 * Public constructor. It tokenizes a given String and separates hashtags,
 * name handles, URLs and stopwords and stores them into different lists.
 * @param config A Config object.//from w  ww.j av a2 s.co  m
 * @param text The text to be tokenized.
 * @param sw A StopWords handle.
 */
public Tokenizer(Config config, String text, StopWords sw) {
    TokenizerFactory<Word> tf = PTBTokenizer.factory();

    List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize();
    this.config = config;
    numberOfTokens = tokens.size();
    tokens.stream().map((word) -> word.toString()).forEach((token) -> {

        if (isHashtag(token)) {
            hashtags.add(token);
            cleanTokensAndHashtags.add(token.replace("#", "")); //Remove '#'
        } else if (isNameHandle(token)) {
            nameHandles.add(token.replace("@", "")); //Remove '@' character
        } else if (isURL(token)) {
            urls.add(token);
        } else if (sw.isStopWord(token)) { //Common stopwords
            stopWords.add(token);
        } else if (isCommonSymbol(token)) { //Common symbolsAndNonPrintableChars not caught before
            symbolsAndNonPrintableChars.add(token);
        } else if (sw
                .isNonPrintableCharacter("\\u" + Integer.toHexString(token.toCharArray()[0]).substring(1))) { //Non printable characters
            symbolsAndNonPrintableChars.add(token);
        } else {
            cleanTokens.add(token);
            cleanTokensAndHashtags.add(token);
        }
    });
}