Example usage for edu.stanford.nlp.tagger.maxent MaxentTagger tokenizeText

Introduction

On this page you can find example usage of edu.stanford.nlp.tagger.maxent.MaxentTagger.tokenizeText.

Prototype

public static List<List<HasWord>> tokenizeText(Reader r, TokenizerFactory<? extends HasWord> tokenizerFactory) 

Document

Reads data from r, tokenizes it with the given tokenizer, and returns a List of sentences, where each sentence is a List of tokens that extend HasWord; these sentences can then be fed into tagSentence.
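
Below is a minimal sketch of combining tokenizeText with tagSentence. The model path is an assumption; point it at any tagger model shipped with the Stanford POS tagger distribution.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TokenizeTextExample {
    public static void main(String[] args) {
        // Assumed model path; replace with a tagger model available locally
        MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");
        TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();
        // Split the raw text into sentences of HasWord tokens
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(
                new StringReader("The quick brown fox jumps. It was fast."), factory);
        // Each tokenized sentence can be fed directly into tagSentence
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> tagged = tagger.tagSentence(sentence);
            System.out.println(tagged);
        }
    }
}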

Usage

From source file: at.illecker.storm.commons.tokenizer.Tokenizer.java

License: Apache License

public static List<String> tokenize(String str, Type type) {
    // Step 1) Trim text
    str = str.trim();

    // Step 2) Replace Unicode escape sequences (e.g., \\u0000)
    if (UnicodeUtils.containsUnicode(str)) {
        String replacedText = UnicodeUtils.replaceUnicodeSymbols(str);
        // LOG.info("Replaced Unicode symbols from '" + str + "' to '"
        // + replacedText + "'");
        if (LOGGING && replacedText.equals(str)) {
            LOG.warn("Unicode symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 3) Replace HTML symbols &#[0-9];
    if (HtmlUtils.containsHtml(str)) {
        String replacedText = HtmlUtils.replaceHtmlSymbols(str);
        // LOG.info("Replaced HTML symbols from '" + str + "' to '"
        // + replacedText + "'");
        if (LOGGING && replacedText.equals(str)) {
            LOG.warn("HTML symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 4) Tokenize
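    // tokenizedTokens stays null (and is returned as null) if no known type matches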
    List<String> tokenizedTokens = null;

    switch (type) {
    case REGEX_TOKENIZER:
        tokenizedTokens = new ArrayList<String>();
        Matcher m = RegexUtils.TOKENIZER_PATTERN.matcher(str);
        while (m.find()) {
            tokenizedTokens.add(m.group());
        }
        break;

    case ARK_TOKENIZER:
        tokenizedTokens = Twokenize.tokenize(str);
        break;

    case STANFORD_TOKENIZER:
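        // ptb3Escaping=false keeps raw tokens, e.g. "(" is not rewritten to -LRB-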
        TokenizerFactory<Word> tokenizer = PTBTokenizerFactory.newTokenizerFactory();
        tokenizer.setOptions("ptb3Escaping=false");
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(str), tokenizer);
        // Convert sentences to List<String>
        tokenizedTokens = new ArrayList<String>();
        for (List<HasWord> sentence : sentences) {
            for (HasWord word : sentence) {
                tokenizedTokens.add(word.word());
            }
        }
        break;

    default:
        break;
    }

    return tokenizedTokens;
}
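
A hypothetical driver for the tokenize method above. The import of Type as a nested enum of Tokenizer and the sample input string are assumptions for illustration only; the input contains a numeric HTML entity to exercise Step 3.

import java.util.List;

import at.illecker.storm.commons.tokenizer.Tokenizer;
import at.illecker.storm.commons.tokenizer.Tokenizer.Type;

public class TokenizerDemo {
    public static void main(String[] args) {
        // Invented sample input with a numeric HTML entity (&#38; is "&")
        String text = "Storm &#38; Stanford NLP work well together!";
        // Assumes Type exposes the constants used in the switch above
        List<String> tokens = Tokenizer.tokenize(text, Type.STANFORD_TOKENIZER);
        System.out.println(tokens);
    }
}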