List of usage examples for edu.stanford.nlp.tagger.maxent MaxentTagger tokenizeText
public static List<List<HasWord>> tokenizeText(Reader r, TokenizerFactory<? extends HasWord> tokenizerFactory)
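A minimal, self-contained sketch of calling tokenizeText directly, before the full example below. The sample sentence and the ptb3Escaping option are illustrative; import paths may differ slightly across CoreNLP versions.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TokenizeTextDemo {
    public static void main(String[] args) {
        // Any TokenizerFactory<? extends HasWord> works; PTBTokenizerFactory is used here as an assumption.
        TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();
        factory.setOptions("ptb3Escaping=false"); // keep raw token forms, as in the example below

        // tokenizeText splits the input into sentences, each returned as a List<HasWord>
        List<List<HasWord>> sentences =
                MaxentTagger.tokenizeText(new StringReader("This is a test. And another one."), factory);

        for (List<HasWord> sentence : sentences) {
            for (HasWord token : sentence) {
                System.out.println(token.word());
            }
        }
    }
}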
From source file:at.illecker.storm.commons.tokenizer.Tokenizer.java
License: Apache License
public static List<String> tokenize(String str, Type type) {
    // Step 1) Trim text
    str = str.trim();

    // Step 2) Replace Unicode symbols (\u0000 escapes)
    if (UnicodeUtils.containsUnicode(str)) {
        String replacedText = UnicodeUtils.replaceUnicodeSymbols(str);
        // LOG.info("Replaced Unicode symbols from '" + str + "' to '" + replacedText + "'");
        if ((LOGGING) && (replacedText.equals(str))) {
            LOG.warn("Unicode symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 3) Replace HTML symbols &#[0-9];
    if (HtmlUtils.containsHtml(str)) {
        String replacedText = HtmlUtils.replaceHtmlSymbols(str);
        // LOG.info("Replaced HTML symbols from '" + str + "' to '" + replacedText + "'");
        if ((LOGGING) && (replacedText.equals(str))) {
            LOG.warn("HTML symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 4) Tokenize
    List<String> tokenizedTokens = null;
    switch (type) {
    case REGEX_TOKENIZER:
        tokenizedTokens = new ArrayList<String>();
        Matcher m = RegexUtils.TOKENIZER_PATTERN.matcher(str);
        while (m.find()) {
            tokenizedTokens.add(m.group());
        }
        break;
    case ARK_TOKENIZER:
        tokenizedTokens = Twokenize.tokenize(str);
        break;
    case STANFORD_TOKENIZER:
        TokenizerFactory<Word> tokenizer = PTBTokenizerFactory.newTokenizerFactory();
        tokenizer.setOptions("ptb3Escaping=false");
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(str), tokenizer);
        // Flatten the per-sentence token lists into a single List<String>
        tokenizedTokens = new ArrayList<String>();
        for (List<HasWord> sentence : sentences) {
            for (HasWord word : sentence) {
                tokenizedTokens.add(word.word());
            }
        }
        break;
    default:
        break;
    }
    return tokenizedTokens;
}
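A hedged usage sketch of the method above, assuming Type is the nested enum of this Tokenizer class and that STANFORD_TOKENIZER is spelled as in the switch statement:

// Hypothetical call site; the input string is illustrative only.
List<String> tokens = Tokenizer.tokenize("I love #streaming :-)", Tokenizer.Type.STANFORD_TOKENIZER);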