List of usage examples for edu.stanford.nlp.tagger.maxent MaxentTagger tokenizeText
public static List<List<HasWord>> tokenizeText(Reader r, TokenizerFactory<? extends HasWord> tokenizerFactory)
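A minimal, self-contained sketch of calling tokenizeText directly, before the full example below. The sample sentence and the ptb3Escaping option are illustrative; import paths may differ slightly across CoreNLP versions.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TokenizeTextDemo {
    public static void main(String[] args) {
        // Any TokenizerFactory<? extends HasWord> works; PTBTokenizerFactory is used here as an assumption.
        TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();
        factory.setOptions("ptb3Escaping=false"); // keep raw token forms, as in the example below

        // tokenizeText splits the input into sentences, each returned as a List<HasWord>
        List<List<HasWord>> sentences =
                MaxentTagger.tokenizeText(new StringReader("This is a test. And another one."), factory);

        for (List<HasWord> sentence : sentences) {
            for (HasWord token : sentence) {
                System.out.println(token.word());
            }
        }
    }
}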
From source file:at.illecker.storm.commons.tokenizer.Tokenizer.java
License: Apache License
public static List<String> tokenize(String str, Type type) {
    // Step 1) Trim text
    str = str.trim();

    // Step 2) Replace Unicode symbols (\u0000 escapes)
    if (UnicodeUtils.containsUnicode(str)) {
        String replacedText = UnicodeUtils.replaceUnicodeSymbols(str);
        // LOG.info("Replaced Unicode symbols from '" + str + "' to '" + replacedText + "'");
        if ((LOGGING) && (replacedText.equals(str))) {
            LOG.warn("Unicode symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 3) Replace HTML symbols &#[0-9];
    if (HtmlUtils.containsHtml(str)) {
        String replacedText = HtmlUtils.replaceHtmlSymbols(str);
        // LOG.info("Replaced HTML symbols from '" + str + "' to '" + replacedText + "'");
        if ((LOGGING) && (replacedText.equals(str))) {
            LOG.warn("HTML symbols could not be replaced: '" + str + "'");
        }
        str = replacedText;
    }

    // Step 4) Tokenize
    List<String> tokenizedTokens = null;
    switch (type) {
    case REGEX_TOKENIZER:
        tokenizedTokens = new ArrayList<String>();
        Matcher m = RegexUtils.TOKENIZER_PATTERN.matcher(str);
        while (m.find()) {
            tokenizedTokens.add(m.group());
        }
        break;
    case ARK_TOKENIZER:
        tokenizedTokens = Twokenize.tokenize(str);
        break;
    case STANFORD_TOKENIZER:
        TokenizerFactory<Word> tokenizer = PTBTokenizerFactory.newTokenizerFactory();
        tokenizer.setOptions("ptb3Escaping=false");
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(str), tokenizer);
        // Flatten the per-sentence token lists into a single List<String>
        tokenizedTokens = new ArrayList<String>();
        for (List<HasWord> sentence : sentences) {
            for (HasWord word : sentence) {
                tokenizedTokens.add(word.word());
            }
        }
        break;
    default:
        break;
    }
    return tokenizedTokens;
}
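A hedged usage sketch of the method above, assuming Type is the nested enum of this Tokenizer class and that STANFORD_TOKENIZER is spelled as in the switch statement:

// Hypothetical call site; the input string is illustrative only.
List<String> tokens = Tokenizer.tokenize("I love #streaming :-)", Tokenizer.Type.STANFORD_TOKENIZER);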