List of usage examples for edu.stanford.nlp.ling TaggedWord toString
public String toString(String divider)
From source file: varaha.text.StanfordPOSTagger.java
License: Apache License
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; if (isFirst) { try {/* w ww . java 2 s . c o m*/ tagger = new MaxentTagger(_model); } catch (Exception e) { System.err.println("Exception loading language model: " + e.getMessage()); } isFirst = false; } // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); Object inThing = input.get(0); if (inThing instanceof String) { StringReader textInput = new StringReader((String) inThing); // Convert StringReader to String via StringBuilder //using string builder is more efficient than concating strings together. StringBuilder builder = new StringBuilder(); int charsRead = -1; char[] chars = new char[100]; do { charsRead = textInput.read(chars, 0, chars.length); //if we have valid chars, append them to end of string. if (charsRead > 0) { builder.append(chars, 0, charsRead); } } while (charsRead > 0); // Tagging with the Stanford tagger produces another string, format: word_TAG String stringReadFromReader = builder.toString(); String tagged = tagger.tagString(stringReadFromReader); StringReader taggedInput = new StringReader(tagged); //won't use tokenizer, as it splits also on ._., instead use plain white space regex //PTBTokenizer ptbt = new PTBTokenizer(taggedInput , new CoreLabelTokenFactory(), "invertible=true,untokenizable=allKeep"); // Now split based on '_' and build/return a bag of 2-field tuples Tuple termText = tupleFactory.newTuple(); String[] tokens = tagged.split("\\s+"); //for (CoreLabel label; ptbt.hasNext(); ) { for (String s : tokens) { //label = (CoreLabel)ptbt.next(); String word = s; //label.word(); String[] parts = word.split("_"); List<String> token = Arrays.asList(parts); termText = tupleFactory.newTuple(token); bagOfTokens.add(termText); } //bagOfTokens.add(termText); } else if (inThing instanceof DataBag) { Iterator<Tuple> itr = ((DataBag) inThing).iterator(); List<Word> sentence = null; while (itr.hasNext()) { Tuple t 
= itr.next(); if (t.get(0) != null) { Word word = new Word(t.get(0).toString()); sentence.add(word); } } ArrayList<TaggedWord> tagged_sentence = tagger.apply(sentence); for (TaggedWord tw : tagged_sentence) { ArrayList values = new ArrayList(); values.add(tw.word()); values.add(tw.toString("_")); Tuple t = tupleFactory.newTuple(values); bagOfTokens.add(t); } } else { throw new IOException(); } return bagOfTokens; }