Example usage for edu.stanford.nlp.ling TaggedWord toString

List of usage examples for edu.stanford.nlp.ling TaggedWord toString

Introduction

In this page you can find the example usage for edu.stanford.nlp.ling TaggedWord toString.

Prototype

public String toString(String divider) 

Source Link

Usage

From source file: varaha.text.StanfordPOSTagger.java

License: Apache License

/**
 * Pig UDF entry point: POS-tags the first field of the input tuple.
 *
 * <p>Accepts either:
 * <ul>
 *   <li>a {@code String} — the whole text is tagged with
 *       {@code MaxentTagger.tagString}, producing whitespace-separated
 *       {@code word_TAG} tokens; each token is split on {@code '_'} into a
 *       2-field tuple {@code (word, tag)};</li>
 *   <li>a {@code DataBag} — each inner tuple's first field is treated as one
 *       word of a sentence; the sentence is tagged with
 *       {@code tagger.apply}, and each result becomes a 2-field tuple
 *       {@code (word, "word_TAG")}.</li>
 * </ul>
 *
 * @param input tuple whose first field is a String or a DataBag of words
 * @return a bag of 2-field tuples, or {@code null} for null/empty input
 * @throws IOException if the tagger model cannot be loaded, or the first
 *         field is neither a String nor a DataBag
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0))
        return null;

    // Lazily load the tagging model on the first invocation.
    if (isFirst) {
        try {
            tagger = new MaxentTagger(_model);
        } catch (Exception e) {
            // BUG FIX: previously this only printed to stderr and continued,
            // leaving `tagger` null and guaranteeing an NPE further down.
            // Propagate with the original cause instead.
            throw new IOException("Exception loading language model: " + e.getMessage(), e);
        }
        isFirst = false;
    }

    // Output bag
    DataBag bagOfTokens = bagFactory.newDefaultBag();

    Object inThing = input.get(0);
    if (inThing instanceof String) {
        // Tag the raw text directly. (The original code wrapped the String in
        // a StringReader and read it back into a StringBuilder, reproducing
        // the identical String — that dead round-trip is removed.)
        // Tagger output format: whitespace-separated "word_TAG" tokens.
        String tagged = tagger.tagString((String) inThing);

        // Split each "word_TAG" token on '_' into a 2-field (word, tag) tuple.
        // A plain whitespace split is used instead of PTBTokenizer, which
        // would also split on "._.".
        for (String token : tagged.split("\\s+")) {
            List<String> fields = Arrays.asList(token.split("_"));
            bagOfTokens.add(tupleFactory.newTuple(fields));
        }
    } else if (inThing instanceof DataBag) {
        // BUG FIX: `sentence` was declared as null and never initialized, so
        // sentence.add(word) below threw a NullPointerException on any
        // non-empty bag.
        List<Word> sentence = new ArrayList<Word>();
        Iterator<Tuple> itr = ((DataBag) inThing).iterator();
        while (itr.hasNext()) {
            Tuple t = itr.next();
            if (t.get(0) != null) {
                sentence.add(new Word(t.get(0).toString()));
            }
        }
        for (TaggedWord tw : tagger.apply(sentence)) {
            List<String> values = new ArrayList<String>(2);
            values.add(tw.word());
            // TaggedWord.toString(divider) renders "word<divider>tag".
            values.add(tw.toString("_"));
            bagOfTokens.add(tupleFactory.newTuple(values));
        }
    } else {
        // Include the offending type so failures are diagnosable
        // (the original threw a bare IOException with no message).
        throw new IOException(
                "Expected a String or DataBag but got " + inThing.getClass().getName());
    }
    return bagOfTokens;
}