Example usage for edu.stanford.nlp.ling Word toString

List of usage examples for edu.stanford.nlp.ling Word toString

Introduction

On this page you can find example usages of the toString method of edu.stanford.nlp.ling Word.

Prototype

@Override
    public String toString() 

Source Link

Usage

From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java

License:Open Source License

/**
 * Constructor with minimum parameters. It only tokenizes a given String
 * without removing stopwords, name handles etc.
 *
 * <p>The text is run through the tokenizer produced by
 * {@code PTBTokenizer.factory()} and the {@code toString()} form of every
 * resulting {@code Word} is appended to {@code cleanTokens}.
 *
 * @param config A Config object.
 * @param text The text to be tokenized.
 */
public Tokenizer(Config config, String text) {
    this.config = config;
    TokenizerFactory<Word> tf = PTBTokenizer.factory();
    List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize();
    for (Word token : tokens) {
        cleanTokens.add(token.toString());
    }
}

From source file:edu.cmu.cs.in.hoop.hoops.transform.HoopSentence2Tokens.java

License:Open Source License

/**
 * Tokenizes the string value of every KV delivered by the input hoop and
 * hands the resulting tokens to {@code addKV}.
 *
 * <p>The tokenizer is chosen per input KV from {@code targetTokenizer}:
 * <ul>
 *   <li>{@code "SplitOnCharacter"} - {@code HoopSimpleFeatureMaker.unigramTokenizeOnCharacter}
 *       with the configured {@code splitCharacter};</li>
 *   <li>{@code "RegEx"} - {@code HoopSimpleFeatureMaker.unigramTokenizeBasic};</li>
 *   <li>{@code "Stanford"} - the Stanford PTB tokenizer; each {@code Word} is
 *       emitted via {@code Word.toString()}.</li>
 * </ul>
 * For the first two tokenizers {@code generateMode} and
 * {@code removePunctuation} are honoured; the Stanford branch ignores both
 * and always emits one KV per token.
 *
 * <p>After processing, a one-line summary with the number of input entries
 * is appended to the "Statistics" view.
 *
 * @param inHoop the hoop whose data is tokenized
 * @return {@code false} when {@code inHoop.getData()} is {@code null},
 *         {@code true} otherwise
 */
public Boolean runHoop(HoopBase inHoop) {
    debug("runHoop ()");

    TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory();

    List<HoopKV> inData = inHoop.getData();

    if (inData == null) {
        return (false);
    }

    HoopSimpleFeatureMaker featureMaker = new HoopSimpleFeatureMaker();

    String result = "Number of sentences in input :: " + inData.size();

    for (int i = 0; i < inData.size(); i++) {
        HoopKV aKV = inData.get(i);
        HoopKV newKV = createKV(aKV);

        // The three tokenizer names are distinct literals, so at most one
        // of the following branches runs for a given input KV.

        if (targetTokenizer.getValue().equalsIgnoreCase("SplitOnCharacter")) {
            List<String> tokens = featureMaker.unigramTokenizeOnCharacter(aKV.getValueAsString(),
                    splitCharacter.getPropValue());
            emitStringTokens(tokens, newKV, i);
        }

        if (targetTokenizer.getValue().equalsIgnoreCase("RegEx")) {
            List<String> tokens = featureMaker.unigramTokenizeBasic(aKV.getValueAsString());
            emitStringTokens(tokens, newKV, i);
        }

        if (targetTokenizer.getValue().equalsIgnoreCase("Stanford")) {
            Tokenizer<Word> tokenizer = factory.getTokenizer(new StringReader(aKV.getValueAsString()));
            List<Word> sTokens = tokenizer.tokenize();

            for (int t = 0; t < sTokens.size(); t++) {
                addTokenAsKV(newKV, i, t, sTokens.get(t).toString());
            }
        }

        updateProgressStatus(i, inData.size());
    }

    // Reuse the Statistics window when one is already registered.
    HoopStatisticsPanel statsPanel;
    if (HoopLink.getWindow("Statistics") != null) {
        statsPanel = (HoopStatisticsPanel) HoopLink.getWindow("Statistics");
    } else {
        statsPanel = new HoopStatisticsPanel();
    }
    HoopLink.addView("Statistics", statsPanel, HoopLink.bottom);
    statsPanel.appendString("\n" + result);
    return (true);
}

/**
 * Emits the tokens of one input entry according to {@code generateMode}.
 *
 * <p>In "Add" mode every token is stored on {@code newKV} at its token index
 * and {@code newKV} is added once (also when there are no tokens). In any
 * other mode each token is added individually via {@link #addTokenAsKV}.
 *
 * @param tokens        tokens extracted from the input entry
 * @param newKV         the KV created for the input entry
 * @param sentenceIndex index of the input entry within the input data
 */
private void emitStringTokens(List<String> tokens, HoopKV newKV, int sentenceIndex) {
    if (generateMode.getPropValue().equalsIgnoreCase("Add")) {
        for (int j = 0; j < tokens.size(); j++) {
            String strippedInput = stripPunctuationIfEnabled(tokens.get(j));

            newKV.setKeyString(Integer.toString(sentenceIndex));
            newKV.setValue(strippedInput, j);
        }

        addKV(newKV);
    } else {
        for (int j = 0; j < tokens.size(); j++) {
            String strippedInput = stripPunctuationIfEnabled(tokens.get(j));

            addTokenAsKV(newKV, sentenceIndex, j, strippedInput);
        }
    }
}

/**
 * Returns the token with every match of {@code splitRegEx} removed when
 * {@code removePunctuation} is enabled; otherwise returns it unchanged.
 */
private String stripPunctuationIfEnabled(String token) {
    if (removePunctuation.getPropValue()) {
        return token.replaceAll(splitRegEx.getValue(), "");
    }
    return token;
}

/**
 * Writes a single token into {@code newKV} and passes it to {@code addKV}.
 * The key is the token index when {@code reKey} is disabled and the index
 * of the input entry otherwise.
 */
private void addTokenAsKV(HoopKV newKV, int sentenceIndex, int tokenIndex, String value) {
    int key = this.reKey.getPropValue() ? sentenceIndex : tokenIndex;

    // NOTE(review): the same newKV instance is mutated and handed to addKV
    // for every token of an entry. If addKV keeps the reference instead of
    // copying, all entries would end up showing the last token - confirm.
    newKV.setKeyString(Integer.toString(key));
    newKV.setValue(value);
    addKV(newKV);
}