List of usage examples for edu.stanford.nlp.ling Word toString
@Override
public String toString()
From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java
License:Open Source License
/**
 * Constructor with minimum parameters. It only tokenizes a given String
 * without removing stopwords, name handles etc.
 *
 * <p>The text is split with the tokenizer produced by
 * {@code PTBTokenizer.factory()}, and the {@code toString()} form of every
 * resulting {@code Word} is appended to {@code cleanTokens} in order.
 *
 * @param config A Config object.
 * @param text The text to be tokenized.
 */
public Tokenizer(Config config, String text) {
    this.config = config;

    TokenizerFactory<Word> tf = PTBTokenizer.factory();
    List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize();

    for (Word token : tokens) {
        cleanTokens.add(token.toString());
    }
}
From source file:edu.cmu.cs.in.hoop.hoops.transform.HoopSentence2Tokens.java
License:Open Source License
/** * *//*from w ww . j a v a 2s .com*/ public Boolean runHoop(HoopBase inHoop) { String result = ""; debug("runHoop ()"); TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory(); ArrayList<HoopKV> inData = inHoop.getData(); if (inData != null) { HoopSimpleFeatureMaker featureMaker = new HoopSimpleFeatureMaker(); result = "Number of sentences in input :: " + inData.size(); for (int i = 0; i < inData.size(); i++) { //HoopKVInteger aKV=(HoopKVInteger) inData.get(i); HoopKV aKV = inData.get(i); HoopKV newKV = createKV(aKV); //debug ("Processing item: " + i + " with value: " + aKV.getValueAsString()); //>------------------------------------------------------------------------ if (targetTokenizer.getValue().equalsIgnoreCase("SplitOnCharacter") == true) { //debug ("Using builtin tokenizer ..."); List<String> tokens = featureMaker.unigramTokenizeOnCharacter(aKV.getValueAsString(), splitCharacter.getPropValue()); //debug ("Extracted " + tokens.size()); if (generateMode.getPropValue().equalsIgnoreCase("Add") == true) { //debug ("Generate mode is Add"); //HoopKVInteger newToken=new HoopKVInteger (); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; //debug ("final input for new token: " + strippedInput); if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); //newToken.setKey (i); //newToken.setValue (strippedInput, j); Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput, j); } //addKV (newToken); addKV(newKV); } else { //debug ("Generate mode is New"); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); //debug ("final input for new token: " + strippedInput); if (this.reKey.getPropValue() == false) { Integer keyFormatter = j; 
newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (j,strippedInput)); } else { Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (i,strippedInput)); } } } } //>------------------------------------------------------------------------ if (targetTokenizer.getValue().equalsIgnoreCase("RegEx") == true) { //debug ("Using builtin tokenizer ..."); List<String> tokens = featureMaker.unigramTokenizeBasic(aKV.getValueAsString()); //debug ("Extracted " + tokens.size()); if (generateMode.getPropValue().equalsIgnoreCase("Add") == true) { //debug ("Generate mode is Add"); //HoopKVInteger newToken=new HoopKVInteger (); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; //debug ("final input for new token: " + strippedInput); if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput, j); } //addKV (newToken); addKV(newKV); } else { //debug ("Generate mode is New"); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); //debug ("final input for new token: " + strippedInput); if (this.reKey.getPropValue() == false) { Integer keyFormatter = j; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (j,strippedInput)); } else { Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (i,strippedInput)); } } } } //>------------------------------------------------------------------------ if 
(targetTokenizer.getValue().equalsIgnoreCase("Stanford") == true) { //debug ("Using stanford tokenizer ..."); Tokenizer<Word> tokenizer = factory.getTokenizer(new StringReader(aKV.getValueAsString())); List<Word> sTokens = tokenizer.tokenize(); //debug ("Extracted " + sTokens.size()); for (int t = 0; t < sTokens.size(); t++) { Word aTerm = sTokens.get(t); if (this.reKey.getPropValue() == false) { Integer keyFormatter = t; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(aTerm.toString()); addKV(newKV); //addKV (new HoopKVInteger (j,strippedInput)); } else { Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(aTerm.toString()); addKV(newKV); //addKV (new HoopKVInteger (i,strippedInput)); } } } //>------------------------------------------------------------------------ updateProgressStatus(i, inData.size()); } } else return (false); HoopStatisticsPanel statsPanel; if (HoopLink.getWindow("Statistics") != null) { statsPanel = (HoopStatisticsPanel) HoopLink.getWindow("Statistics"); } else { statsPanel = new HoopStatisticsPanel(); } HoopLink.addView("Statistics", statsPanel, HoopLink.bottom); statsPanel.appendString("\n" + result); return (true); }