List of usage examples for edu.stanford.nlp.process PTBTokenizer newPTBTokenizer
public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible)
From source file: org.exist.xquery.corenlp.Tokenize.java
License: Open Source License
/**
 * Tokenizes {@code text} and splits the tokens into sentences, then hands
 * both the full token list and the sentence groups to the spreadsheet writer.
 *
 * @param text         raw text to tokenize
 * @param outputFormat spreadsheet format passed through to createSpreadsheet
 */
private void tokenizeString(String text, final OutDocType outputFormat) {
    // Invertible tokenizer so original text offsets are recoverable;
    // tokenizeNLs comes from the enclosing instance.
    PTBTokenizer<CoreLabel> tokenizer =
            PTBTokenizer.newPTBTokenizer(new StringReader(text), tokenizeNLs, true);
    cachedTokenizer = tokenizer;

    List<CoreLabel> tokens = tokenizer.tokenize();

    // Two consecutive newlines (a blank line) mark a sentence break.
    WordToSentenceProcessor splitter = new WordToSentenceProcessor(
            WordToSentenceProcessor.NewlineIsSentenceBreak.TWO_CONSECUTIVE);
    List<List<CoreLabel>> sentences = splitter.wordsToSentences(tokens);

    createSpreadsheet(sentences, tokens, outputFormat);
}
From source file: tmadvanced.CollectTokens.java
License: Apache License
/** * This function tokenize using Stanford tokenizer * @param sentence/*w w w.ja v a 2 s . c om*/ * @return */ private String StanTokenizer(String sentence) { Reader r = new BufferedReader(new StringReader(sentence.trim())); PTBTokenizer ptbt = PTBTokenizer.newPTBTokenizer(r, false, false); String stok = ""; for (CoreLabel label; ptbt.hasNext();) { label = (CoreLabel) ptbt.next(); stok = stok + " " + label; } return stok; }
From source file: tmadvanced.tokenization.StanfordTokenizer.java
License: Apache License
public StanfordTokenizer(String filetotokenize, String out) { // Reader r= new BufferedReader(new StringReader(filetotokenize)); try {//from w w w.jav a 2 s . c o m FileWriter fw = new FileWriter(out); Scanner sc = new Scanner(new File(filetotokenize)); // Reader r= new FileReader(new File(filetotokenize)); int lcount = 0; while (sc.hasNextLine()) { Reader r = new BufferedReader(new StringReader(sc.nextLine().trim())); PTBTokenizer ptbt = PTBTokenizer.newPTBTokenizer(r, false, false); if (lcount % 10000 == 0) System.err.println(lcount); for (CoreLabel label; ptbt.hasNext();) { label = (CoreLabel) ptbt.next(); fw.write(label.word() + " "); //System.out.print(label.word()); // if(label.word().equals("*NL*")){ //System.out.println(); // }else{ // fw.write(label.word()+" "); //System.out.print(words.get(i)+" "); // } } lcount++; fw.write('\n'); } fw.close(); System.err.println("Total Lines tokenized:" + lcount); } catch (IOException e) { e.printStackTrace(); System.exit(1); } }