Example usage for edu.stanford.nlp.process PTBTokenizer newPTBTokenizer

List of usage examples for edu.stanford.nlp.process PTBTokenizer newPTBTokenizer

Introduction

On this page you can find example usage for edu.stanford.nlp.process PTBTokenizer newPTBTokenizer.

Prototype

public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) 

Source Link

Document

Constructs a new PTBTokenizer that makes CoreLabel tokens.

Usage

From source file:org.exist.xquery.corenlp.Tokenize.java

License:Open Source License

/**
 * Tokenizes {@code text} with an invertible PTB tokenizer, groups the tokens
 * into sentences (two consecutive newlines end a sentence), and hands both the
 * sentences and the flat token list to {@code createSpreadsheet}.
 *
 * @param text         the raw text to tokenize
 * @param outputFormat the spreadsheet output format to produce
 */
private void tokenizeString(String text, final OutDocType outputFormat) {
    // tokenizeNLs (instance field) controls whether newline pseudo-tokens are
    // emitted; they are required for newline-based sentence splitting below.
    PTBTokenizer<CoreLabel> tokenizer = PTBTokenizer.newPTBTokenizer(new StringReader(text), tokenizeNLs, true);
    // Cache before consuming: tokenize() exhausts the tokenizer stream.
    cachedTokenizer = tokenizer;
    List<CoreLabel> tokens = tokenizer.tokenize();
    // Parameterize WordToSentenceProcessor (the original used the raw type) so
    // the element type is checked at compile time instead of unchecked.
    List<List<CoreLabel>> sentences = new WordToSentenceProcessor<CoreLabel>(
            WordToSentenceProcessor.NewlineIsSentenceBreak.TWO_CONSECUTIVE).wordsToSentences(tokens);
    createSpreadsheet(sentences, tokens, outputFormat);
}

From source file:tmadvanced.CollectTokens.java

License:Apache License

/**
 * This function tokenize using Stanford tokenizer
 * @param sentence/*w w  w.ja  v  a  2 s . c om*/
 * @return 
 */
private String StanTokenizer(String sentence) {
    Reader r = new BufferedReader(new StringReader(sentence.trim()));
    PTBTokenizer ptbt = PTBTokenizer.newPTBTokenizer(r, false, false);
    String stok = "";
    for (CoreLabel label; ptbt.hasNext();) {
        label = (CoreLabel) ptbt.next();
        stok = stok + " " + label;
    }
    return stok;
}

From source file:tmadvanced.tokenization.StanfordTokenizer.java

License:Apache License

public StanfordTokenizer(String filetotokenize, String out) {
    // Reader r= new  BufferedReader(new StringReader(filetotokenize));
    try {//from   w  w w.jav  a  2  s .  c o m

        FileWriter fw = new FileWriter(out);
        Scanner sc = new Scanner(new File(filetotokenize));
        // Reader r= new  FileReader(new File(filetotokenize));
        int lcount = 0;
        while (sc.hasNextLine()) {
            Reader r = new BufferedReader(new StringReader(sc.nextLine().trim()));
            PTBTokenizer ptbt = PTBTokenizer.newPTBTokenizer(r, false, false);

            if (lcount % 10000 == 0)
                System.err.println(lcount);
            for (CoreLabel label; ptbt.hasNext();) {
                label = (CoreLabel) ptbt.next();
                fw.write(label.word() + " ");
                //System.out.print(label.word());
                //  if(label.word().equals("*NL*")){
                //System.out.println();

                //  }else{
                //      fw.write(label.word()+" ");
                //System.out.print(words.get(i)+" ");
                //   }
            }
            lcount++;
            fw.write('\n');
        }
        fw.close();
        System.err.println("Total Lines tokenized:" + lcount);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(1);
    }
}