Example usage for edu.stanford.nlp.process PTBTokenizer getNewlineToken

Introduction

On this page you can find example usage of edu.stanford.nlp.process.PTBTokenizer.getNewlineToken().

Prototype

public static String getNewlineToken()

Source Link

Document

Returns the string literal inserted for newlines when the -tokenizeNLs option is set.

Usage

From source file: tokenizer.TokenizeCorpus.java

/**
 * Tokenizes the text file at {@code pathIn} with {@link PTBTokenizer} and writes the
 * result to {@code pathOu}, one token at a time.
 *
 * <p>Output rules, applied per token:
 * <ul>
 *   <li>a token equal to {@link PTBTokenizer#getNewlineToken()} becomes a platform line
 *       separator (the tokenizer is configured with {@code tokenizeNLs=true});</li>
 *   <li>a token whose first character is a double quote, {@code '\n'} or {@code '\r'} has
 *       its value cleared and the label's resulting string form is written, followed by
 *       a space;</li>
 *   <li>every other token is written lower-cased, followed by a space.</li>
 * </ul>
 *
 * <p>I/O failures are reported via {@code printStackTrace()} and not rethrown. Both the
 * input reader and the output writer are closed on every exit path.
 *
 * @param pathIn path of the raw text file to tokenize
 * @param pathOu path of the file that receives the tokenized text (created or overwritten)
 * @throws FileNotFoundException declared for compatibility with existing callers; a
 *         missing or unopenable file is handled by the {@code IOException} handler below
 */
public void tokenizerByToken(String pathIn, String pathOu) throws FileNotFoundException {
    File inputTokenized = new File(pathOu);
    // The newline marker never changes, so look it up once rather than per token.
    final String newlineToken = String.valueOf(PTBTokenizer.getNewlineToken());

    // try-with-resources guarantees the writer (and the reader below) are closed even
    // when tokenization or writing fails part-way through.
    try (FileWriter fw = new FileWriter(inputTokenized)) {
        System.out.println("Tokenizing input corpus ...");
        try (FileReader reader = new FileReader(pathIn)) {
            PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(reader, new CoreLabelTokenFactory(),
                    // untokenizable=noneDelete: untokenizable characters are dropped without warnings
                    "ptb3Escaping=false,untokenizable=noneDelete,tokenizeNLs=true");
            while (ptbt.hasNext()) {
                CoreLabel label = ptbt.next();
                String labelText = String.valueOf(label);

                if (labelText.equals(newlineToken)) {
                    fw.append(System.lineSeparator());
                    continue;
                }

                String labelLowCase = labelText.toLowerCase();
                // Guard against an empty string form so charAt(0) cannot throw;
                // such a token falls through to the default branch.
                char first = labelLowCase.isEmpty() ? '\0' : labelLowCase.charAt(0);

                // Filter out double quotes and stray '\n' / '\r' tokens.
                switch (first) {
                case '"':
                case '\n':
                case '\r':
                    label.setValue("");
                    fw.append(label + " ");
                    break;
                default:
                    fw.append(labelLowCase + " ");
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}