List of usage examples for edu.stanford.nlp.process.PTBTokenizer#getNewlineToken
public static String getNewlineToken()
From source file: tokenizer.TokenizeCorpus.java
public void tokenizerByToken(String pathIn, String pathOu) throws FileNotFoundException { //by token//from ww w. java2 s . c o m File inputTokenized = new File(pathOu); FileWriter fw = null; List<String> tk = new ArrayList<>(); try { fw = new FileWriter(inputTokenized); System.out.println("Tokenizing input corpus ..."); PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(pathIn), new CoreLabelTokenFactory(), //se metti untokenizable = noneDelete "ptb3Escaping=false,untokenizable=noneDelete,tokenizeNLs=true"); while (ptbt.hasNext()) { CoreLabel label = ptbt.next(); String labelLowCase = String.valueOf(label).toLowerCase(); String s = String.valueOf(PTBTokenizer.getNewlineToken()); if (String.valueOf(label).equals(s)) { fw.append(System.lineSeparator()); } else { //controllo dei doppi apici,'\n','\r' switch (labelLowCase.charAt(0)) { case '"': label.setValue(""); fw.append(label + " "); continue; case '\n': label.setValue(""); fw.append(label + " "); continue; case '\r': label.setValue(""); fw.append(label + " "); continue; default: fw.append(labelLowCase + " "); } } } fw.flush(); fw.close(); } catch (IOException e) { e.printStackTrace(); } }