List of usage examples for the edu.stanford.nlp.ling.WordLemmaTag constructor WordLemmaTag
public WordLemmaTag(Label word)
From source file: ie.pars.bnc.preprocess.ProcessNLP.java
License:Open Source License
public static List<List<List<WordLemmaTag>>> parseBNCXMLTokenized(InputStream is) throws Exception { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document doc = builder.parse(is); Element root = doc.getDocumentElement(); List<List<List<WordLemmaTag>>> file = new ArrayList(); List<List<WordLemmaTag>> para = null; NodeList sentences = root.getElementsByTagName("s"); Node currentNode = null;//from w w w . j a v a 2s . co m for (int j = 0; j < sentences.getLength(); j++) { if (currentNode == null) { // if this is the first sentence currentNode = sentences.item(j).getParentNode(); para = new ArrayList(); } else { if (currentNode != sentences.item(j).getParentNode()) { file.add(para); para = new ArrayList(); } currentNode = sentences.item(j).getParentNode(); } List<WordLemmaTag> tokens = new ArrayList<>(); for (int i = 0; i < sentences.item(j).getChildNodes().getLength(); i++) { if ("pause".equalsIgnoreCase(sentences.item(j).getChildNodes().item(i).getNodeName())) { if (!tokens.isEmpty()) { para.add(tokens); } tokens = new ArrayList<>(); } if (sentences.item(j).getChildNodes().item(i).getTextContent().trim().length() != 0) { tokens.add(new WordLemmaTag(sentences.item(j).getChildNodes().item(i).getTextContent().trim())); } } if (!tokens.isEmpty()) { para.add(tokens); } } if (para != null) { file.add(para); } return file; }