List of usage examples for org.dom4j.io SAXReader setEntityResolver
public void setEntityResolver(EntityResolver entityResolver)
From source file:de.jwic.base.XmlApplicationSetup.java
License:Apache License
/**
 * Builds the application setup from the XML document supplied by {@code source}.
 * <p>
 * The reader is given a {@code DTDEntityResolver} created from {@code PUBLICID},
 * {@code SYSTEMID} and {@code DTD_RESOURCEPATH}, and is told not to include external
 * DTD declarations in the parsed document. The parsed document is then handed to
 * {@code readDocument}.
 *
 * @param source the XML input to read the setup from
 * @throws RuntimeException if anything goes wrong while reading or interpreting the
 *         document; the original exception is attached as the cause
 */
public XmlApplicationSetup(InputSource source) {
    try {
        SAXReader xmlReader = new SAXReader();
        DTDEntityResolver dtdResolver = new DTDEntityResolver(PUBLICID, SYSTEMID, DTD_RESOURCEPATH);
        xmlReader.setEntityResolver(dtdResolver);
        xmlReader.setIncludeExternalDTDDeclarations(false);

        Document setupDocument = xmlReader.read(source);
        readDocument(setupDocument);
    } catch (Exception failure) {
        // Every failure is reported as an unchecked exception carrying the cause.
        throw new RuntimeException("Error reading applicationSetup: " + failure, failure);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.wsd.io.reader.MASCReader.java
License:Apache License
@SuppressWarnings("unchecked") @Override//from w w w.java 2 s. co m public void getNext(JCas jCas) throws IOException, CollectionException { if (nodeIterator == null || nodeIterator.hasNext() == false) { Resource nextFile = nextFile(); String fileLocation = nextFile.getLocation(); int offset = fileLocation.indexOf(":") + 1; // ignore the initial // "file:" or // "classpath:" prefix // of the location int extLength = 7; // ignore the extension "-wn.xml" String baseName = fileLocation.substring(offset, fileLocation.length() - extLength); logger.info("Processing " + fileLocation); // get the target word String targetItem = baseName.substring(baseName.lastIndexOf("/") + 1); setPosLemma(targetItem); String txtFileLocation = baseName + ".txt"; logger.info("Raw corpus file: " + txtFileLocation); // open the txt file containing the raw corpus for this targetItem File txtFile = new File(txtFileLocation); String corpus; Scanner scanner; try { scanner = new Scanner(txtFile); } catch (FileNotFoundException fne) { // txt file is on the classpath InputStream is = this.getClass().getResourceAsStream(txtFileLocation); if (is == null) { System.out.println("File " + txtFileLocation + " not found! Skipping..."); return; } scanner = new Scanner(is); } try { scanner.useDelimiter("\\Z"); corpus = scanner.next(); scanner.close(); } catch (NoSuchElementException e) { System.out.println("File " + txtFileLocation + " might be empty! 
Skipping..."); return; } // open the file containing WordNet annotations SAXReader reader = new SAXReader(); NullEntityResolver resolver = new NullEntityResolver(); reader.setEntityResolver(resolver); InputStream is = new BufferedInputStream(nextFile.getInputStream()); try { document = reader.read(is); } catch (DocumentException e) { throw new CollectionException(e); } root = document.getRootElement(); if (root.getName().equals(ELEMENT_GRAPH) == false) { throw new CollectionException("unknown_element", new Object[] { root.getName() }); } Iterator<Element> regionIterator = root.elementIterator(ELEMENT_REGION); if (regionIterator.hasNext() == false) { throw new CollectionException("element_not_found", new Object[] { ELEMENT_REGION, root }); } targetItems2sentences = mapTargetItems2sentences(regionIterator, corpus); nodeIterator = root.elementIterator(ELEMENT_NODE); if (nodeIterator.hasNext() == false) { throw new CollectionException("element_not_found", new Object[] { ELEMENT_NODE, root }); } } Element node = nodeIterator.next(); String documentText = processNode(jCas, node, root); // if no tie between annotators is discovered if (documentText != null) { setDocumentMetadata(jCas, node); jCas.setDocumentText(documentText); } else { jCas.reset(); } // after all files are processed, output the annotation stats if oANN // specified if (!hasNext() && !outputAnnotation.equals("")) { logger.info("Outputting annotation info to " + outputAnnotation); outputAnnotations(); } }
From source file:de.tudarmstadt.ukp.dkpro.wsd.io.reader.SemCorXMLReader.java
License:Apache License
@Override @SuppressWarnings("unchecked") public void getNext(JCas jCas) throws IOException, CollectionException { mappingProvider.configure(jCas.getCas()); // Open the next file Document document;//from w ww.jav a 2 s . co m SAXReader reader = new SAXReader(); NullEntityResolver resolver = new NullEntityResolver(); reader.setEntityResolver(resolver); InputStream is = new BufferedInputStream(nextFile().getInputStream()); try { document = reader.read(is); } catch (DocumentException e) { throw new CollectionException(e); } // Get metadata from the top two elements Element contextFile = document.getRootElement(); if (contextFile.getName().equals(ELEMENT_CONTEXTFILE) == false) { throw new CollectionException("unknown_element", new Object[] { contextFile.getName() }); } Iterator<Element> contextIterator = contextFile.elementIterator(ELEMENT_CONTEXT); if (contextIterator.hasNext() == false) { throw new CollectionException("element_not_found", new Object[] { ELEMENT_CONTEXT, ELEMENT_CONTEXTFILE }); } Element context = contextIterator.next(); setDocumentMetadata(jCas, contextFile, context); String documentId = context.attributeValue(ATTR_FILENAME); logger.debug("Found context filename: " + documentId); // Process document text StringBuffer documentText = processParagraphs(jCas, context, documentId); if (documentText.length() == 0) { documentText = processSentences(jCas, context, 0, documentId); } jCas.setDocumentText(documentText.toString()); logger.info("Read " + validWordFormCount + " valid word forms; skipped " + (totalWordFormCount - validWordFormCount)); }
From source file:de.tudarmstadt.ukp.dkpro.wsd.io.reader.WebCAGeXMLReader.java
License:Apache License
@Override @SuppressWarnings("unchecked") public void getNext(JCas jCas) throws IOException, CollectionException { if (textIterator == null || textIterator.hasNext() == false) { // Open the next file Document document;// www . j av a2s .co m SAXReader reader = new SAXReader(); NullEntityResolver resolver = new NullEntityResolver(); reader.setEntityResolver(resolver); Resource nextFile = nextFile(); logger.info("Reading " + nextFile.getLocation()); InputStream is = new BufferedInputStream(nextFile.getInputStream()); try { document = reader.read(is); } catch (DocumentException e) { throw new CollectionException(e); } // Get metadata from the top two elements corpus = document.getRootElement(); if (corpus.getName().equals(ELEMENT_CORPUS) == false) { throw new CollectionException("unknown_element", new Object[] { corpus.getName() }); } textIterator = corpus.elementIterator(ELEMENT_TEXT); if (textIterator.hasNext() == false) { throw new CollectionException("element_not_found", new Object[] { ELEMENT_TEXT, ELEMENT_CORPUS }); } } Element text = textIterator.next(); setDocumentMetadata(jCas, corpus, text); // Process document text StringBuffer documentText = processText(jCas, text); jCas.setDocumentText(documentText.toString()); }
From source file:de.tudarmstadt.ukp.dkpro.wsd.senseval.reader.SensevalReader.java
License:Apache License
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); textCount = 0;//from w ww. j av a 2s. c o m // EntityResolver resolver = new EntityResolver() { // public InputSource resolveEntity(String publicId, String systemId) { // try { // URL url; // if (publicId == null) { // url = ResourceUtils.resolveLocation(systemId, this, null); // } // else { // url = ResourceUtils.resolveLocation(publicId, this, null); // } // return new InputSource(url.openStream()); // } catch (IOException e) { // e.printStackTrace(); // return null; // } // } // }; Document documentCollection = null; SAXReader reader = new SAXReader(); // TODO: We can't figure out how to get the XML parser to read DTDs in // all cases (i.e., whether they are in a directory or in a JAR) so the // following code just forces the SAXReader to ignore DTDs. This is // not an optimal solution as it prevents the XML files from being // validated. EntityResolver resolver = new EntityResolver() { @Override public InputSource resolveEntity(String publicId, String systemId) { return new InputSource(new StringReader("")); } }; reader.setEntityResolver(resolver); InputStream is = null; try { fileURL = ResourceUtils.resolveLocation(fileName, this, context); is = fileURL.openStream(); // The following line fails on Jenkins but not locally // documentCollection = reader.read(fileURL.getFile()); documentCollection = reader.read(is); } catch (DocumentException e) { throw new ResourceInitializationException(e); } catch (IOException e) { throw new ResourceInitializationException(e); } finally { IOUtils.closeQuietly(is); } // Get the (root) corpus element so we can iterate over its elements corpus = documentCollection.getRootElement(); if (corpus.getName().equals(CORPUS_ELEMENT_NAME) == false) { throw new ResourceInitializationException("unknown_element", new Object[] { corpus.getName() }); } }
From source file:de.tudarmstadt.ukp.lmf.transform.XMLToDBTransformer.java
License:Apache License
/**
 * Reads an XML file and saves its contents to the database.
 * <p>
 * A session is opened for the duration of the call. The file is parsed with this
 * object registered as the reader's default handler; references to {@code .dtd}
 * resources are answered with empty content so that no DTD is loaded. On success the
 * work is committed. The session is closed whether or not parsing succeeds, and the
 * elapsed time and entry count are printed only on success.
 *
 * @param xmlFile the XML file to read
 * @param lexicalResourceName identifier used to load an existing
 *        {@code LexicalResource} from the session before parsing; when
 *        {@code null} no lookup is performed
 * @throws DocumentException if the file cannot be parsed
 * @throws IllegalArgumentException declared by the signature; not thrown directly by
 *         this method
 */
public void transform(File xmlFile, String lexicalResourceName)
        throws DocumentException, IllegalArgumentException {
    long startTime = System.currentTimeMillis();

    openSession();
    try {
        if (lexicalResourceName != null) {
            lexicalResource = (LexicalResource) session.get(LexicalResource.class, lexicalResourceName);
        }

        SAXReader reader = new SAXReader(false);
        reader.setEntityResolver(new EntityResolver() {
            @Override
            public InputSource resolveEntity(String publicId, String systemId)
                    throws SAXException, IOException {
                // Answer DTD lookups with empty content; everything else falls back
                // to the parser's default resolution (signalled by returning null).
                if (systemId != null && systemId.endsWith(".dtd")) {
                    return new InputSource(new StringReader(""));
                }
                return null;
            }
        });
        reader.setDefaultHandler(this);
        reader.read(xmlFile);
        commit();
    } finally {
        // Release the session even when parsing fails; previously it stayed open.
        closeSession();
    }

    System.out.println("TOTAL TIME: " + (System.currentTimeMillis() - startTime));
    System.out.println("NUM ENTRIES: " + commitCounter);
}
From source file:dkpro.similarity.uima.io.RTECorpusReader.java
License:Apache License
/**
 * Reads the input file and returns one {@code EntailmentPair} per node matched by
 * the XPath {@code //pair}.
 * <p>
 * For each pair, the first text is taken from its {@code t} child, the second text
 * from its {@code h} child, and the entailment outcome from an attribute named
 * {@code value} or {@code entailment} (compared case-insensitively). Each outcome
 * found is also printed to standard output together with the pair's running number.
 *
 * @return the pairs in document order; empty if the document contains none
 * @throws ResourceInitializationException if the file cannot be located or read, the
 *         XML cannot be parsed, or an XPath expression cannot be evaluated
 */
@Override
public List<CombinationPair> getAlignedPairs() throws ResourceInitializationException {
    List<CombinationPair> pairs = new ArrayList<CombinationPair>();

    InputStream is = null;
    try {
        SAXReader reader = new SAXReader(false);

        // Disable DTD resolution (which fails due to relative path to DTD file)
        NullEntityResolver resolver = new NullEntityResolver();
        reader.setEntityResolver(resolver);

        URL url = ResourceUtils.resolveLocation(inputFile, this, this.getUimaContext());
        // Keep a handle on the stream so the finally block really closes it;
        // previously the stream was created inline and "is" stayed null.
        is = new BufferedInputStream(url.openStream());
        Document document = reader.read(is);
        Element root = document.getRootElement();

        // Compile the XPath expressions once instead of once per pair.
        final XPath pairXPath = new Dom4jXPath("//pair");
        final XPath tXPath = new Dom4jXPath("child::t");
        final XPath hXPath = new Dom4jXPath("child::h");

        int i = 0;
        for (Object element : pairXPath.selectNodes(root)) {
            i++;

            String text1 = "";
            String text2 = "";
            String entailmentOutcome = "";

            if (element instanceof Element) {
                Element node = (Element) element;

                for (Object tElement : tXPath.selectNodes(node)) {
                    if (tElement instanceof Element) {
                        text1 = ((Element) tElement).getText();
                    }
                }

                for (Object hElement : hXPath.selectNodes(node)) {
                    if (hElement instanceof Element) {
                        text2 = ((Element) hElement).getText();
                    }
                }

                // print out entailment value for use as gold standard
                for (Object o : node.attributes()) {
                    Attribute attribute = (Attribute) o;
                    String name = attribute.getName();
                    // equalsIgnoreCase is locale-independent, unlike toLowerCase()
                    if (name.equalsIgnoreCase("value") || name.equalsIgnoreCase("entailment")) {
                        entailmentOutcome = attribute.getValue();
                        System.out.println(i + ":" + entailmentOutcome);
                    }
                }
            }

            EntailmentPair pair = new EntailmentPair(url.toString());
            pair.setID1("t1-" + i);
            pair.setID2("t2-" + i);
            pair.setText1(text1);
            pair.setText2(text2);
            pair.setEntailmentOutcome(entailmentOutcome);

            pairs.add(pair);
        }
    } catch (JaxenException e) {
        throw new ResourceInitializationException(e);
    } catch (DocumentException e) {
        throw new ResourceInitializationException(e);
    } catch (IOException e) {
        throw new ResourceInitializationException(e);
    } finally {
        IOUtils.closeQuietly(is);
    }

    return pairs;
}
From source file:edu.ku.brc.specify.tools.AppendHelp.java
License:Open Source License
/** * Reads a DOM from a stream//from w ww . j a va 2 s .c o m * @param fileinputStream the stream to be read * @return the root element of the DOM */ public Element readFileToDOM4J(final File file) throws IOException, DocumentException { SAXReader saxReader = new SAXReader(); try { saxReader.setValidation(false); saxReader.setStripWhitespaceText(true); //saxReader.setIncludeExternalDTDDeclarations(false); //saxReader.setIncludeInternalDTDDeclarations(false); saxReader.setIgnoreComments(true); //saxReader.setXMLFilter(new TransparentFilter(saxReader.getXMLReader())); EntityResolver entityResolver = new EntityResolver() { public InputSource resolveEntity(String publicId, String systemId) { return new InputSource(""); } }; saxReader.setEntityResolver(entityResolver); //saxReader.getXMLFilter().setDTDHandler(null); } catch (Exception ex) { ex.printStackTrace(); } org.dom4j.Document document = saxReader.read(new FileInputStream(file)); return document.getRootElement(); }
From source file:es.caib.seycon.ng.servei.PuntEntradaServiceImpl.java
protected String handleValidaXMLPUE(PuntEntrada puntEntrada) throws Exception { String contingut = puntEntrada.getXmlPUE(); if (!"".equals(contingut)) { //$NON-NLS-1$ try {/*from w w w . j a v a 2 s . c o m*/ // Validem el document // new // es.caib.seycon.mazinger.compiler.Compile().parse(contingut); org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader(true); // set the validation feature to true to report validation // errors reader.setFeature("http://xml.org/sax/features/validation", true); //$NON-NLS-1$ // set the validation/schema feature to true to report // validation errors // against a schema reader.setFeature("http://apache.org/xml/features/validation/schema", true); //$NON-NLS-1$ // set the validation/schema-full-checking feature to true to // enable // full schema, grammar-constraint checking reader.setFeature("http://apache.org/xml/features/validation/schema-full-checking", //$NON-NLS-1$ true); // set the schema reader.setProperty("http://apache.org/xml/properties/schema/external-noNamespaceSchemaLocation", //$NON-NLS-1$ "/es/caib/seycon/mazinger/Mazinger.xsd"); //$NON-NLS-1$ // set the entity resolver (to load the schema with // getResourceAsStream) reader.getXMLReader().setEntityResolver(new SchemaLoader()); reader.setEntityResolver(new SchemaLoader()); Document doc = reader.read(new ByteArrayInputStream(contingut.getBytes("UTF-8"))); //$NON-NLS-1$ } catch (Exception ex) { return ex.getMessage(); // Retornem l'excepci com error de // Validaci } } return ""; //$NON-NLS-1$ }
From source file:eu.scape_project.planning.services.taverna.parser.T2FlowParser.java
License:Apache License
/** * Initialises the parser by reading the the t2flow from the input stream * and parsing it./*from ww w.ja v a2s.c o m*/ * * @param t2flow * the t2flow * @throws TavernaParserException * if the parser could not be initialized */ protected void initialise(InputStream t2flow) throws TavernaParserException { log.debug("Parsing inputstream"); T2FLOW_NAMESPACE_MAP.put("t2f", "http://taverna.sf.net/2008/xml/t2flow"); ValidatingParserFactory vpf = new ValidatingParserFactory(); try { SAXParser parser = vpf.getValidatingParser(); parser.setProperty(ValidatingParserFactory.JAXP_SCHEMA_SOURCE, ProjectImporter.TAVERNA_SCHEMA_URI); SAXReader reader = new SAXReader(parser.getXMLReader()); reader.setValidation(false); SchemaResolver schemaResolver = new SchemaResolver(); schemaResolver.addSchemaLocation(ProjectImporter.TAVERNA_SCHEMA_URI, SCHEMA_LOCATION + ProjectImporter.TAVERNA_SCHEMA); reader.setEntityResolver(schemaResolver); doc = reader.read(t2flow); } catch (DocumentException e) { log.error("Error initialising T2FlowParser: {}", e.getMessage()); throw new TavernaParserException("Error parsing workflow.", e); } catch (ParserConfigurationException e) { log.error("Error initialising T2FlowParser: {}", e.getMessage()); throw new TavernaParserException("Error parsing workflow.", e); } catch (SAXException e) { log.error("Error initialising T2FlowParser: {}", e.getMessage()); throw new TavernaParserException("Error parsing workflow.", e); } }