List of usage examples for org.xml.sax InputSource setSystemId
public void setSystemId(String systemId)
From source file:de.tudarmstadt.ukp.dkpro.core.io.ancora.AncoraReader.java
@Override public void getNext(JCas aJCas) throws IOException, CollectionException { Resource res = nextFile();//from w w w. j a va2 s . c o m initCas(aJCas, res); // Set up language if (getLanguage() != null) { aJCas.setDocumentLanguage(getLanguage()); } // Configure mapping only now, because now the language is set in the CAS try { posMappingProvider.configure(aJCas.getCas()); } catch (AnalysisEngineProcessException e1) { throw new IOException(e1); } InputStream is = null; try { is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()); // Create handler AncoraHandler handler = new AncoraHandler(); handler.setJCas(aJCas); handler.setLogger(getLogger()); // Parse XML SAXParserFactory pf = SAXParserFactory.newInstance(); SAXParser parser = pf.newSAXParser(); InputSource source = new InputSource(is); source.setPublicId(res.getLocation()); source.setSystemId(res.getLocation()); parser.parse(source, handler); } catch (ParserConfigurationException | SAXException e) { throw new IOException(e); } finally { closeQuietly(is); } if (dropSentencesMissingPosTags) { List<FeatureStructure> toRemove = new ArrayList<>(); // Remove sentences without pos TAGs for (Sentence s : select(aJCas, Sentence.class)) { boolean remove = false; for (Token t : selectCovered(Token.class, s)) { if (t.getPos() == null) { toRemove.add(s); remove = true; break; } } if (remove) { for (Token t : selectCovered(Token.class, s)) { toRemove.add(t); if (t.getLemma() != null) { toRemove.add(t.getLemma()); } if (t.getPos() != null) { toRemove.add(t.getPos()); } } } } for (FeatureStructure fs : toRemove) { aJCas.getCas().removeFsFromIndexes(fs); } // Remove tokens without pos tags that are located *BETWEEN* sentences! 
toRemove.clear(); for (Token t : select(aJCas, Token.class)) { if (t.getPos() == null) { toRemove.add(t); if (t.getLemma() != null) { toRemove.add(t.getLemma()); } if (t.getPos() != null) { toRemove.add(t.getPos()); } } } for (FeatureStructure fs : toRemove) { aJCas.getCas().removeFsFromIndexes(fs); } } }
From source file:com.twinsoft.convertigo.engine.util.XMLUtils.java
/**
 * Parses an XML string into a DOM document and returns its pretty-printed form.
 *
 * @param sDocument the XML text to parse
 * @param relativeUriResolver value installed as the system identifier of the parsed input
 * @return the result of {@code XMLUtils.prettyPrintDOM(Document)} for the parsed document
 * @throws ParserConfigurationException declared for obtaining the default document builder
 * @throws SAXException if the text cannot be parsed
 * @throws IOException if reading the input fails
 */
public static String prettyPrintDOM(String sDocument, String relativeUriResolver)
        throws ParserConfigurationException, SAXException, IOException {
    StringReader xmlText = new StringReader(sDocument);
    InputSource xmlSource = new InputSource(xmlText);
    xmlSource.setSystemId(relativeUriResolver);

    Document parsed = getDefaultDocumentBuilder().parse(xmlSource);
    return XMLUtils.prettyPrintDOM(parsed);
}
From source file:com.cyberway.issue.crawler.settings.XMLSettingsHandler.java
/** Read the CrawlerSettings object from a specific file. * * @param settings the settings object to be updated with data from the * persistent storage./* w w w. j a v a2 s. c o m*/ * @param f the file to read from. * @return the updated settings object or null if there was no data for this * in the persistent storage. */ protected final CrawlerSettings readSettingsObject(CrawlerSettings settings, File f) { CrawlerSettings result = null; try { InputStream is = null; if (!f.exists()) { // Perhaps the file we're looking for is on the CLASSPATH. // DON'T look on the CLASSPATH for 'settings.xml' files. The // look for 'settings.xml' files happens frequently. Not looking // on classpath for 'settings.xml' is an optimization based on // ASSUMPTION that there will never be a 'settings.xml' saved // on classpath. if (!f.getName().startsWith(settingsFilename)) { is = XMLSettingsHandler.class.getResourceAsStream(f.getPath()); } } else { is = new FileInputStream(f); } if (is != null) { XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader(); InputStream file = new BufferedInputStream(is); parser.setContentHandler(new CrawlSettingsSAXHandler(settings)); InputSource source = new InputSource(file); source.setSystemId(f.toURL().toExternalForm()); parser.parse(source); result = settings; } } catch (SAXParseException e) { logger.warning(e.getMessage() + " in '" + e.getSystemId() + "', line: " + e.getLineNumber() + ", column: " + e.getColumnNumber()); } catch (SAXException e) { logger.warning(e.getMessage() + ": " + e.getException().getMessage()); } catch (ParserConfigurationException e) { logger.warning(e.getMessage() + ": " + e.getCause().getMessage()); } catch (FactoryConfigurationError e) { logger.warning(e.getMessage() + ": " + e.getException().getMessage()); } catch (IOException e) { logger.warning("Could not access file '" + f.getAbsolutePath() + "': " + e.getMessage()); } return result; }
From source file:de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader.java
private void nextTeiElement() throws CollectionException, IOException { if (teiElementIterator == null) { currentTeiElement = null;//from w w w .j a va 2 s .c o m return; } while (!teiElementIterator.hasNext() && super.hasNext()) { currentResource = nextFile(); InputStream is = null; try { is = currentResource.getInputStream(); if (currentResource.getPath().endsWith(".gz")) { is = new GZIPInputStream(is); } InputSource source = new InputSource(is); source.setPublicId(currentResource.getLocation()); source.setSystemId(currentResource.getLocation()); SAXReader reader = new SAXReader(); Document xml = reader.read(source); final XPath teiPath = new Dom4jXPath("//tei:TEI"); teiPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0"); List<Element> teiElements = teiPath.selectNodes(xml); // System.out.printf("Found %d TEI elements in %s.%n", teiElements.size(), // currentResource.getLocation()); teiElementIterator = teiElements.iterator(); currentTeiElementNumber = 0; } catch (DocumentException e) { throw new IOException(e); } catch (JaxenException e) { throw new IOException(e); } finally { closeQuietly(is); } } currentTeiElement = teiElementIterator.hasNext() ? teiElementIterator.next() : null; currentTeiElementNumber++; if (!super.hasNext() && !teiElementIterator.hasNext()) { // Mark end of processing. teiElementIterator = null; } }
From source file:de.tudarmstadt.ukp.clarin.webanno.tei.TeiReader.java
private void nextTeiElement() throws CollectionException, IOException { if (teiElementIterator == null) { currentTeiElement = null;// w w w . j a v a 2s. co m return; } while (!teiElementIterator.hasNext() && super.hasNext()) { currentResource = nextFile(); InputStream is = null; try { is = currentResource.getInputStream(); if (currentResource.getPath().endsWith(".gz")) { is = new GZIPInputStream(is); } InputSource source = new InputSource(is); source.setPublicId(currentResource.getLocation()); source.setSystemId(currentResource.getLocation()); SAXReader reader = new SAXReader(); Document xml = reader.read(source); final XPath teiPath = new Dom4jXPath("//tei:TEI"); teiPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0"); @SuppressWarnings("unchecked") List<Element> teiElements = teiPath.selectNodes(xml); teiElementIterator = teiElements.iterator(); currentTeiElementNumber = 0; } catch (DocumentException e) { throw new IOException(e); } catch (JaxenException e) { throw new IOException(e); } finally { closeQuietly(is); } } currentTeiElement = teiElementIterator.hasNext() ? teiElementIterator.next() : null; currentTeiElementNumber++; if (!super.hasNext() && !teiElementIterator.hasNext()) { // Mark end of processing. teiElementIterator = null; } }
From source file:net.sourceforge.dita4publishers.impl.dita.InMemoryDitaLinkManagementService.java
/** * @param keyAccessOptions Options that control access to resources. * @param resUrl Absolute URL of the target resource. * @return Root element of the target resource (topic or map). *///w w w . ja v a 2 s. c o m private Element resolveUriToElement(KeyAccessOptions keyAccessOptions, URL resUrl) throws DitaApiException { Element result = null; Document doc = null; String urlString = resUrl.toExternalForm(); String resUrlString = null; if (urlString.contains("#")) { resUrlString = urlString.substring(0, urlString.indexOf("#")); } else { resUrlString = urlString; } try { InputSource src = new InputSource(resUrl.openStream()); src.setSystemId(resUrlString); doc = DomUtil.getDomForSource(src, bosOptions, false); } catch (Exception e) { throw new DitaApiException("Exception contructing DOM from URL " + resUrl + ": " + e.getMessage(), e); } if (urlString.contains("#")) { String fragId = urlString.split("#")[1]; result = DitaUtil.resolveDitaFragmentId(doc, fragId); } else { result = DitaUtil.getImplicitElementFromDoc(doc); } return result; }
From source file:crawlercommons.sitemaps.SiteMapParser.java
/** * Decompress the gzipped content and process the resulting XML Sitemap. * /* www.ja va2s .c o m*/ * @param url * - URL of the gzipped content * @param response * - Gzipped content * @return the site map * @throws MalformedURLException * @throws IOException * @throws UnknownFormatException */ protected AbstractSiteMap processGzip(URL url, byte[] response) throws MalformedURLException, IOException, UnknownFormatException { LOG.debug("Processing gzip"); AbstractSiteMap smi; InputStream is = new ByteArrayInputStream(response); // Remove .gz ending String xmlUrl = url.toString().replaceFirst("\\.gz$", ""); LOG.debug("XML url = {}", xmlUrl); BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is)); InputSource in = new InputSource(decompressed); in.setSystemId(xmlUrl); smi = processXml(url, in); decompressed.close(); return smi; }
From source file:com.andyasprou.webcrawler.Utilities.GenericSiteMapParser.java
/** * Decompress the gzipped content and process the resulting XML Sitemap. * * @param url//from ww w . j a v a 2 s . c o m * - URL of the gzipped content * @param response * - Gzipped content * @return the site map * @throws UnknownFormatException if there is an error parsing the gzip * @throws IOException if there is an error reading in the gzip {@link java.net.URL} */ protected AbstractSiteMap processGzip(URL url, byte[] response) throws IOException, UnknownFormatException { AbstractSiteMap smi; InputStream is = new ByteArrayInputStream(response); // Remove .gz ending String xmlUrl = url.toString().replaceFirst("\\.gz$", ""); BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is)); InputSource in = new InputSource(decompressed); in.setSystemId(xmlUrl); smi = processXml(url, in); decompressed.close(); return smi; }
From source file:de.ii.xtraplatform.ogc.api.gml.parser.GMLSchemaParser.java
public void parse(InputSource is, Map<String, List<String>> elements, boolean lax) { //LOGGER.debug("Parsing GML application schema"); XSOMParser parser = new XSOMParser(); try {/*from w w w . j a v a 2 s . c o m*/ parser.setErrorHandler(new GMLSchemaParserErrorHandler()); parser.setEntityResolver(entityResolver); is.setSystemId(baseURI.toString()); parser.parse(is); XSSchemaSet schemas = parser.getResult(); for (GML.VERSION version : GML.VERSION.values()) { XSSchema schema0 = schemas.getSchema(GML.getWord(version, GML.NAMESPACE.URI)); if (schema0 != null) { XSElementDecl a = schema0.getElementDecl(GML.getWord(version, GML.VOCABULARY.ABSTRACT_OBJECT)); if (a != null) { abstractObjectDecl.add(a); } } } XSSchema schema1 = schemas.getSchema("http://www.isotc211.org/2005/gco"); if (schema1 != null) { gcoObjectType = schema1.getElementDecl("AbstractObject").getType(); } for (Map.Entry<String, List<String>> ns : elements.entrySet()) { String nsuri = ns.getKey(); String oldNsUri = null; //LOGGER.debug("namespace {}", nsuri); XSSchema schema = schemas.getSchema(nsuri); // workaround for broken WFSs where FeatureTypes are in different namespaces in Capabilities and Schema // in this case we search in the targetNamespace of the Schema if (schema == null && lax) { LOGGER.info( "Schema for Namespace '{}' not found, searching in targetNamespace schema instead. 
", ns.getKey()); // looks as if the schema for the targetNamespace of the document is always second in the list schema = schemas.getSchema(1); oldNsUri = nsuri; nsuri = schema.getTargetNamespace(); } for (String e : ns.getValue()) { XSElementDecl elem = schema.getElementDecl(e); if (elem != null && elem.getType().isComplexType()) { //LOGGER.debug(" - element {}, type: {}", elem.getName(), elem.getType().getName()); for (GMLSchemaAnalyzer analyzer : analyzers) { if (oldNsUri != null) { analyzer.analyzeNamespaceRewrite(oldNsUri, nsuri, elem.getName()); } analyzer.analyzeFeatureType(nsuri, elem.getName()); for (XSAttributeUse att : elem.getType().asComplexType().getAttributeUses()) { //LOGGER.debug(" - attribute {}, required: {}, type: {}, ns: {}", att.getDecl().getName(), att.isRequired(), att.getDecl().getType().getName(), att.getDecl().getTargetNamespace()); analyzer.analyzeAttribute(att.getDecl().getTargetNamespace(), att.getDecl().getName(), att.getDecl().getType().getName(), att.isRequired()); } } XSParticle particle = elem.getType().asComplexType().getContentType().asParticle(); if (particle != null) { XSTerm term = particle.getTerm(); if (term.isModelGroup()) { complexTypes = new HashSet<String>(); parseGroup(term.asModelGroup(), 1, false); } } } } } } catch (SAXException ex) { // File included in schema not found if (ex.getCause() != null && ex.getCause().getClass().getName().contains("FileNotFoundException")) { LOGGER.error( "The GML application schema provided by the WFS imports schema '{}', but that schema cannot be accessed. A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error.", ex.getCause().getMessage()); throw new SchemaParseException( "The GML application schema provided by the WFS imports schema '{}', but that schema cannot be accessed. 
A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error.", ex.getCause().getMessage()); } String msg = ex.getMessage(); String msgex = ""; if (msg != null && !msg.isEmpty()) { msg = "Parser details: " + msg; msgex = msg.replaceAll("<", "<").replaceAll(">", ">"); } LOGGER.error( "The GML application schema provided by the WFS is invalid. A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error. {}", msg); SchemaParseException spe = new SchemaParseException( "The GML application schema provided by the WFS is invalid. A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error. {}", ""); spe.addDetail(msgex); throw spe; } }
From source file:net.sf.ginp.setup.SetupManagerImpl.java
/**
 * Parses a configuration document with a validating reader.
 *
 * <p>The stream content is first read into memory via {@code GinpUtil.readBufferIntoMemory}.
 * The system id of the input is set to the directory URL that contains the bundled
 * {@code /net/sf/ginp/config/ginp.dtd} resource, and the public id to {@code ginp.dtd}.
 *
 * @param stream the configuration content; this method does not close it
 * @return the parsed document
 * @throws IOException if the bundled DTD resource cannot be located; also declared for
 *         reading the stream
 * @throws DocumentException if the reader fails to parse the content
 */
private Document validateConfig(final InputStream stream) throws IOException, DocumentException {
    SAXReader read = new SAXReader(true);
    InputSource source = new InputSource();
    StringReader stringRead = new StringReader(GinpUtil.readBufferIntoMemory(stream));

    final String dtdResource = "/net/sf/ginp/config/ginp.dtd";
    java.net.URL dtdUrl = this.getClass().getResource(dtdResource);
    if (dtdUrl == null) {
        // getResource returns null for a missing resource; fail with a clear message
        // instead of a NullPointerException.
        throw new IOException("Configuration DTD not found on classpath: " + dtdResource);
    }

    // Keep everything up to and including the last '/', i.e. the directory of the DTD.
    String ginpFile = dtdUrl.toExternalForm();
    ginpFile = ginpFile.substring(0, ginpFile.lastIndexOf("/") + 1);

    source.setSystemId(ginpFile);
    source.setPublicId("ginp.dtd");
    source.setCharacterStream(stringRead);

    return read.read(source);
}