Example usage for org.xml.sax InputSource setSystemId

Introduction

On this page you can find example usages of org.xml.sax.InputSource.setSystemId.

Prototype

public void setSystemId(String systemId)

Source Link

Document

Set the system identifier for this input source.

Usage

From source file:de.tudarmstadt.ukp.dkpro.core.io.ancora.AncoraReader.java

/**
 * Reads the next resource into the given CAS by parsing it as XML with an
 * {@code AncoraHandler}.
 *
 * <p>After parsing, if {@code dropSentencesMissingPosTags} is set, every sentence that
 * contains at least one token without a POS tag is removed from the indexes together with
 * its tokens (and their lemmas / POS annotations), and afterwards every remaining token
 * without a POS tag is removed as well.
 *
 * @param aJCas the CAS to fill.
 * @throws IOException if the resource cannot be read, the POS mapping cannot be configured,
 *         or the XML cannot be parsed (parser and mapping failures are wrapped).
 * @throws CollectionException declared by the overridden method.
 */
@Override
public void getNext(JCas aJCas) throws IOException, CollectionException {
    Resource res = nextFile();
    initCas(aJCas, res);

    // Set up language
    if (getLanguage() != null) {
        aJCas.setDocumentLanguage(getLanguage());
    }

    // Configure mapping only now, because now the language is set in the CAS
    try {
        posMappingProvider.configure(aJCas.getCas());
    } catch (AnalysisEngineProcessException e1) {
        throw new IOException(e1);
    }

    InputStream is = null;
    try {
        is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream());

        // Create handler
        AncoraHandler handler = new AncoraHandler();
        handler.setJCas(aJCas);
        handler.setLogger(getLogger());

        // Parse XML
        SAXParserFactory pf = SAXParserFactory.newInstance();
        SAXParser parser = pf.newSAXParser();

        // The resource location serves as both public and system identifier.
        InputSource source = new InputSource(is);
        source.setPublicId(res.getLocation());
        source.setSystemId(res.getLocation());
        parser.parse(source, handler);
    } catch (ParserConfigurationException | SAXException e) {
        throw new IOException(e);
    } finally {
        closeQuietly(is);
    }

    if (dropSentencesMissingPosTags) {
        // Collect first and remove afterwards so the indexes are not modified while
        // they are being iterated.
        List<FeatureStructure> toRemove = new ArrayList<>();

        // Remove sentences without pos TAGs
        for (Sentence s : select(aJCas, Sentence.class)) {
            boolean remove = false;
            for (Token t : selectCovered(Token.class, s)) {
                if (t.getPos() == null) {
                    toRemove.add(s);
                    remove = true;
                    break;
                }
            }

            if (remove) {
                // Drop every token of the sentence, including those that do have a POS tag.
                for (Token t : selectCovered(Token.class, s)) {
                    toRemove.add(t);
                    if (t.getLemma() != null) {
                        toRemove.add(t.getLemma());
                    }
                    if (t.getPos() != null) {
                        toRemove.add(t.getPos());
                    }
                }
            }
        }

        for (FeatureStructure fs : toRemove) {
            aJCas.getCas().removeFsFromIndexes(fs);
        }

        // Remove tokens without pos tags that are located *BETWEEN* sentences!
        toRemove.clear();
        for (Token t : select(aJCas, Token.class)) {
            if (t.getPos() == null) {
                // The POS is known to be null here, so only the token and its lemma
                // can need removal.
                toRemove.add(t);
                if (t.getLemma() != null) {
                    toRemove.add(t.getLemma());
                }
            }
        }

        for (FeatureStructure fs : toRemove) {
            aJCas.getCas().removeFsFromIndexes(fs);
        }
    }
}

From source file:com.twinsoft.convertigo.engine.util.XMLUtils.java

/**
 * Parses the given XML text and returns the pretty-printed form of the resulting document.
 *
 * @param sDocument the XML text to parse.
 * @param relativeUriResolver the value set as system identifier on the input source
 *        before parsing.
 * @return the result of {@code XMLUtils.prettyPrintDOM} for the parsed document.
 * @throws ParserConfigurationException declared; see {@code getDefaultDocumentBuilder()}.
 * @throws SAXException if the text cannot be parsed.
 * @throws IOException if reading the text fails.
 */
public static String prettyPrintDOM(String sDocument, String relativeUriResolver)
        throws ParserConfigurationException, SAXException, IOException {
    StringReader xmlText = new StringReader(sDocument);
    InputSource xmlSource = new InputSource(xmlText);
    xmlSource.setSystemId(relativeUriResolver);

    Document parsed = getDefaultDocumentBuilder().parse(xmlSource);
    String pretty = XMLUtils.prettyPrintDOM(parsed);
    return pretty;
}

From source file:com.cyberway.issue.crawler.settings.XMLSettingsHandler.java

/** Read the CrawlerSettings object from a specific file.
 *
 * @param settings the settings object to be updated with data from the
 *                 persistent storage./* w w w.  j  a v  a2  s. c  o m*/
 * @param f the file to read from.
 * @return the updated settings object or null if there was no data for this
 *         in the persistent storage.
 */
protected final CrawlerSettings readSettingsObject(CrawlerSettings settings, File f) {
    CrawlerSettings result = null;
    try {
        InputStream is = null;
        if (!f.exists()) {
            // Perhaps the file we're looking for is on the CLASSPATH.
            // DON'T look on the CLASSPATH for 'settings.xml' files.  The
            // look for 'settings.xml' files happens frequently. Not looking
            // on classpath for 'settings.xml' is an optimization based on
            // ASSUMPTION that there will never be a 'settings.xml' saved
            // on classpath.
            if (!f.getName().startsWith(settingsFilename)) {
                is = XMLSettingsHandler.class.getResourceAsStream(f.getPath());
            }
        } else {
            is = new FileInputStream(f);
        }
        if (is != null) {
            XMLReader parser = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
            InputStream file = new BufferedInputStream(is);
            parser.setContentHandler(new CrawlSettingsSAXHandler(settings));
            InputSource source = new InputSource(file);
            source.setSystemId(f.toURL().toExternalForm());
            parser.parse(source);
            result = settings;
        }
    } catch (SAXParseException e) {
        logger.warning(e.getMessage() + " in '" + e.getSystemId() + "', line: " + e.getLineNumber()
                + ", column: " + e.getColumnNumber());
    } catch (SAXException e) {
        logger.warning(e.getMessage() + ": " + e.getException().getMessage());
    } catch (ParserConfigurationException e) {
        logger.warning(e.getMessage() + ": " + e.getCause().getMessage());
    } catch (FactoryConfigurationError e) {
        logger.warning(e.getMessage() + ": " + e.getException().getMessage());
    } catch (IOException e) {
        logger.warning("Could not access file '" + f.getAbsolutePath() + "': " + e.getMessage());
    }
    return result;
}

From source file:de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader.java

/**
 * Advances {@code currentTeiElement} to the next TEI element, opening further files as
 * needed. Sets {@code currentTeiElement} to {@code null} when nothing is left and clears
 * {@code teiElementIterator} once both the files and the current element list are used up.
 *
 * @throws CollectionException declared for the caller's benefit.
 * @throws IOException if a file cannot be read, parsed, or queried.
 */
private void nextTeiElement() throws CollectionException, IOException {
    // A null iterator is the end-of-processing marker set at the bottom of this method.
    if (teiElementIterator == null) {
        currentTeiElement = null;
        return;
    }

    // Keep opening files until one provides TEI elements or no files are left.
    while (!teiElementIterator.hasNext() && super.hasNext()) {
        currentResource = nextFile();

        InputStream stream = null;
        try {
            stream = currentResource.getInputStream();

            boolean gzipped = currentResource.getPath().endsWith(".gz");
            if (gzipped) {
                stream = new GZIPInputStream(stream);
            }

            InputSource xmlSource = new InputSource(stream);
            xmlSource.setPublicId(currentResource.getLocation());
            xmlSource.setSystemId(currentResource.getLocation());

            SAXReader saxReader = new SAXReader();
            Document parsed = saxReader.read(xmlSource);

            // Select every TEI root element in the TEI namespace.
            final XPath teiSelector = new Dom4jXPath("//tei:TEI");
            teiSelector.addNamespace("tei", "http://www.tei-c.org/ns/1.0");

            List<Element> found = teiSelector.selectNodes(parsed);

            teiElementIterator = found.iterator();
            currentTeiElementNumber = 0;
        } catch (DocumentException | JaxenException e) {
            throw new IOException(e);
        } finally {
            closeQuietly(stream);
        }
    }

    if (teiElementIterator.hasNext()) {
        currentTeiElement = teiElementIterator.next();
    } else {
        currentTeiElement = null;
    }
    currentTeiElementNumber++;

    // Mark end of processing once neither files nor elements remain.
    if (!(super.hasNext() || teiElementIterator.hasNext())) {
        teiElementIterator = null;
    }
}

From source file:de.tudarmstadt.ukp.clarin.webanno.tei.TeiReader.java

/**
 * Moves on to the next TEI element, loading the next file whenever the elements of the
 * current one are exhausted. Leaves {@code currentTeiElement} as {@code null} when there
 * is no further element and resets {@code teiElementIterator} to {@code null} at the end
 * of the input.
 *
 * @throws CollectionException declared for the caller's benefit.
 * @throws IOException if a file cannot be read, parsed, or queried.
 */
private void nextTeiElement() throws CollectionException, IOException {
    if (teiElementIterator == null) {
        // Processing has already been marked as finished.
        currentTeiElement = null;
        return;
    }

    while (!teiElementIterator.hasNext() && super.hasNext()) {
        currentResource = nextFile();

        InputStream input = null;

        try {
            input = currentResource.getInputStream();

            // Transparently decompress gzipped resources.
            if (currentResource.getPath().endsWith(".gz")) {
                input = new GZIPInputStream(input);
            }

            InputSource inputSource = new InputSource(input);
            inputSource.setPublicId(currentResource.getLocation());
            inputSource.setSystemId(currentResource.getLocation());

            Document document = new SAXReader().read(inputSource);

            final XPath teiXPath = new Dom4jXPath("//tei:TEI");
            teiXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");

            @SuppressWarnings("unchecked")
            List<Element> matches = teiXPath.selectNodes(document);

            teiElementIterator = matches.iterator();
            currentTeiElementNumber = 0;
        } catch (DocumentException e) {
            throw new IOException(e);
        } catch (JaxenException e) {
            throw new IOException(e);
        } finally {
            closeQuietly(input);
        }
    }

    if (!teiElementIterator.hasNext()) {
        currentTeiElement = null;
    } else {
        currentTeiElement = teiElementIterator.next();
    }
    currentTeiElementNumber++;

    boolean exhausted = !super.hasNext() && !teiElementIterator.hasNext();
    if (exhausted) {
        // Mark end of processing.
        teiElementIterator = null;
    }
}

From source file:net.sourceforge.dita4publishers.impl.dita.InMemoryDitaLinkManagementService.java

/**
 * Loads the resource at the given URL into a DOM and returns the element the URL
 * addresses: the element identified by the fragment identifier when the URL has a
 * non-empty one, otherwise the document's implicit element.
 *
 * @param keyAccessOptions Options that control access to resources.
 * @param resUrl Absolute URL of the target resource.
 * @return Root element of the target resource (topic or map).
 * @throws DitaApiException if the resource cannot be opened or parsed into a DOM.
 */
private Element resolveUriToElement(KeyAccessOptions keyAccessOptions, URL resUrl) throws DitaApiException {
    String urlString = resUrl.toExternalForm();

    // Split once at the first '#': the part before it locates the resource, the part
    // after it is the fragment identifier. A URL that ends in a bare '#' yields an
    // empty fragment and is treated like a URL without one (previously this case
    // failed with an ArrayIndexOutOfBoundsException).
    int hashPos = urlString.indexOf('#');
    String resUrlString = urlString;
    String fragId = "";
    if (hashPos >= 0) {
        resUrlString = urlString.substring(0, hashPos);
        fragId = urlString.substring(hashPos + 1);
    }

    Document doc = null;
    try {
        java.io.InputStream in = resUrl.openStream();
        try {
            InputSource src = new InputSource(in);
            src.setSystemId(resUrlString);
            doc = DomUtil.getDomForSource(src, bosOptions, false);
        } finally {
            // Release the connection/stream whether or not parsing succeeded.
            try {
                in.close();
            } catch (java.io.IOException ignored) {
                // A failure to close a stream we only read from is not actionable.
            }
        }
    } catch (Exception e) {
        throw new DitaApiException("Exception contructing DOM from URL " + resUrl + ": " + e.getMessage(), e);
    }

    if (fragId.length() > 0) {
        return DitaUtil.resolveDitaFragmentId(doc, fragId);
    }
    return DitaUtil.getImplicitElementFromDoc(doc);
}

From source file:crawlercommons.sitemaps.SiteMapParser.java

/**
 * Decompress the gzipped content and process the resulting XML Sitemap.
 * /*  www.ja  va2s  .c  o  m*/
 * @param url
 *            - URL of the gzipped content
 * @param response
 *            - Gzipped content
 * @return the site map
 * @throws MalformedURLException
 * @throws IOException
 * @throws UnknownFormatException
 */
protected AbstractSiteMap processGzip(URL url, byte[] response)
        throws MalformedURLException, IOException, UnknownFormatException {

    LOG.debug("Processing gzip");

    AbstractSiteMap smi;
    InputStream is = new ByteArrayInputStream(response);

    // Remove .gz ending
    String xmlUrl = url.toString().replaceFirst("\\.gz$", "");

    LOG.debug("XML url = {}", xmlUrl);

    BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is));
    InputSource in = new InputSource(decompressed);
    in.setSystemId(xmlUrl);
    smi = processXml(url, in);
    decompressed.close();
    return smi;
}

From source file:com.andyasprou.webcrawler.Utilities.GenericSiteMapParser.java

/**
 * Decompress the gzipped content and process the resulting XML Sitemap.
 *
 * <p>The system identifier of the XML input is the given URL with a trailing
 * {@code .gz} removed.
 *
 * @param url
 *            - URL of the gzipped content
 * @param response
 *            - Gzipped content
 * @return the site map
 * @throws UnknownFormatException if there is an error parsing the gzip
 * @throws IOException if there is an error reading in the gzip {@link java.net.URL}
 */
protected AbstractSiteMap processGzip(URL url, byte[] response) throws IOException, UnknownFormatException {

    InputStream is = new ByteArrayInputStream(response);

    // Remove .gz ending
    String xmlUrl = url.toString().replaceFirst("\\.gz$", "");

    // try-with-resources closes the decompression stream even when processXml throws.
    try (BOMInputStream decompressed = new BOMInputStream(new GZIPInputStream(is))) {
        InputSource in = new InputSource(decompressed);
        in.setSystemId(xmlUrl);
        return processXml(url, in);
    }
}

From source file:de.ii.xtraplatform.ogc.api.gml.parser.GMLSchemaParser.java

/**
 * Parses a GML application schema and reports the requested elements, their attributes
 * and their content model to the registered analyzers.
 *
 * <p>The system identifier of {@code is} is overwritten with {@code baseURI} before
 * parsing. Namespaces for which no schema can be determined are skipped with a warning.
 *
 * @param is the schema input; its system identifier is replaced.
 * @param elements element names to analyze, keyed by namespace URI.
 * @param lax if {@code true}, a namespace without a matching schema falls back to the
 *        second schema of the parsed schema set, and the analyzers are told about the
 *        namespace rewrite.
 * @throws SchemaParseException if the schema (or a schema it imports) cannot be parsed.
 */
public void parse(InputSource is, Map<String, List<String>> elements, boolean lax) {
    //LOGGER.debug("Parsing GML application schema");
    XSOMParser parser = new XSOMParser();

    try {
        parser.setErrorHandler(new GMLSchemaParserErrorHandler());

        parser.setEntityResolver(entityResolver);

        is.setSystemId(baseURI.toString());

        parser.parse(is);

        XSSchemaSet schemas = parser.getResult();

        for (GML.VERSION version : GML.VERSION.values()) {
            XSSchema schema0 = schemas.getSchema(GML.getWord(version, GML.NAMESPACE.URI));
            if (schema0 != null) {
                XSElementDecl a = schema0.getElementDecl(GML.getWord(version, GML.VOCABULARY.ABSTRACT_OBJECT));
                if (a != null) {
                    abstractObjectDecl.add(a);
                }
            }
        }

        XSSchema schema1 = schemas.getSchema("http://www.isotc211.org/2005/gco");
        if (schema1 != null) {
            // The gco schema may lack the declaration; leave gcoObjectType untouched then.
            XSElementDecl gcoAbstractObject = schema1.getElementDecl("AbstractObject");
            if (gcoAbstractObject != null) {
                gcoObjectType = gcoAbstractObject.getType();
            }
        }

        for (Map.Entry<String, List<String>> ns : elements.entrySet()) {
            String nsuri = ns.getKey();
            String oldNsUri = null;
            //LOGGER.debug("namespace {}", nsuri);

            XSSchema schema = schemas.getSchema(nsuri);

            // workaround for broken WFSs where FeatureTypes are in different namespaces in Capabilities and Schema
            // in this case we search in the targetNamespace of the Schema
            if (schema == null && lax) {
                LOGGER.info(
                        "Schema for Namespace '{}' not found, searching in targetNamespace schema instead. ",
                        ns.getKey());

                // looks as if the schema for the targetNamespace of the document is always second in the list
                schema = schemas.getSchema(1);
                if (schema != null) {
                    oldNsUri = nsuri;
                    nsuri = schema.getTargetNamespace();
                }
            }

            // Without a schema there is nothing to look the elements up in; skip the
            // namespace instead of failing with a NullPointerException below.
            if (schema == null) {
                LOGGER.warn("No schema available for namespace '{}', skipping its elements.", ns.getKey());
                continue;
            }

            for (String e : ns.getValue()) {
                XSElementDecl elem = schema.getElementDecl(e);
                if (elem != null && elem.getType().isComplexType()) {
                    //LOGGER.debug(" - element {}, type: {}", elem.getName(), elem.getType().getName());

                    for (GMLSchemaAnalyzer analyzer : analyzers) {
                        if (oldNsUri != null) {
                            analyzer.analyzeNamespaceRewrite(oldNsUri, nsuri, elem.getName());
                        }
                        analyzer.analyzeFeatureType(nsuri, elem.getName());
                        for (XSAttributeUse att : elem.getType().asComplexType().getAttributeUses()) {
                            //LOGGER.debug("   - attribute {}, required: {}, type: {}, ns: {}", att.getDecl().getName(), att.isRequired(), att.getDecl().getType().getName(), att.getDecl().getTargetNamespace());

                            analyzer.analyzeAttribute(att.getDecl().getTargetNamespace(),
                                    att.getDecl().getName(), att.getDecl().getType().getName(),
                                    att.isRequired());
                        }
                    }
                    XSParticle particle = elem.getType().asComplexType().getContentType().asParticle();
                    if (particle != null) {
                        XSTerm term = particle.getTerm();
                        if (term.isModelGroup()) {
                            // Start with a fresh set for each element before descending into its group.
                            complexTypes = new HashSet<String>();
                            parseGroup(term.asModelGroup(), 1, false);
                        }
                    }
                }
            }
        }
    } catch (SAXException ex) {

        // File included in schema not found
        if (ex.getCause() != null && ex.getCause().getClass().getName().contains("FileNotFoundException")) {
            LOGGER.error(
                    "The GML application schema provided by the WFS imports schema '{}', but that schema cannot be accessed. A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error.",
                    ex.getCause().getMessage());
            throw new SchemaParseException(
                    "The GML application schema provided by the WFS imports schema '{}', but that schema cannot be accessed. A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error.",
                    ex.getCause().getMessage());
        }

        String msg = ex.getMessage();
        String msgex = "";
        if (msg != null && !msg.isEmpty()) {
            msg = "Parser details: " + msg;

            // Escape angle brackets for the detail attached to the exception.
            msgex = msg.replaceAll("<", "&lt;").replaceAll(">", "&gt;");
        }

        LOGGER.error(
                "The GML application schema provided by the WFS is invalid. A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error. {}",
                msg);
        SchemaParseException spe = new SchemaParseException(
                "The GML application schema provided by the WFS is invalid. A valid GML application schema is required to determine the layers of the proxy service and its characteristics.. Please contact the WFS provider to correct the schema error. {}",
                "");

        spe.addDetail(msgex);

        throw spe;
    }
}

From source file:net.sf.ginp.setup.SetupManagerImpl.java

/**
 * Reads the configuration from the given stream into memory and parses it with a
 * {@code SAXReader} created with its validation flag set. The directory that contains
 * the bundled {@code ginp.dtd} resource is used as the system identifier of the input.
 *
 * @param stream the stream supplying the configuration XML.
 * @return the parsed configuration document.
 * @throws IOException if the bundled {@code ginp.dtd} resource cannot be located;
 *         presumably also if the stream cannot be read.
 * @throws DocumentException if the reader fails to parse the configuration.
 */
private Document validateConfig(final InputStream stream) throws IOException, DocumentException {
    SAXReader read = new SAXReader(true);
    InputSource source = new InputSource();
    StringReader stringRead = new StringReader(GinpUtil.readBufferIntoMemory(stream));

    // Fail with a clear message instead of a NullPointerException when the DTD is
    // missing from the classpath.
    java.net.URL dtdUrl = this.getClass().getResource("/net/sf/ginp/config/ginp.dtd");
    if (dtdUrl == null) {
        throw new IOException("Resource /net/sf/ginp/config/ginp.dtd not found on the classpath");
    }

    // Keep everything up to and including the last '/', i.e. the DTD's directory.
    String ginpFile = dtdUrl.toExternalForm();
    ginpFile = ginpFile.substring(0, ginpFile.lastIndexOf("/") + 1);
    source.setSystemId(ginpFile);
    source.setPublicId("ginp.dtd");
    source.setCharacterStream(stringRead);

    return read.read(source);
}