Example usage for org.xml.sax InputSource setEncoding

List of usage examples for org.xml.sax InputSource setEncoding

Introduction

On this page you can find example usages of org.xml.sax.InputSource.setEncoding.

Prototype

public void setEncoding(String encoding) 

Source Link

Document

Set the character encoding, if known.

Usage

From source file:org.corpus_tools.pepper.core.PepperJobImpl.java

/**
 * {@inheritDoc PepperJob#load(URI)}/*  w w  w.  j  av  a  2 s  . c o m*/
 */
@Override
public void load(URI uri) {
    if (uri.isFile()) {
        File wdFile = new File(uri.toFileString());
        // set folder containing workflow description as base dir
        setBaseDir(uri.trimSegments(1));

        SAXParser parser;
        XMLReader xmlReader;
        SAXParserFactory factory = SAXParserFactory.newInstance();

        WorkflowDescriptionReader contentHandler = new WorkflowDescriptionReader();
        contentHandler.setPepperJob(this);
        contentHandler.setLocation(uri);

        // remove all existing steps
        clear();

        try {
            parser = factory.newSAXParser();
            xmlReader = parser.getXMLReader();
            xmlReader.setContentHandler(contentHandler);
        } catch (ParserConfigurationException e) {
            throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '"
                    + wdFile.getAbsolutePath() + "': " + e.getMessage() + ". ", e);
        } catch (Exception e) {
            throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '"
                    + wdFile.getAbsolutePath() + "': " + e.getMessage() + ". ", e);
        }
        try {
            InputStream inputStream = new FileInputStream(wdFile);
            Reader reader = new InputStreamReader(inputStream, "UTF-8");
            InputSource is = new InputSource(reader);
            is.setEncoding("UTF-8");
            xmlReader.parse(is);
        } catch (SAXException e) {
            try {
                parser = factory.newSAXParser();
                xmlReader = parser.getXMLReader();
                xmlReader.setContentHandler(contentHandler);
                xmlReader.parse(wdFile.getAbsolutePath());
            } catch (Exception e1) {
                throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '"
                        + wdFile.getAbsolutePath() + "': " + e1.getMessage() + ". ", e1);
            }
        } catch (Exception e) {
            if (e instanceof PepperModuleException) {
                throw (PepperModuleException) e;
            } else {
                throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file'"
                        + wdFile + "', because of a nested exception: " + e.getMessage() + ". ", e);
            }
        }
    } else {
        throw new UnsupportedOperationException(
                "Currently Pepper can only load workflow description from local files.");
    }
}

From source file:org.corpus_tools.salt.util.SaltUtil.java

/**
 * Loads a list of root objects coming from a SaltXML (.{@link #FILE_ENDING_SALT_XML})
 * and returns it.
 * <p>
 * The file is first parsed through a UTF-8 reader; if that attempt fails with
 * a {@link SAXException}, it is parsed a second time with a fresh reader that
 * opens the file by its absolute path.
 *
 * @param location
 *            {@link URI} to SaltXML file containing the object
 * @return loaded objects
 * @throws SaltResourceException
 *             if {@code location} is null, the file does not exist, or the
 *             file cannot be read or parsed
 */
public static List<Object> loadObjects(URI location) {
    if (location == null) {
        throw new SaltResourceException("Cannot load Salt object, because the given uri is null.");
    }
    String fileString = location.toFileString();
    File objectFile = new File((fileString == null) ? location.toString() : fileString);
    if (!objectFile.exists()) {
        throw new SaltResourceException("Cannot load Salt object, because the file '"
                + objectFile.getAbsolutePath() + "' does not exist.");
    }

    SAXParserFactory factory = SAXParserFactory.newInstance();
    SaltXML10Handler contentHandler = new SaltXML10Handler();

    XMLReader xmlReader;
    try {
        xmlReader = factory.newSAXParser().getXMLReader();
        xmlReader.setContentHandler(contentHandler);
    } catch (Exception e) {
        // covers ParserConfigurationException, SAXException and anything unexpected
        throw new SaltResourceException(
                "Cannot load Salt object from file '" + objectFile.getAbsolutePath() + "'.", e);
    }

    // try-with-resources guarantees the file handle is released, also when parsing fails
    try (InputStream inputStream = new FileInputStream(objectFile);
            Reader reader = new InputStreamReader(inputStream, "UTF-8")) {
        InputSource is = new InputSource(reader);
        // NOTE: per the SAX documentation the encoding is ignored when a
        // character stream is supplied; the call is kept for parity with the
        // previous behaviour.
        is.setEncoding("UTF-8");
        xmlReader.parse(is);
    } catch (SAXException e) {
        // Second attempt with a fresh reader that opens the file itself
        // (presumably so the parser can detect the encoding on its own).
        try {
            XMLReader fallbackReader = factory.newSAXParser().getXMLReader();
            fallbackReader.setContentHandler(contentHandler);
            fallbackReader.parse(objectFile.getAbsolutePath());
        } catch (Exception e1) {
            throw new SaltResourceException(
                    "Cannot load Salt object from file '" + objectFile.getAbsolutePath() + "'.", e1);
        }
    } catch (Exception e) {
        if (e instanceof SaltException) {
            // raised while handling the content; pass it on unchanged
            throw (SaltException) e;
        } else {
            throw new SaltResourceException(
                    "Cannot load Salt object from file'" + objectFile + "', because of a nested exception. ",
                    e);
        }
    }
    return contentHandler.getRootObjects();
}

From source file:org.eclipse.smila.connectivity.framework.crawler.web.parse.html.HtmlParser.java

/**
 * Returns the {@link Parse} result for the given {@link Content}.
 * /*from   w  ww . java2s. c o m*/
 * @param content
 *          Content to be parsed.
 * 
 * @return Parse
 */
public Parse getParse(Content content) {
    final HTMLMetaTags metaTags = new HTMLMetaTags();

    URL base = null;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException exception) {
        return new ParseStatus(exception).getEmptyParse(getConf());
    }

    String title = "";
    String text = "";
    Outlink[] outlinks = new Outlink[0];
    final List<Outlink> links = new ArrayList<Outlink>();

    final Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root = null;
    try {
        final byte[] contentInOctets = content.getContent();
        final InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
        final String contentType = content.getMetadata().get(Response.CONTENT_TYPE);

        if (!(s_textPattern.matcher(contentType).find() || s_htmlPattern.matcher(contentType).find())) {
            final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
            final ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata,
                    metaTags);
            parseData.setConf(this._configuration);
            final Parse parse = new ParseImpl(text, parseData);
            return parse;
        }

        String encoding = StringUtil.parseCharacterEncoding(contentType);
        if ((encoding != null) && !("".equals(encoding))) {
            metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
            encoding = StringUtil.resolveEncodingAlias(encoding);
            if (encoding != null) {
                metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
                if (_log.isTraceEnabled()) {
                    _log.trace(base + ": setting encoding to " + encoding);
                }
            }
        }

        // sniff out 'charset' value from the beginning of a document
        if ((encoding == null) || ("".equals(encoding))) {
            encoding = sniffCharacterEncoding(contentInOctets);
            if (encoding != null) {
                metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
                encoding = StringUtil.resolveEncodingAlias(encoding);
                if (encoding != null) {
                    metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
                    if (_log.isTraceEnabled()) {
                        _log.trace(base + ": setting encoding to " + encoding);
                    }
                }
            }
        }

        if (encoding == null) {
            // fallback encoding.
            // FIXME : In addition to the global fallback value,
            // we should make it possible to specify fallback encodings for each
            // ccTLD.
            // (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5
            // doesn't work for jp because euc-jp and shift_jis have about the
            // same share)
            encoding = _defaultCharEncoding;
            metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, _defaultCharEncoding);
            if (_log.isTraceEnabled()) {
                _log.trace(base + ": falling back to " + _defaultCharEncoding);
            }
        }

        input.setEncoding(encoding);
        if (_log.isTraceEnabled()) {
            _log.trace("Parsing...");
        }
        root = parse(input);
    } catch (IOException exception) {
        return new ParseStatus(exception).getEmptyParse(getConf());
    } catch (DOMException exception) {
        return new ParseStatus(exception).getEmptyParse(getConf());
    } catch (SAXException exception) {
        return new ParseStatus(exception).getEmptyParse(getConf());
    } catch (Exception exception) {
        _log.error("Unknown parsing error", exception);
        return new ParseStatus(exception).getEmptyParse(getConf());
    }

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (_log.isTraceEnabled()) {
        _log.trace("Meta tags for " + base + ": " + metaTags.toString());
    }
    // check meta directives
    // ok to index
    if (!metaTags.getNoIndex()) {
        if (_log.isDebugEnabled()) {
            _log.debug("Getting title");
        }
        final StringBuffer textBuffer = new StringBuffer();
        _utils.getText(textBuffer, root);
        text = textBuffer.toString();

        if (_log.isDebugEnabled()) {
            _log.debug("Getting title");
        }
        final StringBuffer titleBuffer = new StringBuffer();
        _utils.getTitle(titleBuffer, root);
        title = titleBuffer.toString().trim();
    }

    // ok to follow links
    if (!metaTags.getNoFollow()) {
        // extract outlinks
        final URL baseTag = _utils.getBase(root);
        if (_log.isTraceEnabled()) {
            _log.trace("Getting links...");
        }
        if (baseTag == null) {
            _utils.getOutlinks(base, links, root);
        } else {
            _utils.getOutlinks(baseTag, links, root);
        }
        if (_log.isDebugEnabled()) {
            _log.debug("found " + links.size() + " outlinks in " + content.getUrl());
            for (Outlink outlink : links) {
                _log.debug(outlink.toString());
            }
        }
    }

    final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setMessage(metaTags.getRefreshHref().toString());
    }

    if (_javascriptParser != null) {
        final List<Outlink> javascriptLinks = new ArrayList<Outlink>();
        _javascriptParser.setConf(_configuration);
        _utils.setJavascriptParser(_javascriptParser);
        _utils.getJavascriptOutlinks(base.toString(), javascriptLinks, root);
        if (_log.isDebugEnabled()) {
            _log.debug("found " + javascriptLinks.size() + " javascript outlinks in " + content.getUrl());
            for (Outlink outlink : javascriptLinks) {
                _log.debug(outlink.toString());
            }
        }
        links.addAll(javascriptLinks);
    }

    outlinks = (Outlink[]) links.toArray(new Outlink[links.size()]);
    final ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata,
            metaTags);
    parseData.setConf(_configuration);
    final Parse parse = new ParseImpl(text, parseData);
    return parse;
}

From source file:org.exist.collections.Collection.java

/**
 * Builds a copy of the given {@link InputSource} whose byte and character
 * streams (where present) are wrapped in close-shield wrappers. Encoding,
 * system id and public id are carried over unchanged.
 *
 * @param source the input source to copy
 * @return a new input source exposing the wrapped streams of {@code source}
 */
private InputSource closeShieldInputSource(final InputSource source) {
    final InputSource shielded = new InputSource();
    shielded.setEncoding(source.getEncoding());
    shielded.setSystemId(source.getSystemId());
    shielded.setPublicId(source.getPublicId());

    //TODO consider AutoCloseInputStream
    final InputStream byteStream = source.getByteStream();
    if (byteStream != null) {
        shielded.setByteStream(new CloseShieldInputStream(byteStream));
    }

    //TODO consider AutoCloseReader
    final Reader characterStream = source.getCharacterStream();
    if (characterStream != null) {
        shielded.setCharacterStream(new CloseShieldReader(characterStream));
    }

    return shielded;
}

From source file:org.exist.collections.MutableCollection.java

/**
 * Creates a new {@link InputSource} mirroring {@code source}: encoding, system
 * id and public id are copied, and any byte or character stream is wrapped in
 * a close-shield wrapper before being attached to the copy.
 *
 * @param source the input source to mirror
 * @return the mirrored input source with wrapped streams
 */
private InputSource closeShieldInputSource(final InputSource source) {
    final InputSource copy = new InputSource();
    copy.setEncoding(source.getEncoding());
    copy.setSystemId(source.getSystemId());
    copy.setPublicId(source.getPublicId());

    final InputStream originalBytes = source.getByteStream();
    if (originalBytes != null) {
        //TODO consider AutoCloseInputStream
        copy.setByteStream(new CloseShieldInputStream(originalBytes));
    }

    final Reader originalChars = source.getCharacterStream();
    if (originalChars != null) {
        //TODO consider AutoCloseReader
        copy.setCharacterStream(new CloseShieldReader(originalChars));
    }

    return copy;
}

From source file:org.geoserver.test.GeoServerSystemTestSupport.java

/**
 * Parses the given stream into a namespace-aware DOM {@link Document}.
 *
 * @param stream   the XML content to parse
 * @param skipDTD  if {@code true}, validation is switched off explicitly and an
 *                 {@code EmptyResolver} is installed as entity resolver
 *                 (presumably so that external DTDs are not fetched)
 * @param encoding character encoding to announce to the parser, or
 *                 {@code null} to leave it unset
 * @return the parsed document
 */
protected Document dom(InputStream stream, boolean skipDTD, String encoding)
        throws ParserConfigurationException, SAXException, IOException {

    final InputSource source = new InputSource(stream);
    if (encoding != null) {
        source.setEncoding(encoding);
    }

    final DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    builderFactory.setNamespaceAware(true);
    if (skipDTD) {
        builderFactory.setValidating(false);
    }

    final DocumentBuilder documentBuilder = builderFactory.newDocumentBuilder();
    if (skipDTD) {
        documentBuilder.setEntityResolver(new EmptyResolver());
    }

    return documentBuilder.parse(source);
}

From source file:org.kalypso.ogc.gml.GisTemplateHelper.java

/**
 * Loads a {@link Gismapview} from the contents of the given storage. If the
 * storage is an {@link IEncodedStorage}, its charset is passed on as the
 * encoding of the input source.
 *
 * @param storage the storage whose contents are read
 * @return the loaded map view
 */
public static final Gismapview loadGisMapView(final IStorage storage)
        throws JAXBException, CoreException, SAXException, ParserConfigurationException, IOException {
    InputStream contents = null;
    try {
        contents = storage.getContents();

        final InputSource source = new InputSource(contents);
        if (storage instanceof IEncodedStorage) {
            final IEncodedStorage encodedStorage = (IEncodedStorage) storage;
            source.setEncoding(encodedStorage.getCharset());
        }

        final Gismapview mapView = GisTemplateHelper.loadGisMapView(source);
        // explicit close on the success path, so a failure while closing is reported
        contents.close();
        return mapView;
    } finally {
        // safety net for the failure paths; errors while closing are ignored here
        IOUtils.closeQuietly(contents);
    }
}

From source file:org.kalypso.ogc.gml.GisTemplateHelper.java

/**
 * Loads a {@link Gistableview} from the contents of the given storage. If the
 * storage is an {@link IEncodedStorage}, its charset is passed on as the
 * encoding of the input source. The content stream is always closed before
 * this method returns.
 *
 * @param storage the storage whose contents are read
 * @return the loaded table view
 */
public static final Gistableview loadGisTableview(final IStorage storage) throws CoreException, JAXBException {
    InputStream inputStream = null;
    try {
        inputStream = storage.getContents();
        final InputSource is = new InputSource(inputStream);
        if (storage instanceof IEncodedStorage)
            is.setEncoding(((IEncodedStorage) storage).getCharset());
        return GisTemplateHelper.loadGisTableview(is);
    } finally {
        // closed quietly because the signature does not declare IOException
        IOUtils.closeQuietly(inputStream);
    }
}

From source file:org.kalypso.ogc.gml.GisTemplateHelper.java

/**
 * Loads a {@link Gistreeview} from the contents of the given storage. If the
 * storage is an {@link IEncodedStorage}, its charset is passed on as the
 * encoding of the input source. The content stream is always closed before
 * this method returns.
 *
 * @param file the storage whose contents are read
 * @return the loaded tree view
 */
public static final Gistreeview loadGisTreeView(final IStorage file) throws JAXBException, CoreException {
    InputStream inputStream = null;
    try {
        inputStream = file.getContents();
        final InputSource is = new InputSource(inputStream);
        if (file instanceof IEncodedStorage)
            is.setEncoding(((IEncodedStorage) file).getCharset());
        return GisTemplateHelper.loadGisTreeView(is);
    } finally {
        // closed quietly because the signature does not declare IOException
        IOUtils.closeQuietly(inputStream);
    }
}

From source file:org.lockss.plugin.clockss.cambridge.CambridgeXPathXmlMetadataParser.java

/**
 * Creates the input source for the given cached URL. URLs ending in
 * {@code .sgm} are routed through a {@link CambridgeSgmlAdapter} so that the
 * SGML conforms to XML rules; everything else is delegated to the superclass.
 *
 * @param cu the cached URL to read from
 * @return the input source to parse
 * @throws IOException if the underlying source cannot be opened
 */
@Override
protected InputSource makeInputSource(CachedUrl cu) throws IOException {
    final String url = cu.getUrl();
    final boolean isSgml = (url != null) && url.endsWith(".sgm");
    if (!isSgml) {
        // This already was an xml file
        return super.makeInputSource(cu);
    }

    // If this is an sgml file, we need to make it conform to xml rules
    log.debug3("filtering sgml in to conforming xml");
    final Pair<Reader, String> readerAndEncoding = makeInputSourceReader(cu);
    // clean up non-terminated tags
    final InputSource adapted = new InputSource(new CambridgeSgmlAdapter(readerAndEncoding.getLeft()));
    adapted.setEncoding(readerAndEncoding.getRight());
    return adapted;
}