List of usage examples for org.xml.sax.InputSource.setEncoding
public void setEncoding(String encoding)
From source file:org.corpus_tools.pepper.core.PepperJobImpl.java
/** * {@inheritDoc PepperJob#load(URI)}/* w w w. j av a 2 s . c o m*/ */ @Override public void load(URI uri) { if (uri.isFile()) { File wdFile = new File(uri.toFileString()); // set folder containing workflow description as base dir setBaseDir(uri.trimSegments(1)); SAXParser parser; XMLReader xmlReader; SAXParserFactory factory = SAXParserFactory.newInstance(); WorkflowDescriptionReader contentHandler = new WorkflowDescriptionReader(); contentHandler.setPepperJob(this); contentHandler.setLocation(uri); // remove all existing steps clear(); try { parser = factory.newSAXParser(); xmlReader = parser.getXMLReader(); xmlReader.setContentHandler(contentHandler); } catch (ParserConfigurationException e) { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '" + wdFile.getAbsolutePath() + "': " + e.getMessage() + ". ", e); } catch (Exception e) { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '" + wdFile.getAbsolutePath() + "': " + e.getMessage() + ". ", e); } try { InputStream inputStream = new FileInputStream(wdFile); Reader reader = new InputStreamReader(inputStream, "UTF-8"); InputSource is = new InputSource(reader); is.setEncoding("UTF-8"); xmlReader.parse(is); } catch (SAXException e) { try { parser = factory.newSAXParser(); xmlReader = parser.getXMLReader(); xmlReader.setContentHandler(contentHandler); xmlReader.parse(wdFile.getAbsolutePath()); } catch (Exception e1) { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '" + wdFile.getAbsolutePath() + "': " + e1.getMessage() + ". ", e1); } } catch (Exception e) { if (e instanceof PepperModuleException) { throw (PepperModuleException) e; } else { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file'" + wdFile + "', because of a nested exception: " + e.getMessage() + ". 
", e); } } } else { throw new UnsupportedOperationException( "Currently Pepper can only load workflow description from local files."); } }
From source file:org.corpus_tools.salt.util.SaltUtil.java
/** * Loads a list of root objects coming from a SaltXML (.{@link #FILE_ENDING_SALT_XML}) * and returns it.//w w w .j a va2s . co m * * @param objectURI * {@link URI} to SaltXML file containing the object * @return loaded objects */ public static List<Object> loadObjects(URI location) { if (location == null) { throw new SaltResourceException("Cannot load Salt object, because the given uri is null."); } File objectFile = new File( (location.toFileString() == null) ? location.toString() : location.toFileString()); if (!objectFile.exists()) { throw new SaltResourceException("Cannot load Salt object, because the file '" + objectFile.getAbsolutePath() + "' does not exist."); } SAXParser parser; XMLReader xmlReader; SAXParserFactory factory = SAXParserFactory.newInstance(); SaltXML10Handler contentHandler = new SaltXML10Handler(); try { parser = factory.newSAXParser(); xmlReader = parser.getXMLReader(); xmlReader.setContentHandler(contentHandler); } catch (ParserConfigurationException e) { throw new SaltResourceException( "Cannot load Salt object from file '" + objectFile.getAbsolutePath() + "'.", e); } catch (Exception e) { throw new SaltResourceException( "Cannot load Salt object from file '" + objectFile.getAbsolutePath() + "'.", e); } try { InputStream inputStream = new FileInputStream(objectFile); Reader reader = new InputStreamReader(inputStream, "UTF-8"); InputSource is = new InputSource(reader); is.setEncoding("UTF-8"); xmlReader.parse(is); } catch (SAXException e) { try { parser = factory.newSAXParser(); xmlReader = parser.getXMLReader(); xmlReader.setContentHandler(contentHandler); xmlReader.parse(objectFile.getAbsolutePath()); } catch (Exception e1) { throw new SaltResourceException( "Cannot load Salt object from file '" + objectFile.getAbsolutePath() + "'.", e1); } } catch (Exception e) { if (e instanceof SaltException) { throw (SaltException) e; } else { throw new SaltResourceException( "Cannot load Salt object from file'" + objectFile + "', because of a 
nested exception. ", e); } } return contentHandler.getRootObjects(); }
From source file:org.eclipse.smila.connectivity.framework.crawler.web.parse.html.HtmlParser.java
/** * Returns the {@link Parse} result for the given {@link Content}. * /*from w ww . java2s. c o m*/ * @param content * Content to be parsed. * * @return Parse */ public Parse getParse(Content content) { final HTMLMetaTags metaTags = new HTMLMetaTags(); URL base = null; try { base = new URL(content.getBaseUrl()); } catch (MalformedURLException exception) { return new ParseStatus(exception).getEmptyParse(getConf()); } String title = ""; String text = ""; Outlink[] outlinks = new Outlink[0]; final List<Outlink> links = new ArrayList<Outlink>(); final Metadata metadata = new Metadata(); // parse the content DocumentFragment root = null; try { final byte[] contentInOctets = content.getContent(); final InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets)); final String contentType = content.getMetadata().get(Response.CONTENT_TYPE); if (!(s_textPattern.matcher(contentType).find() || s_htmlPattern.matcher(contentType).find())) { final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); final ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata, metaTags); parseData.setConf(this._configuration); final Parse parse = new ParseImpl(text, parseData); return parse; } String encoding = StringUtil.parseCharacterEncoding(contentType); if ((encoding != null) && !("".equals(encoding))) { metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding); encoding = StringUtil.resolveEncodingAlias(encoding); if (encoding != null) { metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding); if (_log.isTraceEnabled()) { _log.trace(base + ": setting encoding to " + encoding); } } } // sniff out 'charset' value from the beginning of a document if ((encoding == null) || ("".equals(encoding))) { encoding = sniffCharacterEncoding(contentInOctets); if (encoding != null) { metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding); encoding = StringUtil.resolveEncodingAlias(encoding); if (encoding != null) { 
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding); if (_log.isTraceEnabled()) { _log.trace(base + ": setting encoding to " + encoding); } } } } if (encoding == null) { // fallback encoding. // FIXME : In addition to the global fallback value, // we should make it possible to specify fallback encodings for each // ccTLD. // (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5 // doesn't work for jp because euc-jp and shift_jis have about the // same share) encoding = _defaultCharEncoding; metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, _defaultCharEncoding); if (_log.isTraceEnabled()) { _log.trace(base + ": falling back to " + _defaultCharEncoding); } } input.setEncoding(encoding); if (_log.isTraceEnabled()) { _log.trace("Parsing..."); } root = parse(input); } catch (IOException exception) { return new ParseStatus(exception).getEmptyParse(getConf()); } catch (DOMException exception) { return new ParseStatus(exception).getEmptyParse(getConf()); } catch (SAXException exception) { return new ParseStatus(exception).getEmptyParse(getConf()); } catch (Exception exception) { _log.error("Unknown parsing error", exception); return new ParseStatus(exception).getEmptyParse(getConf()); } // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (_log.isTraceEnabled()) { _log.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives // ok to index if (!metaTags.getNoIndex()) { if (_log.isDebugEnabled()) { _log.debug("Getting title"); } final StringBuffer textBuffer = new StringBuffer(); _utils.getText(textBuffer, root); text = textBuffer.toString(); if (_log.isDebugEnabled()) { _log.debug("Getting title"); } final StringBuffer titleBuffer = new StringBuffer(); _utils.getTitle(titleBuffer, root); title = titleBuffer.toString().trim(); } // ok to follow links if (!metaTags.getNoFollow()) { // extract outlinks final URL baseTag = _utils.getBase(root); if (_log.isTraceEnabled()) { _log.trace("Getting 
links..."); } if (baseTag == null) { _utils.getOutlinks(base, links, root); } else { _utils.getOutlinks(baseTag, links, root); } if (_log.isDebugEnabled()) { _log.debug("found " + links.size() + " outlinks in " + content.getUrl()); for (Outlink outlink : links) { _log.debug(outlink.toString()); } } } final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); status.setMessage(metaTags.getRefreshHref().toString()); } if (_javascriptParser != null) { final List<Outlink> javascriptLinks = new ArrayList<Outlink>(); _javascriptParser.setConf(_configuration); _utils.setJavascriptParser(_javascriptParser); _utils.getJavascriptOutlinks(base.toString(), javascriptLinks, root); if (_log.isDebugEnabled()) { _log.debug("found " + javascriptLinks.size() + " javascript outlinks in " + content.getUrl()); for (Outlink outlink : javascriptLinks) { _log.debug(outlink.toString()); } } links.addAll(javascriptLinks); } outlinks = (Outlink[]) links.toArray(new Outlink[links.size()]); final ParseData parseData = new ParseData(status, title, outlinks, content.getMetadata(), metadata, metaTags); parseData.setConf(_configuration); final Parse parse = new ParseImpl(text, parseData); return parse; }
From source file:org.exist.collections.Collection.java
private InputSource closeShieldInputSource(final InputSource source) { final InputSource protectedInputSource = new InputSource(); protectedInputSource.setEncoding(source.getEncoding()); protectedInputSource.setSystemId(source.getSystemId()); protectedInputSource.setPublicId(source.getPublicId()); if (source.getByteStream() != null) { //TODO consider AutoCloseInputStream final InputStream closeShieldByteStream = new CloseShieldInputStream(source.getByteStream()); protectedInputSource.setByteStream(closeShieldByteStream); }//from w w w . jav a 2s.c o m if (source.getCharacterStream() != null) { //TODO consider AutoCloseReader final Reader closeShieldReader = new CloseShieldReader(source.getCharacterStream()); protectedInputSource.setCharacterStream(closeShieldReader); } return protectedInputSource; }
From source file:org.exist.collections.MutableCollection.java
private InputSource closeShieldInputSource(final InputSource source) { final InputSource protectedInputSource = new InputSource(); protectedInputSource.setEncoding(source.getEncoding()); protectedInputSource.setSystemId(source.getSystemId()); protectedInputSource.setPublicId(source.getPublicId()); if (source.getByteStream() != null) { //TODO consider AutoCloseInputStream final InputStream closeShieldByteStream = new CloseShieldInputStream(source.getByteStream()); protectedInputSource.setByteStream(closeShieldByteStream); }//from www . ja v a 2 s .c om if (source.getCharacterStream() != null) { //TODO consider AutoCloseReader final Reader closeShieldReader = new CloseShieldReader(source.getCharacterStream()); protectedInputSource.setCharacterStream(closeShieldReader); } return protectedInputSource; }
From source file:org.geoserver.test.GeoServerSystemTestSupport.java
/**
 * Parses the given stream into a namespace-aware DOM {@link Document}.
 *
 * @param stream
 *            the XML content to parse
 * @param skipDTD
 *            when true, validation is switched off and an
 *            {@code EmptyResolver} is installed as entity resolver
 *            (presumably so that referenced DTDs are not fetched - confirm
 *            against {@code EmptyResolver})
 * @param encoding
 *            encoding to declare on the input source, or null to leave it
 *            unset
 * @return the parsed document
 */
protected Document dom(InputStream stream, boolean skipDTD, String encoding)
        throws ParserConfigurationException, SAXException, IOException {
    InputSource source = new InputSource(stream);
    if (encoding != null) {
        source.setEncoding(encoding);
    }

    // both modes share a namespace-aware factory; only the DTD handling differs
    DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    builderFactory.setNamespaceAware(true);
    if (skipDTD) {
        builderFactory.setValidating(false);
    }

    DocumentBuilder documentBuilder = builderFactory.newDocumentBuilder();
    if (skipDTD) {
        documentBuilder.setEntityResolver(new EmptyResolver());
    }
    return documentBuilder.parse(source);
}
From source file:org.kalypso.ogc.gml.GisTemplateHelper.java
/**
 * Reads a {@link Gismapview} from the contents of the given storage.
 * <p>
 * If the storage is an {@link IEncodedStorage}, its charset is set as the
 * encoding of the input source. The content stream is closed explicitly after
 * a successful load, so a failure to close is reported to the caller; on any
 * other exit path it is closed quietly.
 *
 * @param storage
 *          the storage to read the map view from
 * @return the loaded map view
 */
public static final Gismapview loadGisMapView(final IStorage storage)
    throws JAXBException, CoreException, SAXException, ParserConfigurationException, IOException {
  final InputStream contents = storage.getContents();
  try {
    final InputSource source = new InputSource(contents);
    if (storage instanceof IEncodedStorage) {
      final String charset = ((IEncodedStorage) storage).getCharset();
      source.setEncoding(charset);
    }

    final Gismapview result = GisTemplateHelper.loadGisMapView(source);
    contents.close();
    return result;
  } finally {
    IOUtils.closeQuietly(contents);
  }
}
From source file:org.kalypso.ogc.gml.GisTemplateHelper.java
/**
 * Reads a {@link Gistableview} from the contents of the given storage.
 * <p>
 * If the storage is an {@link IEncodedStorage}, its charset is set as the
 * encoding of the input source. The content stream is always closed (quietly)
 * before this method returns, matching {@code loadGisMapView(IStorage)}.
 *
 * @param storage
 *          the storage to read the table view from
 * @return the loaded table view
 */
public static final Gistableview loadGisTableview(final IStorage storage) throws CoreException, JAXBException {
  final InputStream inputStream = storage.getContents();
  try {
    final InputSource is = new InputSource(inputStream);
    if (storage instanceof IEncodedStorage) {
      is.setEncoding(((IEncodedStorage) storage).getCharset());
    }

    return GisTemplateHelper.loadGisTableview(is);
  } finally {
    // closed quietly because the signature does not declare IOException
    IOUtils.closeQuietly(inputStream);
  }
}
From source file:org.kalypso.ogc.gml.GisTemplateHelper.java
/**
 * Reads a {@link Gistreeview} from the contents of the given storage.
 * <p>
 * If the storage is an {@link IEncodedStorage}, its charset is set as the
 * encoding of the input source. The content stream is always closed (quietly)
 * before this method returns, matching {@code loadGisMapView(IStorage)}.
 *
 * @param file
 *          the storage to read the tree view from
 * @return the loaded tree view
 */
public static final Gistreeview loadGisTreeView(final IStorage file) throws JAXBException, CoreException {
  final InputStream inputStream = file.getContents();
  try {
    final InputSource is = new InputSource(inputStream);
    if (file instanceof IEncodedStorage) {
      is.setEncoding(((IEncodedStorage) file).getCharset());
    }

    return GisTemplateHelper.loadGisTreeView(is);
  } finally {
    // closed quietly because the signature does not declare IOException
    IOUtils.closeQuietly(inputStream);
  }
}
From source file:org.lockss.plugin.clockss.cambridge.CambridgeXPathXmlMetadataParser.java
@Override protected InputSource makeInputSource(CachedUrl cu) throws IOException { String url = cu.getUrl();/*from w ww.ja v a 2 s .c om*/ // If this is an sgml file, we need to make it conform to xml rules if ((url != null) && url.endsWith(".sgm")) { log.debug3("filtering sgml in to conforming xml"); Pair<Reader, String> sgmlReaderPair = makeInputSourceReader(cu); // clean up non-terminated tags Reader xmlReader = new CambridgeSgmlAdapter(sgmlReaderPair.getLeft()); InputSource is = new InputSource(xmlReader); is.setEncoding(sgmlReaderPair.getRight()); return is; } // This already was an xml file return super.makeInputSource(cu); }