List of usage examples for org.apache.commons.lang3 CharEncoding isSupported
public static boolean isSupported(final String name)
From source file:com.norconex.importer.handler.tagger.AbstractCharStreamTagger.java
/**
 * Resolves a character encoding for the document, wraps the raw stream in an
 * {@link InputStreamReader} using that encoding, and delegates to
 * {@code tagTextDocument}.
 *
 * <p>The encoding is looked up in this order:</p>
 * <ol>
 *   <li>the {@code Content-Encoding} metadata value
 *       (NOTE(review): presumably populated with a charset name upstream
 *       rather than an HTTP transfer coding; confirm against the caller);</li>
 *   <li>the {@code charset} metadata value;</li>
 *   <li>the {@code charset=} parameter of the first {@code Content-Type}
 *       value that mentions "charset".</li>
 * </ol>
 * <p>If the result is blank or not supported according to
 * {@link CharEncoding#isSupported(String)}, UTF-8 is used.</p>
 *
 * @param reference document reference, passed through to {@code tagTextDocument}
 * @param document the document content as a byte stream
 * @param metadata document metadata, read to find the encoding
 * @param parsed passed through to {@code tagTextDocument}
 * @throws ImporterHandlerException if the chosen encoding is rejected when
 *         creating the reader
 */
@Override
protected final void tagApplicableDocument(String reference, InputStream document,
        ImporterMetadata metadata, boolean parsed) throws ImporterHandlerException {

    String charset = metadata.getString("Content-Encoding", null);
    if (charset == null) {
        charset = metadata.getString("charset", null);
    }
    if (charset == null) {
        for (String type : metadata.getStrings("Content-Type")) {
            if (type.contains("charset")) {
                // Keep only the parameter value itself: drop any parameters
                // that follow (e.g. "; format=flowed") and optional quoting,
                // otherwise a valid charset would fail the support check below.
                String value = StringUtils.substringAfter(type, "charset=");
                value = StringUtils.substringBefore(value, ";");
                value = value.replace("\"", "").replace("'", "");
                charset = StringUtils.trimToNull(value);
                break;
            }
        }
    }

    // Fall back to UTF-8 when nothing usable was found.
    if (StringUtils.isBlank(charset) || !CharEncoding.isSupported(charset)) {
        charset = CharEncoding.UTF_8;
    }

    try {
        InputStreamReader is = new InputStreamReader(document, charset);
        tagTextDocument(reference, is, metadata, parsed);
    } catch (UnsupportedEncodingException e) {
        throw new ImporterHandlerException(e);
    }
}
From source file:com.norconex.importer.parser.impl.xfdl.XFDLParser.java
@Override public List<ImporterDocument> parseDocument(ImporterDocument doc, Writer output) throws DocumentParserException { try {//from ww w.j a v a 2s. c om //TODO have a generic utility method for this? BufferedInputStream is = new BufferedInputStream(doc.getContent()); CharsetDetector detector = new CharsetDetector(); detector.enableInputFilter(true); detector.setText(is); CharsetMatch match = detector.detect(); String charset = CharEncoding.UTF_8; if (match != null && CharEncoding.isSupported(match.getName())) { charset = match.getName(); } BufferedReader reader = new BufferedReader(new InputStreamReader(is, charset)); parse(reader, output, doc.getMetadata()); } catch (IOException | ParserConfigurationException | SAXException e) { throw new DocumentParserException("Could not parse " + doc.getReference(), e); } return null; }
From source file:com.norconex.commons.lang.url.URLNormalizer.java
/** * Create a new <code>URLNormalizer</code> instance. * @param url the url to normalize// w w w . java 2 s. co m */ public URLNormalizer(String url) { super(); // make sure URL is valid String fixedURL = url; try { if (StringUtils.contains(fixedURL, " ")) { LOG.warn("URL syntax is invalid as it contains space " + "character(s). Replacing them with %20. URL: " + url); fixedURL = StringUtils.replace(fixedURL, " ", "%20"); } new URI(fixedURL); } catch (URISyntaxException e) { throw new URLException("Invalid URL syntax: " + url, e); } if (!CharEncoding.isSupported(CharEncoding.UTF_8)) { throw new URLException("UTF-8 is not supported by your system."); } this.url = fixedURL.trim(); }