List of usage examples for org.apache.commons.io ByteOrderMark UTF_16LE
ByteOrderMark UTF_16LE
To view the source code for org.apache.commons.io ByteOrderMark UTF_16LE, click the Source Link.
From source file:com.ibm.wala.cast.ipa.callgraph.CAstCallGraphUtil.java
public static SourceFileModule makeSourceModule(URL script, String scriptName) { String hackedName = script.getFile().replaceAll("%5c", "/").replaceAll("%20", " "); File scriptFile = new File(hackedName); assert hackedName.endsWith(scriptName) : scriptName + " does not match file " + script.getFile(); return new SourceFileModule(scriptFile, scriptName, null) { @Override//from www.j a va 2 s.co m public InputStream getInputStream() { BOMInputStream bs = new BOMInputStream(super.getInputStream(), false, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); try { if (bs.hasBOM()) { System.err.println("removing BOM " + bs.getBOM()); } return bs; } catch (IOException e) { return super.getInputStream(); } } }; }
From source file:ca.nines.ise.dom.DOMStream.java
/**
 * Construct a DOMStream from an input stream and record the source of the
 * input data.
 *
 * <p>The stream is probed for a UTF-8, UTF-32 or UTF-16 byte order mark. When one
 * is found its charset is used for decoding and a {@code builder.bom} message is
 * logged; otherwise UTF-8 is assumed. Any encoding other than UTF-8 additionally
 * logs a {@code builder.notutf8} message. Each line is NFKC-normalized and has
 * curly double quotes replaced by straight ones; the first such replacement logs
 * a {@code builder.smartquotes} message.
 *
 * @param in     the stream to read the document from
 * @param source label attached to every logged message
 * @throws java.io.IOException if reading from the stream fails
 */
public DOMStream(InputStream in, String source) throws IOException {
    lines = new ArrayList<>();

    BOMInputStream bomStream = new BOMInputStream(in,
            ByteOrderMark.UTF_8,
            ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
    bom = bomStream.getBOM();
    if (bom == null) {
        encoding = "UTF-8";
    } else {
        Log.addMessage(Message.builder("builder.bom")
                .setSource(source)
                .addNote("The byte order mark was " + bom.getCharsetName())
                .build());
        encoding = bom.getCharsetName();
    }

    if (!encoding.equals("UTF-8")) {
        Log.addMessage(Message.builder("builder.notutf8")
                .setSource(source)
                .addNote("The incorrect encoding is " + encoding)
                .build());
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(bomStream, encoding));
    Pattern smartQuotes = Pattern.compile("\u201C|\u201D");
    StringBuilder text = new StringBuilder();
    boolean smartQuotesReported = false;

    for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
        String current = Normalizer.normalize(raw, Form.NFKC);
        Matcher quoteMatcher = smartQuotes.matcher(current);
        if (quoteMatcher.find()) {
            current = quoteMatcher.replaceAll("\"");
            if (!smartQuotesReported) {
                smartQuotesReported = true;
                // lines.size() is the number of lines stored so far, i.e. before this one is added.
                Log.addMessage(Message.builder("builder.smartquotes")
                        .setSource(source)
                        .addNote("The first occurence of smart quotes was at line " + lines.size())
                        .build());
            }
        }
        lines.add(current);
        text.append(current).append("\n");
    }

    content = text.toString().trim();
}
From source file:com.vistatec.ocelot.xliff.okapi.OkapiXLIFFFactory.java
/**
 * Detects the XLIFF version of a file by reading the version attribute of its
 * first {@code xliff} element that carries one.
 *
 * <p>A leading UTF-8/UTF-16/UTF-32 byte order mark selects the charset used for
 * parsing; without one, UTF-8 is assumed.
 *
 * @param detectVersion the XLIFF file to inspect
 * @return {@link XLIFFVersion#XLIFF20} when the version attribute equals
 *         {@code "2.0"}, otherwise {@link XLIFFVersion#XLIFF12}
 * @throws IOException           if the file cannot be opened or read
 * @throws XMLStreamException    if the content is not well-formed XML
 * @throws IllegalStateException if no {@code xliff} element with a version
 *                               attribute is found
 */
@Override
public XLIFFVersion detectXLIFFVersion(File detectVersion) throws IOException, XMLStreamException {
    try (BOMInputStream bomInputStream = new BOMInputStream(new FileInputStream(detectVersion),
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE)) {
        String charsetName = "UTF-8";
        if (bomInputStream.hasBOM()) {
            charsetName = bomInputStream.getBOMCharsetName();
        }

        XMLInputFactory xml = XMLInputFactory.newInstance();
        // Version sniffing never needs external entities; refusing them closes the XXE vector.
        xml.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        XMLEventReader reader = xml.createXMLEventReader(bomInputStream, charsetName);
        try {
            while (reader.hasNext()) {
                XMLEvent event = reader.nextEvent();
                if (event.getEventType() != XMLEvent.START_ELEMENT) {
                    continue;
                }
                StartElement startElement = (StartElement) event;
                if (!startElement.getName().getLocalPart().equals("xliff")) {
                    continue;
                }
                @SuppressWarnings("unchecked")
                Iterator<Attribute> attrs = startElement.getAttributes();
                while (attrs.hasNext()) {
                    Attribute attr = attrs.next();
                    if (isXliffVersionAttributeName(attr.getName())) {
                        return "2.0".equals(attr.getValue()) ? XLIFFVersion.XLIFF20 : XLIFFVersion.XLIFF12;
                    }
                }
            }
            throw new IllegalStateException("Could not detect XLIFF version");
        } finally {
            // Release parser resources on every exit path, not only on a successful match.
            reader.close();
        }
    }
}
From source file:net.sf.jmimemagic.detectors.TextFileDetector.java
/**
 * Decides whether the given bytes look like plain text.
 *
 * <p>The data is reported as {@code text/plain} when it starts with a UTF-8 or
 * UTF-16 byte order mark, or when its UTF-8 decoding contains no match for the
 * pattern {@code [^[:ascii:][:space:]]}.
 *
 * @param data       the bytes to inspect
 * @param offset     not read by this method
 * @param length     not read by this method
 * @param bitmask    not read by this method
 * @param comparator not read by this method
 * @param mimeType   not read by this method
 * @param params     not read by this method
 *
 * @return a single-element array holding {@code "text/plain"}, or {@code null}
 *         when neither check succeeds
 */
public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType,
        Map params) {
    log.debug("processing stream data");

    Perl5Util matcher = new Perl5Util();

    boolean startsWithBom = false;
    try {
        BOMInputStream bomIn = new BOMInputStream(new ByteArrayInputStream(data),
                ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
        startsWithBom = bomIn.hasBOM();
    } catch (IOException e) {
        log.error("TextFileDetector: error detecting byte order mark");
    }
    if (startsWithBom) {
        return new String[] { "text/plain" };
    }

    try {
        String text = new String(data, "UTF-8");
        boolean hasDisallowedChar = matcher.match("/[^[:ascii:][:space:]]/", text);
        if (!hasDisallowedChar) {
            return new String[] { "text/plain" };
        }
    } catch (UnsupportedEncodingException e) {
        log.error("TextFileDetector: failed to process data");
    }

    return null;
}
From source file:com.github.anba.test262.environment.RhinoEnv.java
/**
 * Returns a new {@link Reader} for the {@code stream} parameter.
 *
 * <p>The stream is checked for a UTF-8 or UTF-16 byte order mark; the charset
 * named by the mark is used when present, otherwise {@code defaultCharset}.
 *
 * @param stream         the bytes to decode
 * @param defaultCharset charset name used when no byte order mark is found
 * @return a reader over the BOM-aware stream
 * @throws IOException if probing for the byte order mark fails or the charset
 *                     name is not supported
 */
private static Reader newReader(InputStream stream, String defaultCharset) throws IOException {
    final BOMInputStream bomAware = new BOMInputStream(stream,
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
    final String detectedCharset = bomAware.getBOMCharsetName();
    return new InputStreamReader(bomAware, defaultIfNull(detectedCharset, defaultCharset));
}
From source file:com.github.anba.test262.util.Test262Info.java
private static Reader newReader(InputStream stream, String defaultCharset) throws IOException { BOMInputStream bomstream = new BOMInputStream(stream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);//from ww w. j ava2 s.co m String charset = defaultIfNull(bomstream.getBOMCharsetName(), defaultCharset); return new InputStreamReader(bomstream, charset); }
From source file:com.github.anba.es6draft.chakra.ChakraTest.java
private static Charset charsetFor(BOMInputStream bis, Charset defaultCharset) throws IOException { ByteOrderMark bom = bis.getBOM();//from w ww . j a v a2 s . c o m if (ByteOrderMark.UTF_8.equals(bom)) { return StandardCharsets.UTF_8; } if (ByteOrderMark.UTF_16LE.equals(bom)) { return StandardCharsets.UTF_16LE; } if (ByteOrderMark.UTF_16BE.equals(bom)) { return StandardCharsets.UTF_16BE; } return defaultCharset; }
From source file:net.sourceforge.users.dragomerlin.vcs2icsCalendarConverter.ConvertSingleFile.java
private static BufferedReader detectEncodingAndOpenFile(File inFile) throws IOException { String encodingType = null;//from w w w . ja v a 2 s . co m BufferedReader input = null; BOMInputStream bomIn = null; // Detect file encoding encodingType = TestDetector.main(inFile.getAbsolutePath().toString()); // Entire file reading. FileReader always assumes default encoding is // OK! // We must check for BOM in UTF files and remove them with // org.apache.commons.io.input.BOMInputStream because // java doesn't do that automatically. See Oracle bug 4508058. if (encodingType == null) { // ASCII expected input = new BufferedReader(new InputStreamReader(new FileInputStream(inFile))); } else if (encodingType.startsWith("UTF-8")) { // UTF-8 requires an exclusive call to BOMInputStream bomIn = new BOMInputStream(new FileInputStream(inFile)); input = new BufferedReader(new InputStreamReader(bomIn, encodingType)); if (bomIn.hasBOM()) System.out.println("This file has UTF-8 BOM, removing it"); else System.out.println("This file has UTF-8 without BOM"); } else if (encodingType.startsWith("UTF-")) { // The other UTF cases except UTF-8 bomIn = new BOMInputStream(new FileInputStream(inFile), ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); input = new BufferedReader(new InputStreamReader(bomIn, encodingType)); System.out.println("This file has " + bomIn.getBOMCharsetName() + " BOM, removing it"); } else { // Any other encoding input = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), encodingType)); } return input; }
From source file:org.apache.any23.util.StreamUtils.java
public static Document inputStreamToDocument(InputStream is) throws MalformedByteSequenceException { DocumentBuilderFactory factory = null; DocumentBuilder builder = null; Document doc = null;/*from w w w .java 2s . c o m*/ try { factory = DocumentBuilderFactory.newInstance(); builder = factory.newDocumentBuilder(); } catch (ParserConfigurationException e) { logger.error("Error converting InputStream to Document: {}", e); } try { BOMInputStream bomIn = new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE); if (bomIn.hasBOM()) { @SuppressWarnings("unused") int firstNonBOMByte = bomIn.read(); // Skips BOM } doc = builder.parse(bomIn); } catch (SAXException | IOException e) { logger.error("Error converting InputStream to Document: {}", e); } return doc; }
From source file:org.apache.tika.parser.csv.TextAndCSVParserTest.java
/**
 * Parses CSV bytes prefixed with the UTF-16LE byte order mark and checks the
 * detected delimiter, the reported content type and the extracted text.
 */
@Test
public void testCSV_UTF16LE_BOM() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");

    byte[] bomPrefixedCsv = concat(ByteOrderMark.UTF_16LE.getBytes(), CSV_UTF_16LE);
    XMLResult xmlResult = getXML(new ByteArrayInputStream(bomPrefixedCsv), PARSER, metadata);

    String detectedDelimiter = xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY);
    assertEquals("comma", detectedDelimiter);

    String contentType = xmlResult.metadata.get(Metadata.CONTENT_TYPE);
    assertMediaTypeEquals("csv", "UTF-16LE", "comma", contentType);

    assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}