List of usage examples for org.apache.commons.io ByteOrderMark UTF_16BE
ByteOrderMark UTF_16BE
To view the source code for org.apache.commons.io ByteOrderMark UTF_16BE, click the Source Link.
From source file:com.ibm.wala.cast.ipa.callgraph.CAstCallGraphUtil.java
public static SourceFileModule makeSourceModule(URL script, String scriptName) { String hackedName = script.getFile().replaceAll("%5c", "/").replaceAll("%20", " "); File scriptFile = new File(hackedName); assert hackedName.endsWith(scriptName) : scriptName + " does not match file " + script.getFile(); return new SourceFileModule(scriptFile, scriptName, null) { @Override//from ww w. j a v a 2s . c om public InputStream getInputStream() { BOMInputStream bs = new BOMInputStream(super.getInputStream(), false, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); try { if (bs.hasBOM()) { System.err.println("removing BOM " + bs.getBOM()); } return bs; } catch (IOException e) { return super.getInputStream(); } } }; }
From source file:ca.nines.ise.dom.DOMStream.java
/**
 * Construct a DOMStream from an input stream and record the source of the
 * input data.
 *
 * <p>The stream is wrapped in a {@link BOMInputStream}. When a byte order mark is
 * found, a {@code builder.bom} message is logged and the BOM's charset is used for
 * decoding; otherwise UTF-8 is used. Any charset other than UTF-8 additionally
 * produces a {@code builder.notutf8} message. Every line is NFKC-normalized and
 * curly double quotes (U+201C, U+201D) are replaced by a straight double quote;
 * the first such replacement is reported once via {@code builder.smartquotes}.
 *
 * @param in stream to read the document from
 * @param source description of where the data came from, attached to logged messages
 * @throws java.io.IOException if the BOM cannot be read or reading a line fails
 */
public DOMStream(InputStream in, String source) throws IOException {
    lines = new ArrayList<>();

    BOMInputStream bomStream = new BOMInputStream(in, ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE,
            ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
    bom = bomStream.getBOM();

    if (bom == null) {
        encoding = "UTF-8";
    } else {
        Message bomNotice = Message.builder("builder.bom").setSource(source)
                .addNote("The byte order mark was " + bom.getCharsetName()).build();
        Log.addMessage(bomNotice);
        encoding = bom.getCharsetName();
    }

    if (!"UTF-8".equals(encoding)) {
        Message encodingNotice = Message.builder("builder.notutf8").setSource(source)
                .addNote("The incorrect encoding is " + encoding).build();
        Log.addMessage(encodingNotice);
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(bomStream, encoding));
    Pattern smartQuotes = Pattern.compile("\u201C|\u201D");
    StringBuilder joined = new StringBuilder();
    boolean smartQuotesReported = false;

    for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
        String text = Normalizer.normalize(raw, Form.NFKC);
        Matcher quoteMatcher = smartQuotes.matcher(text);
        if (quoteMatcher.find()) {
            text = quoteMatcher.replaceAll("\"");
            if (!smartQuotesReported) {
                smartQuotesReported = true;
                // NOTE(review): lines.size() is the number of lines stored so far, i.e. a
                // zero-based index of the current line - confirm that is the intended number.
                Message quoteNotice = Message.builder("builder.smartquotes").setSource(source)
                        .addNote("The first occurence of smart quotes was at line " + lines.size()).build();
                Log.addMessage(quoteNotice);
            }
        }
        lines.add(text);
        joined.append(text);
        joined.append("\n");
    }

    content = joined.toString().trim();
}
From source file:com.vistatec.ocelot.xliff.okapi.OkapiXLIFFFactory.java
@Override public XLIFFVersion detectXLIFFVersion(File detectVersion) throws IOException, XMLStreamException { try (BOMInputStream bomInputStream = new BOMInputStream(new FileInputStream(detectVersion), ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE)) {//from w w w . j av a 2 s. c om String bom = "UTF-8"; if (bomInputStream.hasBOM()) { bom = bomInputStream.getBOMCharsetName(); } XMLInputFactory xml = XMLInputFactory.newInstance(); XMLEventReader reader = xml.createXMLEventReader(bomInputStream, bom); while (reader.hasNext()) { XMLEvent event = reader.nextEvent(); switch (event.getEventType()) { case XMLEvent.START_ELEMENT: StartElement startElement = (StartElement) event; String localPart = startElement.getName().getLocalPart(); if (localPart.equals("xliff")) { @SuppressWarnings("unchecked") Iterator<Attribute> attrs = startElement.getAttributes(); while (attrs.hasNext()) { Attribute attr = attrs.next(); if (isXliffVersionAttributeName(attr.getName())) { String value = attr.getValue(); reader.close(); if ("2.0".equals(value)) { return XLIFFVersion.XLIFF20; } else { return XLIFFVersion.XLIFF12; } } } } break; default: break; } } throw new IllegalStateException("Could not detect XLIFF version"); } }
From source file:net.sf.jmimemagic.detectors.TextFileDetector.java
/**
 * Decides whether the given bytes look like plain text.
 *
 * <p>The data is reported as {@code text/plain} when it starts with a UTF-8, UTF-16LE or
 * UTF-16BE byte order mark, or when, decoded as UTF-8, it contains no character matching
 * {@code [^[:ascii:][:space:]]}.
 *
 * @param data bytes to examine; the whole array is inspected
 * @param offset not consulted by this implementation
 * @param length not consulted by this implementation
 * @param bitmask not consulted by this implementation
 * @param comparator not consulted by this implementation
 * @param mimeType not consulted by this implementation
 * @param params not consulted by this implementation
 *
 * @return a single-element array containing {@code "text/plain"}, or {@code null} when the
 *     data is not recognised as text
 */
public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType,
        Map params) {
    log.debug("processing stream data");

    try {
        BOMInputStream bomIn = new BOMInputStream(new ByteArrayInputStream(data), ByteOrderMark.UTF_8,
                ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
        // A recognised byte order mark is treated as sufficient evidence of text content.
        if (bomIn.hasBOM()) {
            return new String[] { "text/plain" };
        }
    } catch (IOException e) {
        // Keep going with the content check below, but record why BOM detection failed.
        log.error("TextFileDetector: error detecting byte order mark", e);
    }

    try {
        String s = new String(data, "UTF-8");
        Perl5Util util = new Perl5Util();
        if (!util.match("/[^[:ascii:][:space:]]/", s)) {
            return new String[] { "text/plain" };
        }
    } catch (UnsupportedEncodingException e) {
        log.error("TextFileDetector: failed to process data", e);
    }

    return null;
}
From source file:com.github.anba.test262.environment.RhinoEnv.java
/**
 * Returns a new {@link Reader} for the {@code stream} parameter. The stream is wrapped in
 * a {@link BOMInputStream} configured for UTF-8, UTF-16LE and UTF-16BE; the charset name
 * reported for the byte order mark is used for decoding, with {@code defaultCharset} as
 * the fallback when that name is {@code null}.
 */
private static Reader newReader(InputStream stream, String defaultCharset) throws IOException {
    BOMInputStream bomAware = new BOMInputStream(stream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE);
    String detectedName = bomAware.getBOMCharsetName();
    String charsetName = defaultIfNull(detectedName, defaultCharset);
    return new InputStreamReader(bomAware, charsetName);
}
From source file:com.examples.with.different.packagename.coverage.BOMInputStreamTest.java
public void testReadWithBOM() throws Exception { byte[] data = new byte[] { 'A', 'B', 'C' }; BOMInputStream in = new BOMInputStream(createDataStream(data, true)); assertEquals('A', in.read()); assertEquals('B', in.read()); assertEquals('C', in.read()); assertEquals(-1, in.read());// www . j av a2s .co m assertTrue("hasBOM()", in.hasBOM()); assertTrue("hasBOM(UTF-8)", in.hasBOM(ByteOrderMark.UTF_8)); assertEquals("getBOM", ByteOrderMark.UTF_8, in.getBOM()); try { in.hasBOM(ByteOrderMark.UTF_16BE); } catch (IllegalArgumentException e) { // expected - not configured for UTF-16BE } }
From source file:com.github.anba.test262.util.Test262Info.java
/**
 * Wraps {@code stream} in a {@link BOMInputStream} configured for UTF-8, UTF-16LE and
 * UTF-16BE and returns a reader that decodes it with the charset name reported for the
 * byte order mark, or with {@code defaultCharset} when that name is {@code null}.
 */
private static Reader newReader(InputStream stream, String defaultCharset) throws IOException {
    BOMInputStream wrapped = new BOMInputStream(
            stream,
            ByteOrderMark.UTF_8,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE);
    String bomCharsetName = wrapped.getBOMCharsetName();
    return new InputStreamReader(wrapped, defaultIfNull(bomCharsetName, defaultCharset));
}
From source file:com.github.anba.es6draft.chakra.ChakraTest.java
/**
 * Maps the byte order mark reported by {@code bis} to a charset.
 *
 * @param bis stream whose BOM is inspected
 * @param defaultCharset charset returned when the BOM is not UTF-8, UTF-16LE or UTF-16BE
 *     (including when there is none)
 * @return the charset matching the BOM, or {@code defaultCharset}
 * @throws IOException if reading the BOM fails
 */
private static Charset charsetFor(BOMInputStream bis, Charset defaultCharset) throws IOException {
    ByteOrderMark detected = bis.getBOM();

    Charset result = defaultCharset;
    if (ByteOrderMark.UTF_8.equals(detected)) {
        result = StandardCharsets.UTF_8;
    } else if (ByteOrderMark.UTF_16LE.equals(detected)) {
        result = StandardCharsets.UTF_16LE;
    } else if (ByteOrderMark.UTF_16BE.equals(detected)) {
        result = StandardCharsets.UTF_16BE;
    }
    return result;
}
From source file:com.examples.with.different.packagename.coverage.BOMInputStreamTest.java
public void testReadWithMultipleBOM() throws Exception { byte[] data = new byte[] { 'A', 'B', 'C' }; BOMInputStream in = new BOMInputStream(createDataStream(data, true), ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_8);//from w ww .ja va2 s . c o m assertEquals('A', in.read()); assertEquals('B', in.read()); assertEquals('C', in.read()); assertEquals(-1, in.read()); assertTrue("hasBOM()", in.hasBOM()); assertTrue("hasBOM(UTF-8)", in.hasBOM(ByteOrderMark.UTF_8)); assertFalse("hasBOM(UTF-16BE)", in.hasBOM(ByteOrderMark.UTF_16BE)); assertEquals("getBOM", ByteOrderMark.UTF_8, in.getBOM()); }
From source file:net.sourceforge.users.dragomerlin.vcs2icsCalendarConverter.ConvertSingleFile.java
private static BufferedReader detectEncodingAndOpenFile(File inFile) throws IOException { String encodingType = null;//from w w w.jav a2s. c om BufferedReader input = null; BOMInputStream bomIn = null; // Detect file encoding encodingType = TestDetector.main(inFile.getAbsolutePath().toString()); // Entire file reading. FileReader always assumes default encoding is // OK! // We must check for BOM in UTF files and remove them with // org.apache.commons.io.input.BOMInputStream because // java doesn't do that automatically. See Oracle bug 4508058. if (encodingType == null) { // ASCII expected input = new BufferedReader(new InputStreamReader(new FileInputStream(inFile))); } else if (encodingType.startsWith("UTF-8")) { // UTF-8 requires an exclusive call to BOMInputStream bomIn = new BOMInputStream(new FileInputStream(inFile)); input = new BufferedReader(new InputStreamReader(bomIn, encodingType)); if (bomIn.hasBOM()) System.out.println("This file has UTF-8 BOM, removing it"); else System.out.println("This file has UTF-8 without BOM"); } else if (encodingType.startsWith("UTF-")) { // The other UTF cases except UTF-8 bomIn = new BOMInputStream(new FileInputStream(inFile), ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); input = new BufferedReader(new InputStreamReader(bomIn, encodingType)); System.out.println("This file has " + bomIn.getBOMCharsetName() + " BOM, removing it"); } else { // Any other encoding input = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), encodingType)); } return input; }