Example usage for org.apache.commons.io ByteOrderMark UTF_16LE

Introduction

This page collects example usages of the org.apache.commons.io ByteOrderMark.UTF_16LE constant, drawn from open-source projects.

Prototype

public static final ByteOrderMark UTF_16LE

Document

UTF-16LE BOM (Little Endian)
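
Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the listed projects; the class name and the "input.txt" path are placeholders) showing the typical pattern: pass ByteOrderMark.UTF_16LE, together with the other marks you want to recognize, to a BOMInputStream, which detects the marker, strips it from the stream, and reports the corresponding charset name.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream;

public class Utf16LeBomSketch {
    public static void main(String[] args) throws IOException {
        // "input.txt" is a placeholder path.
        try (BOMInputStream in = new BOMInputStream(new FileInputStream("input.txt"),
                ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE)) {
            // By default the detected BOM is excluded from the stream's content.
            String charset = in.hasBOM() ? in.getBOMCharsetName() : "UTF-8";
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset))) {
                System.out.println(reader.readLine());
            }
        }
    }
}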

Usage

From source file:com.ibm.wala.cast.ipa.callgraph.CAstCallGraphUtil.java

public static SourceFileModule makeSourceModule(URL script, String scriptName) {
    String hackedName = script.getFile().replaceAll("%5c", "/").replaceAll("%20", " ");

    File scriptFile = new File(hackedName);

    assert hackedName.endsWith(scriptName) : scriptName + " does not match file " + script.getFile();

    return new SourceFileModule(scriptFile, scriptName, null) {
        @Override
        public InputStream getInputStream() {
            BOMInputStream bs = new BOMInputStream(super.getInputStream(), false, ByteOrderMark.UTF_8,
                    ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE,
                    ByteOrderMark.UTF_32BE);
            try {
                if (bs.hasBOM()) {
                    System.err.println("removing BOM " + bs.getBOM());
                }
                return bs;
            } catch (IOException e) {
                return super.getInputStream();
            }
        }
    };
}
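
Note that the second constructor argument (false) tells BOMInputStream to exclude any detected BOM from the returned stream, so the caller never sees the marker bytes; the try/catch falls back to the raw stream if probing for the BOM fails.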

From source file:ca.nines.ise.dom.DOMStream.java

/**
 * Construct a DOMStream from an input stream and record the source of the
 * input data.
 *
 * @param in the stream to read the document from
 * @param source a description of where the input data came from
 * @throws java.io.IOException if the stream cannot be read
 */
public DOMStream(InputStream in, String source) throws IOException {
    lines = new ArrayList<>();
    boolean warnedSmartQuotes = false;

    BOMInputStream bomStream = new BOMInputStream(in, ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE,
            ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
    bom = bomStream.getBOM();
    if (bom != null) {
        Message m = Message.builder("builder.bom").setSource(source)
                .addNote("The byte order mark was " + bom.getCharsetName()).build();
        Log.addMessage(m);
        encoding = bom.getCharsetName();
    } else {
        encoding = "UTF-8";
    }

    if (!encoding.equals("UTF-8")) {
        Message m = Message.builder("builder.notutf8").setSource(source)
                .addNote("The incorrect encoding is " + encoding).build();
        Log.addMessage(m);
    }

    BufferedReader buffer = new BufferedReader(new InputStreamReader(bomStream, encoding));
    String line;
    StringBuilder sb = new StringBuilder();

    Pattern p = Pattern.compile("\u201C|\u201D");

    while ((line = buffer.readLine()) != null) {
        line = Normalizer.normalize(line, Form.NFKC);
        Matcher m = p.matcher(line);
        if (m.find()) {
            line = m.replaceAll("\"");
            if (!warnedSmartQuotes) {
                warnedSmartQuotes = true;
                Message msg = Message.builder("builder.smartquotes").setSource(source)
                        .addNote("The first occurence of smart quotes was at line " + lines.size()).build();
                Log.addMessage(msg);
            }
        }
        lines.add(line);
        sb.append(line).append("\n");
    }

    content = sb.toString().trim();
}

From source file:com.vistatec.ocelot.xliff.okapi.OkapiXLIFFFactory.java

@Override
public XLIFFVersion detectXLIFFVersion(File detectVersion) throws IOException, XMLStreamException {
    try (BOMInputStream bomInputStream = new BOMInputStream(new FileInputStream(detectVersion),
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE)) {
        String bom = "UTF-8";
        if (bomInputStream.hasBOM()) {
            bom = bomInputStream.getBOMCharsetName();
        }

        XMLInputFactory xml = XMLInputFactory.newInstance();
        XMLEventReader reader = xml.createXMLEventReader(bomInputStream, bom);
        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            switch (event.getEventType()) {
            case XMLEvent.START_ELEMENT:
                StartElement startElement = (StartElement) event;
                String localPart = startElement.getName().getLocalPart();
                if (localPart.equals("xliff")) {
                    @SuppressWarnings("unchecked")
                    Iterator<Attribute> attrs = startElement.getAttributes();
                    while (attrs.hasNext()) {
                        Attribute attr = attrs.next();
                        if (isXliffVersionAttributeName(attr.getName())) {
                            String value = attr.getValue();
                            reader.close();
                            if ("2.0".equals(value)) {
                                return XLIFFVersion.XLIFF20;
                            } else {
                                return XLIFFVersion.XLIFF12;
                            }
                        }
                    }
                }
                break;

            default:
                break;
            }
        }
        throw new IllegalStateException("Could not detect XLIFF version");
    }
}
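
Because the detected BOM charset name is handed to XMLInputFactory.createXMLEventReader, the StAX reader decodes the document with the encoding implied by the BOM (defaulting to UTF-8 when no BOM is present), while BOMInputStream has already consumed the marker bytes themselves.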

From source file:net.sf.jmimemagic.detectors.TextFileDetector.java

/**
 * Decides whether the given data looks like plain text: any stream that starts
 * with a UTF-8 or UTF-16 byte order mark is reported as text/plain; otherwise
 * the data must decode as UTF-8 and contain only ASCII and whitespace characters.
 *
 * @param data the bytes to examine
 * @param offset offset of the region to examine (unused here)
 * @param length length of the region to examine (unused here)
 * @param bitmask bitmask supplied by the magic matcher (unused here)
 * @param comparator comparator supplied by the magic matcher (unused here)
 * @param mimeType MIME type hint supplied by the magic matcher (unused here)
 * @param params extra detector parameters (unused here)
 *
 * @return {"text/plain"} if the data appears to be text, otherwise null
 */
public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType,
        Map params) {
    log.debug("processing stream data");

    Perl5Util util = new Perl5Util();

    try {
        BOMInputStream bomIn = new BOMInputStream(new ByteArrayInputStream(data), ByteOrderMark.UTF_8,
                ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
        if (bomIn.hasBOM()) {
            return new String[] { "text/plain" };
        }
    } catch (IOException e) {
        log.error("TextFileDetector: error detecting byte order mark");
    }

    try {
        String s = new String(data, "UTF-8");

        if (!util.match("/[^[:ascii:][:space:]]/", s)) {
            return new String[] { "text/plain" };
        }
    } catch (UnsupportedEncodingException e) {
        log.error("TextFileDetector: failed to process data");
    }

    return null;
}

From source file:com.github.anba.test262.environment.RhinoEnv.java

/**
 * Returns a new {@link Reader} for the {@code stream} parameter
 */
private static Reader newReader(InputStream stream, String defaultCharset) throws IOException {
    BOMInputStream bomstream = new BOMInputStream(stream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE);
    String charset = defaultIfNull(bomstream.getBOMCharsetName(), defaultCharset);
    return new InputStreamReader(bomstream, charset);
}

From source file:com.github.anba.test262.util.Test262Info.java

private static Reader newReader(InputStream stream, String defaultCharset) throws IOException {
    BOMInputStream bomstream = new BOMInputStream(stream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE);
    String charset = defaultIfNull(bomstream.getBOMCharsetName(), defaultCharset);
    return new InputStreamReader(bomstream, charset);
}

From source file:com.github.anba.es6draft.chakra.ChakraTest.java

private static Charset charsetFor(BOMInputStream bis, Charset defaultCharset) throws IOException {
    ByteOrderMark bom = bis.getBOM();
    if (ByteOrderMark.UTF_8.equals(bom)) {
        return StandardCharsets.UTF_8;
    }
    if (ByteOrderMark.UTF_16LE.equals(bom)) {
        return StandardCharsets.UTF_16LE;
    }
    if (ByteOrderMark.UTF_16BE.equals(bom)) {
        return StandardCharsets.UTF_16BE;
    }
    return defaultCharset;
}
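
As a usage sketch (not part of the ChakraTest source; the method name is hypothetical and the usual java.nio.file and commons-io imports are assumed), a helper like charsetFor would typically be combined with a BOMInputStream configured with the same three marks, handing the detected charset to an InputStreamReader:

private static Reader newScriptReader(Path file, Charset defaultCharset) throws IOException {
    // Hypothetical caller: list the same marks that charsetFor(...) understands.
    BOMInputStream bis = new BOMInputStream(Files.newInputStream(file), ByteOrderMark.UTF_8,
            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
    return new InputStreamReader(bis, charsetFor(bis, defaultCharset));
}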

From source file:net.sourceforge.users.dragomerlin.vcs2icsCalendarConverter.ConvertSingleFile.java

private static BufferedReader detectEncodingAndOpenFile(File inFile) throws IOException {
    String encodingType = null;
    BufferedReader input = null;
    BOMInputStream bomIn = null;

    // Detect file encoding
    encodingType = TestDetector.main(inFile.getAbsolutePath().toString());

    // Read the entire file. FileReader always assumes the platform default
    // encoding, so it cannot be used here. For UTF files we must check for a
    // BOM and remove it with org.apache.commons.io.input.BOMInputStream,
    // because Java does not do that automatically (see Oracle bug 4508058).
    if (encodingType == null) {
        // ASCII expected
        input = new BufferedReader(new InputStreamReader(new FileInputStream(inFile)));
    } else if (encodingType.startsWith("UTF-8")) {
        // UTF-8 requires an exclusive call to BOMInputStream
        bomIn = new BOMInputStream(new FileInputStream(inFile));
        input = new BufferedReader(new InputStreamReader(bomIn, encodingType));
        if (bomIn.hasBOM())
            System.out.println("This file has UTF-8 BOM, removing it");
        else
            System.out.println("This file has UTF-8 without BOM");
    } else if (encodingType.startsWith("UTF-")) {
        // The other UTF cases except UTF-8
        bomIn = new BOMInputStream(new FileInputStream(inFile), ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
                ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
        input = new BufferedReader(new InputStreamReader(bomIn, encodingType));
        System.out.println("This file has " + bomIn.getBOMCharsetName() + " BOM, removing it");
    } else {
        // Any other encoding
        input = new BufferedReader(new InputStreamReader(new FileInputStream(inFile), encodingType));
    }
    return input;
}
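
When only a UTF-8 BOM needs to be handled, the single-argument BOMInputStream constructor is enough: it defaults to ByteOrderMark.UTF_8, which is why the UTF-8 branch above passes no marks while the other UTF branch lists the UTF-16 and UTF-32 variants explicitly.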

From source file:org.apache.any23.util.StreamUtils.java

public static Document inputStreamToDocument(InputStream is) throws MalformedByteSequenceException {
    DocumentBuilderFactory factory = null;
    DocumentBuilder builder = null;
    Document doc = null;

    try {
        factory = DocumentBuilderFactory.newInstance();
        builder = factory.newDocumentBuilder();
    } catch (ParserConfigurationException e) {
        logger.error("Error converting InputStream to Document: {}", e);
    }

    try {
        BOMInputStream bomIn = new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE,
                ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE);
        // BOMInputStream (constructed with the default include=false) already
        // excludes any detected BOM from the stream, so no bytes need to be
        // skipped manually before parsing.
        doc = builder.parse(bomIn);
    } catch (SAXException | IOException e) {
        logger.error("Error converting InputStream to Document: {}", e);
    }
    return doc;
}

From source file:org.apache.tika.parser.csv.TextAndCSVParserTest.java

@Test
public void testCSV_UTF16LE_BOM() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
    XMLResult xmlResult = getXML(
            new ByteArrayInputStream(concat(ByteOrderMark.UTF_16LE.getBytes(), CSV_UTF_16LE)), PARSER,
            metadata);
    assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
    assertMediaTypeEquals("csv", "UTF-16LE", "comma", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
    assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}