Example usage for org.apache.commons.io ByteOrderMark UTF_16BE

Introduction

On this page you can find example usages of org.apache.commons.io ByteOrderMark UTF_16BE.

Prototype

ByteOrderMark UTF_16BE

To view the source code for org.apache.commons.io ByteOrderMark UTF_16BE, click the Source Link.

Document

UTF-16BE BOM (Big Endian)

Usage

From source file:com.ibm.wala.cast.ipa.callgraph.CAstCallGraphUtil.java

/**
 * Builds a {@link SourceFileModule} for the script behind the given URL. The module's
 * input stream is wrapped so that a leading UTF-8, UTF-16 or UTF-32 byte order mark is
 * skipped instead of being handed to the reader.
 *
 * @param script     URL of the script; its file part is used as the local file name
 * @param scriptName name the file path is asserted to end with
 * @return a module whose {@code getInputStream()} excludes any detected byte order mark
 */
public static SourceFileModule makeSourceModule(URL script, String scriptName) {
    // Turn escaped backslashes into '/' and escaped spaces into ' ' before using the path.
    String hackedName = script.getFile().replaceAll("%5c", "/").replaceAll("%20", " ");

    File scriptFile = new File(hackedName);

    assert hackedName.endsWith(scriptName) : scriptName + " does not match file " + script.getFile();

    return new SourceFileModule(scriptFile, scriptName, null) {
        @Override
        public InputStream getInputStream() {
            // 'false' asks BOMInputStream to exclude the BOM bytes from the data it returns.
            BOMInputStream bs = new BOMInputStream(super.getInputStream(), false, ByteOrderMark.UTF_8,
                    ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE,
                    ByteOrderMark.UTF_32BE);
            try {
                if (bs.hasBOM()) {
                    System.err.println("removing BOM " + bs.getBOM());
                }
                return bs;
            } catch (IOException e) {
                // BOM probing failed: release the stream we already opened before
                // falling back to a fresh, unwrapped one, so the handle is not leaked.
                try {
                    bs.close();
                } catch (IOException ignored) {
                    // best effort; the fallback stream below is what the caller gets
                }
                return super.getInputStream();
            }
        }
    };
}

From source file:ca.nines.ise.dom.DOMStream.java

/**
 * Reads all lines from an input stream into this DOMStream, recording where the data
 * came from in any messages that are logged.
 *
 * <p>A byte order mark, if present, selects the decoding charset and is reported; without
 * one the input is decoded as UTF-8. Each line is NFKC-normalized, and curly double quotes
 * (U+201C, U+201D) are replaced by a plain {@code "}; only the first such line is reported.
 *
 * @param in     stream to read the document from
 * @param source label passed to {@code setSource} on every logged message
 * @throws java.io.IOException if the byte order mark or the lines cannot be read
 */
public DOMStream(InputStream in, String source) throws IOException {
    lines = new ArrayList<>();

    BOMInputStream bomAwareIn = new BOMInputStream(in, ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE,
            ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
    bom = bomAwareIn.getBOM();
    if (bom == null) {
        encoding = "UTF-8";
    } else {
        Message bomMessage = Message.builder("builder.bom").setSource(source)
                .addNote("The byte order mark was " + bom.getCharsetName()).build();
        Log.addMessage(bomMessage);
        encoding = bom.getCharsetName();
    }

    if (!encoding.equals("UTF-8")) {
        Message encodingMessage = Message.builder("builder.notutf8").setSource(source)
                .addNote("The incorrect encoding is " + encoding).build();
        Log.addMessage(encodingMessage);
    }

    BufferedReader reader = new BufferedReader(new InputStreamReader(bomAwareIn, encoding));
    Pattern smartQuotes = Pattern.compile("\u201C|\u201D");
    StringBuilder text = new StringBuilder();
    boolean smartQuotesReported = false;

    for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
        String line = Normalizer.normalize(raw, Form.NFKC);
        Matcher quoteMatcher = smartQuotes.matcher(line);
        if (quoteMatcher.find()) {
            line = quoteMatcher.replaceAll("\"");
            // Report smart quotes once per document, at the first line that contains them.
            if (!smartQuotesReported) {
                smartQuotesReported = true;
                Message quoteMessage = Message.builder("builder.smartquotes").setSource(source)
                        .addNote("The first occurence of smart quotes was at line " + lines.size()).build();
                Log.addMessage(quoteMessage);
            }
        }
        lines.add(line);
        text.append(line).append("\n");
    }

    content = text.toString().trim();
}

From source file:com.vistatec.ocelot.xliff.okapi.OkapiXLIFFFactory.java

/**
 * Determines the XLIFF version of a file by scanning it for the first {@code xliff}
 * start element that carries a version attribute.
 *
 * <p>A leading byte order mark selects the charset used for parsing; without one the
 * file is parsed as UTF-8.
 *
 * @param detectVersion the file to inspect
 * @return {@code XLIFF20} when the version attribute equals {@code "2.0"}, otherwise
 *         {@code XLIFF12}
 * @throws IOException           if the file cannot be opened or read
 * @throws XMLStreamException    if the content is not parseable XML
 * @throws IllegalStateException if no {@code xliff} element with a version attribute is found
 */
@Override
public XLIFFVersion detectXLIFFVersion(File detectVersion) throws IOException, XMLStreamException {
    try (BOMInputStream bomInputStream = new BOMInputStream(new FileInputStream(detectVersion),
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE)) {
        String charsetName = "UTF-8";
        if (bomInputStream.hasBOM()) {
            charsetName = bomInputStream.getBOMCharsetName();
        }

        XMLInputFactory xml = XMLInputFactory.newInstance();
        // The file is only being sniffed for a version attribute, so there is no reason
        // to let the parser resolve external entities it may declare.
        xml.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        XMLEventReader reader = xml.createXMLEventReader(bomInputStream, charsetName);
        try {
            while (reader.hasNext()) {
                XMLEvent event = reader.nextEvent();
                if (!event.isStartElement()) {
                    continue;
                }
                StartElement startElement = event.asStartElement();
                if (!startElement.getName().getLocalPart().equals("xliff")) {
                    continue;
                }
                @SuppressWarnings("unchecked")
                Iterator<Attribute> attrs = startElement.getAttributes();
                while (attrs.hasNext()) {
                    Attribute attr = attrs.next();
                    if (isXliffVersionAttributeName(attr.getName())) {
                        return "2.0".equals(attr.getValue()) ? XLIFFVersion.XLIFF20 : XLIFFVersion.XLIFF12;
                    }
                }
            }
            throw new IllegalStateException("Could not detect XLIFF version");
        } finally {
            // Release parser resources on every exit path, not only on success.
            reader.close();
        }
    }
}

From source file:net.sf.jmimemagic.detectors.TextFileDetector.java

/**
 * Reports the data as {@code text/plain} when it either starts with a UTF-8/UTF-16 byte
 * order mark or, decoded as UTF-8, contains no character matching
 * {@code [^[:ascii:][:space:]]}.
 *
 * @param data the bytes to inspect
 * @param offset not used by this method
 * @param length not used by this method
 * @param bitmask not used by this method
 * @param comparator not used by this method
 * @param mimeType not used by this method
 * @param params not used by this method
 *
 * @return a one-element array holding {@code "text/plain"}, or {@code null} when the
 *         data is not recognized as text
 */
public String[] process(byte[] data, int offset, int length, long bitmask, char comparator, String mimeType,
        Map params) {
    log.debug("processing stream data");

    Perl5Util util = new Perl5Util();

    // First check: a byte order mark at the start is taken as proof of text.
    boolean startsWithBom = false;
    try {
        startsWithBom = new BOMInputStream(new ByteArrayInputStream(data), ByteOrderMark.UTF_8,
                ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE).hasBOM();
    } catch (IOException e) {
        log.error("TextFileDetector: error detecting byte order mark");
    }
    if (startsWithBom) {
        return new String[] { "text/plain" };
    }

    // Second check: look for any character outside the ASCII / whitespace classes.
    try {
        String text = new String(data, "UTF-8");
        boolean hasNonTextChar = util.match("/[^[:ascii:][:space:]]/", text);

        if (!hasNonTextChar) {
            return new String[] { "text/plain" };
        }
    } catch (UnsupportedEncodingException e) {
        log.error("TextFileDetector: failed to process data");
    }

    return null;
}

From source file:com.github.anba.test262.environment.RhinoEnv.java

/**
 * Returns a new {@link Reader} for the {@code stream} parameter. The charset is taken
 * from a leading UTF-8 or UTF-16 byte order mark when one is present, otherwise
 * {@code defaultCharset} is used.
 */
private static Reader newReader(InputStream stream, String defaultCharset) throws IOException {
    BOMInputStream bomAware = new BOMInputStream(stream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE);
    String detected = bomAware.getBOMCharsetName();
    return new InputStreamReader(bomAware, defaultIfNull(detected, defaultCharset));
}

From source file:com.examples.with.different.packagename.coverage.BOMInputStreamTest.java

/**
 * Reads a stream that starts with a BOM through a default-configured
 * {@code BOMInputStream} and checks that only the payload bytes are returned, that the
 * BOM is reported as UTF-8, and that asking about UTF-16BE is rejected.
 */
public void testReadWithBOM() throws Exception {
    byte[] data = new byte[] { 'A', 'B', 'C' };
    BOMInputStream in = new BOMInputStream(createDataStream(data, true));
    assertEquals('A', in.read());
    assertEquals('B', in.read());
    assertEquals('C', in.read());
    assertEquals(-1, in.read());
    assertTrue("hasBOM()", in.hasBOM());
    assertTrue("hasBOM(UTF-8)", in.hasBOM(ByteOrderMark.UTF_8));
    assertEquals("getBOM", ByteOrderMark.UTF_8, in.getBOM());

    // Record whether the exception really occurred; without this the test would pass
    // even when hasBOM(UTF_16BE) silently returned a value.
    boolean rejected = false;
    try {
        in.hasBOM(ByteOrderMark.UTF_16BE);
    } catch (IllegalArgumentException e) {
        // expected - not configured for UTF-16BE
        rejected = true;
    }
    assertTrue("hasBOM(UTF-16BE) should throw IllegalArgumentException", rejected);
}

From source file:com.github.anba.test262.util.Test262Info.java

/**
 * Wraps {@code stream} in a reader whose charset comes from a leading UTF-8 or UTF-16
 * byte order mark, falling back to {@code defaultCharset} when none is detected.
 */
private static Reader newReader(InputStream stream, String defaultCharset) throws IOException {
    BOMInputStream bomAware = new BOMInputStream(stream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE);
    String detected = bomAware.getBOMCharsetName();
    return new InputStreamReader(bomAware, defaultIfNull(detected, defaultCharset));
}

From source file:com.github.anba.es6draft.chakra.ChakraTest.java

/**
 * Maps the byte order mark reported by {@code bis} to the matching standard charset.
 *
 * @param bis stream whose BOM is queried
 * @param defaultCharset charset returned when the BOM is not UTF-8, UTF-16LE or UTF-16BE
 * @return the charset for the detected BOM, or {@code defaultCharset}
 * @throws IOException if querying the BOM fails
 */
private static Charset charsetFor(BOMInputStream bis, Charset defaultCharset) throws IOException {
    ByteOrderMark detected = bis.getBOM();
    Charset result = defaultCharset;
    if (ByteOrderMark.UTF_8.equals(detected)) {
        result = StandardCharsets.UTF_8;
    } else if (ByteOrderMark.UTF_16LE.equals(detected)) {
        result = StandardCharsets.UTF_16LE;
    } else if (ByteOrderMark.UTF_16BE.equals(detected)) {
        result = StandardCharsets.UTF_16BE;
    }
    return result;
}

From source file:com.examples.with.different.packagename.coverage.BOMInputStreamTest.java

/**
 * Reads a stream that starts with a BOM through a {@code BOMInputStream} configured for
 * both UTF-16BE and UTF-8, and checks that the payload is returned unchanged and the BOM
 * is identified as UTF-8 rather than UTF-16BE.
 */
public void testReadWithMultipleBOM() throws Exception {
    byte[] data = new byte[] { 'A', 'B', 'C' };
    BOMInputStream in = new BOMInputStream(createDataStream(data, true), ByteOrderMark.UTF_16BE,
            ByteOrderMark.UTF_8);

    // The payload must come back byte for byte, followed by end-of-stream.
    for (char expected : new char[] { 'A', 'B', 'C' }) {
        assertEquals(expected, in.read());
    }
    assertEquals(-1, in.read());

    assertTrue("hasBOM()", in.hasBOM());
    assertTrue("hasBOM(UTF-8)", in.hasBOM(ByteOrderMark.UTF_8));
    assertFalse("hasBOM(UTF-16BE)", in.hasBOM(ByteOrderMark.UTF_16BE));
    assertEquals("getBOM", ByteOrderMark.UTF_8, in.getBOM());
}

From source file:net.sourceforge.users.dragomerlin.vcs2icsCalendarConverter.ConvertSingleFile.java

/**
 * Detects the encoding of {@code inFile} and opens it with a matching reader, stripping
 * a byte order mark from UTF input.
 *
 * <p>FileReader always assumes the default encoding is OK, and Java does not remove a
 * BOM automatically (see Oracle bug 4508058), so UTF input is routed through
 * {@code org.apache.commons.io.input.BOMInputStream}.
 *
 * @param inFile the file to open
 * @return a buffered reader over the file; the caller is responsible for closing it
 * @throws IOException if the file cannot be opened, the detected encoding is not
 *                     supported, or probing for the BOM fails
 */
private static BufferedReader detectEncodingAndOpenFile(File inFile) throws IOException {
    // Detect file encoding
    String encodingType = TestDetector.main(inFile.getAbsolutePath());

    FileInputStream fileIn = new FileInputStream(inFile);
    boolean handedOff = false;
    try {
        BufferedReader input;
        if (encodingType == null) {
            // ASCII expected
            input = new BufferedReader(new InputStreamReader(fileIn));
        } else if (encodingType.startsWith("UTF-8")) {
            // UTF-8 requires an exclusive call to BOMInputStream
            BOMInputStream bomIn = new BOMInputStream(fileIn);
            input = new BufferedReader(new InputStreamReader(bomIn, encodingType));
            if (bomIn.hasBOM()) {
                System.out.println("This file has UTF-8 BOM, removing it");
            } else {
                System.out.println("This file has UTF-8 without BOM");
            }
        } else if (encodingType.startsWith("UTF-")) {
            // The other UTF cases except UTF-8
            BOMInputStream bomIn = new BOMInputStream(fileIn, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
                    ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
            input = new BufferedReader(new InputStreamReader(bomIn, encodingType));
            // Only claim a BOM was removed when one was actually found; otherwise
            // getBOMCharsetName() is null and the message would read "null BOM".
            if (bomIn.hasBOM()) {
                System.out.println("This file has " + bomIn.getBOMCharsetName() + " BOM, removing it");
            } else {
                System.out.println("This file has " + encodingType + " without BOM");
            }
        } else {
            // Any other encoding
            input = new BufferedReader(new InputStreamReader(fileIn, encodingType));
        }
        handedOff = true;
        return input;
    } finally {
        // If building the reader or probing the BOM failed, nobody else owns the file
        // handle yet, so release it here instead of leaking it.
        if (!handedOff) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // the original failure is the one worth reporting
            }
        }
    }
}

Example usage for org.apache.commons.io ByteOrderMark UTF_16BE

Introduction

Prototype

Document

Usage