// Java tutorial
/* * Copyright (C) 2017 - present by OpenGamma Inc. and the OpenGamma group of companies * * Please see distribution for license. */ package com.opengamma.strata.collect.io; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PushbackInputStream; import java.io.Reader; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import com.google.common.io.ByteSource; import com.google.common.io.ByteStreams; import com.google.common.io.CharSource; /** * Utilities that allow code to use the Unicode Byte Order Mark. * <p> * A Unicode file may contain a Byte Order Mark (BOM) that specifies which * encoding is used. Sadly, neither the JDK nor Guava handle this properly. * <p> * This class supports the BOM for UTF-8, UTF-16LE and UTF-16BE. * The UTF-32 formats are rarely seen and cannot be easily determined as * the UTF-32 BOMs are similar to the UTF-16 BOMs. */ public final class UnicodeBom { private static final byte X_FE = (byte) 0xFE; private static final byte X_EF = (byte) 0xEF; private static final byte X_FF = (byte) 0xFF; private static final byte X_BF = (byte) 0xBF; private static final byte X_BB = (byte) 0xBB; /** * Restricted constructor. */ private UnicodeBom() { } //------------------------------------------------------------------------- /** * Converts a {@code byte[]} to a {@code String}. * <p> * This ensures that any Unicode byte order marker is used correctly. * The default encoding is UTF-8 if no BOM is found. 
* * @param input the input byte array * @return the equivalent string */ public static String toString(byte[] input) { if (input.length >= 3 && input[0] == X_EF && input[1] == X_BB && input[2] == X_BF) { return new String(input, 3, input.length - 3, StandardCharsets.UTF_8); } else if (input.length >= 2 && input[0] == X_FE && input[1] == X_FF) { return new String(input, 2, input.length - 2, StandardCharsets.UTF_16BE); } else if (input.length >= 2 && input[0] == X_FF && input[1] == X_FE) { return new String(input, 2, input.length - 2, StandardCharsets.UTF_16LE); } else { return new String(input, StandardCharsets.UTF_8); } } /** * Converts a {@code ByteSource} to a {@code CharSource}. * <p> * This ensures that any Unicode byte order marker is used correctly. * The default encoding is UTF-8 if no BOM is found. * * @param byteSource the byte source * @return the char source, that uses the BOM to determine the encoding */ public static CharSource toCharSource(ByteSource byteSource) { return new CharSource() { @Override public ByteSource asByteSource(Charset charset) { return byteSource; } @Override public Reader openStream() throws IOException { return toReader(byteSource.openStream()); } @Override public String toString() { return "UnicodeBom.toCharSource(" + byteSource.toString() + ")"; } }; } /** * Converts an {@code InputStream} to a {@code Reader}. * <p> * This ensures that any Unicode byte order marker is used correctly. * The default encoding is UTF-8 if no BOM is found. * * @param inputStream the input stream to wrap * @return the reader, that uses the BOM to determine the encoding * @throws IOException if an IO error occurs */ public static Reader toReader(InputStream inputStream) throws IOException { return new BomReader(inputStream); } //------------------------------------------------------------------------- /** * Reader that manages the BOM. 
*/ private static final class BomReader extends Reader { private static final int MAX_BOM_SIZE = 4; private final InputStreamReader underlying; BomReader(InputStream inputStream) throws IOException { super(inputStream); Charset encoding; byte[] bom = new byte[MAX_BOM_SIZE]; // read first 3 bytes such that they can be pushed back later PushbackInputStream pushbackStream = new PushbackInputStream(inputStream, MAX_BOM_SIZE); int bytesRead = ByteStreams.read(pushbackStream, bom, 0, 3); // look for BOM and adapt, defauling to UTF-8 if (bytesRead >= 3 && bom[0] == X_EF && bom[1] == X_BB && bom[2] == X_BF) { encoding = StandardCharsets.UTF_8; pushbackStream.unread(bom, 3, (bytesRead - 3)); } else if (bytesRead >= 2 && bom[0] == X_FE && bom[1] == X_FF) { encoding = StandardCharsets.UTF_16BE; pushbackStream.unread(bom, 2, (bytesRead - 2)); } else if (bytesRead >= 2 && bom[0] == X_FF && bom[1] == X_FE) { encoding = StandardCharsets.UTF_16LE; pushbackStream.unread(bom, 2, (bytesRead - 2)); } else { encoding = StandardCharsets.UTF_8; pushbackStream.unread(bom, 0, bytesRead); } // use Java standard code now we know the encoding this.underlying = new InputStreamReader(pushbackStream, encoding); } @Override public int read(CharBuffer target) throws IOException { return underlying.read(target); } @Override public int read() throws IOException { return underlying.read(); } @Override public int read(char[] cbuf) throws IOException { return underlying.read(cbuf); } @Override public int read(char[] cbuf, int off, int len) throws IOException { return underlying.read(cbuf, off, len); } @Override public long skip(long n) throws IOException { return underlying.skip(n); } @Override public boolean ready() throws IOException { return underlying.ready(); } @Override public boolean markSupported() { return underlying.markSupported(); } @Override public void mark(int readAheadLimit) throws IOException { underlying.mark(readAheadLimit); } @Override public void reset() throws IOException { 
underlying.reset(); } @Override public void close() throws IOException { underlying.close(); } } }