Java tutorial
/* * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ package org.antlr.v4.runtime; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; /** This class represents the primary interface for creating {@link CharStream}s * from a variety of sources as of 4.7. The motivation was to support * Unicode code points > U+FFFF. {@link ANTLRInputStream} and * {@link ANTLRFileStream} are now deprecated in favor of the streams created * by this interface. * * DEPRECATED: {@code new ANTLRFileStream("myinputfile")} * NEW: {@code CharStreams.fromFileName("myinputfile")} * * WARNING: If you use both the deprecated and the new streams, you will see * a nontrivial performance degradation. This speed hit is because the * {@link Lexer}'s internal code goes from a monomorphic to megamorphic * dynamic dispatch to get characters from the input stream. Java's * on-the-fly compiler (JIT) is unable to perform the same optimizations * so stick with either the old or the new streams, if performance is * a primary concern. See the extreme debugging and spelunking * needed to identify this issue in our timing rig: * * https://github.com/antlr/antlr4/pull/1781 * * The ANTLR character streams still buffer all the input when you create * the stream, as they have done for ~20 years. If you need unbuffered * access, please note that it becomes challenging to create * parse trees. The parse tree has to point to tokens which will either * point into a stale location in an unbuffered stream or you have to copy * the characters out of the buffer into the token. That defeats the purpose * of unbuffered input. Per the ANTLR book, unbuffered streams are primarily * useful for processing infinite streams *during the parse.* * * The new streams also use 8-bit buffers when possible so this new * interface supports character streams that use half as much memory * as the old {@link ANTLRFileStream}, which assumed 16-bit characters. * * A big shout out to Ben Hamilton (github bhamiltoncx) for his superhuman * efforts across all targets to get true Unicode 3.1 support for U+10FFFF. * * @since 4.7 */ public final class CharStreams { private static final int DEFAULT_BUFFER_SIZE = 4096; // Utility class; do not construct. private CharStreams() { } /** * Creates a {@link CharStream} given a path to a UTF-8 * encoded file on disk. * * Reads the entire contents of the file into the result before returning. */ public static CharStream fromPath(Path path) throws IOException { return fromPath(path, StandardCharsets.UTF_8); } /** * Creates a {@link CharStream} given a path to a file on disk and the * charset of the bytes contained in the file. * * Reads the entire contents of the file into the result before returning. */ public static CharStream fromPath(Path path, Charset charset) throws IOException { long size = Files.size(path); try (ReadableByteChannel channel = Files.newByteChannel(path)) { return fromChannel(channel, charset, DEFAULT_BUFFER_SIZE, CodingErrorAction.REPLACE, path.toString(), size); } } /** * Creates a {@link CharStream} given a string containing a * path to a UTF-8 file on disk. * * Reads the entire contents of the file into the result before returning. */ public static CharStream fromFileName(String fileName) throws IOException { return fromPath(Paths.get(fileName), StandardCharsets.UTF_8); } /** * Creates a {@link CharStream} given a string containing a * path to a file on disk and the charset of the bytes * contained in the file. * * Reads the entire contents of the file into the result before returning. */ public static CharStream fromFileName(String fileName, Charset charset) throws IOException { return fromPath(Paths.get(fileName), charset); } /** * Creates a {@link CharStream} given an opened {@link InputStream} * containing UTF-8 bytes. * * Reads the entire contents of the {@code InputStream} into * the result before returning, then closes the {@code InputStream}. */ public static CharStream fromStream(InputStream is) throws IOException { return fromStream(is, StandardCharsets.UTF_8); } /** * Creates a {@link CharStream} given an opened {@link InputStream} and the * charset of the bytes contained in the stream. * * Reads the entire contents of the {@code InputStream} into * the result before returning, then closes the {@code InputStream}. */ public static CharStream fromStream(InputStream is, Charset charset) throws IOException { return fromStream(is, charset, -1); } public static CharStream fromStream(InputStream is, Charset charset, long inputSize) throws IOException { try (ReadableByteChannel channel = Channels.newChannel(is)) { return fromChannel(channel, charset, DEFAULT_BUFFER_SIZE, CodingErrorAction.REPLACE, IntStream.UNKNOWN_SOURCE_NAME, inputSize); } } /** * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} * containing UTF-8 bytes. * * Reads the entire contents of the {@code channel} into * the result before returning, then closes the {@code channel}. */ public static CharStream fromChannel(ReadableByteChannel channel) throws IOException { return fromChannel(channel, StandardCharsets.UTF_8); } /** * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} and the * charset of the bytes contained in the channel. * * Reads the entire contents of the {@code channel} into * the result before returning, then closes the {@code channel}. */ public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException { return fromChannel(channel, DEFAULT_BUFFER_SIZE, CodingErrorAction.REPLACE, IntStream.UNKNOWN_SOURCE_NAME); } /** * Creates a {@link CharStream} given a {@link Reader}. Closes * the reader before returning. */ public static CodePointCharStream fromReader(Reader r) throws IOException { return fromReader(r, IntStream.UNKNOWN_SOURCE_NAME); } /** * Creates a {@link CharStream} given a {@link Reader} and its * source name. Closes the reader before returning. */ public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException { try { CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(DEFAULT_BUFFER_SIZE); CharBuffer charBuffer = CharBuffer.allocate(DEFAULT_BUFFER_SIZE); while ((r.read(charBuffer)) != -1) { charBuffer.flip(); codePointBufferBuilder.append(charBuffer); charBuffer.compact(); } return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName); } finally { r.close(); } } /** * Creates a {@link CharStream} given a {@link String}. */ public static CodePointCharStream fromString(String s) { return fromString(s, IntStream.UNKNOWN_SOURCE_NAME); } /** * Creates a {@link CharStream} given a {@link String} and the {@code sourceName} * from which it came. */ public static CodePointCharStream fromString(String s, String sourceName) { // Initial guess assumes no code points > U+FFFF: one code // point for each code unit in the string CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(s.length()); // TODO: CharBuffer.wrap(String) rightfully returns a read-only buffer // which doesn't expose its array, so we make a copy. CharBuffer cb = CharBuffer.allocate(s.length()); cb.put(s); cb.flip(); codePointBufferBuilder.append(cb); return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName); } /** * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} * containing UTF-8 bytes. * * Reads the entire contents of the {@code channel} into * the result before returning, then closes the {@code channel}. */ public static CodePointCharStream fromChannel(ReadableByteChannel channel, int bufferSize, CodingErrorAction decodingErrorAction, String sourceName) throws IOException { return fromChannel(channel, StandardCharsets.UTF_8, bufferSize, decodingErrorAction, sourceName, -1); } public static CodePointCharStream fromChannel(ReadableByteChannel channel, Charset charset, int bufferSize, CodingErrorAction decodingErrorAction, String sourceName, long inputSize) throws IOException { try { ByteBuffer utf8BytesIn = ByteBuffer.allocate(bufferSize); CharBuffer utf16CodeUnitsOut = CharBuffer.allocate(bufferSize); if (inputSize == -1) { inputSize = bufferSize; } else if (inputSize > Integer.MAX_VALUE) { // ByteBuffer et al don't support long sizes throw new IOException( String.format("inputSize %d larger than max %d", inputSize, Integer.MAX_VALUE)); } CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder((int) inputSize); CharsetDecoder decoder = charset.newDecoder().onMalformedInput(decodingErrorAction) .onUnmappableCharacter(decodingErrorAction); boolean endOfInput = false; while (!endOfInput) { int bytesRead = channel.read(utf8BytesIn); endOfInput = (bytesRead == -1); utf8BytesIn.flip(); CoderResult result = decoder.decode(utf8BytesIn, utf16CodeUnitsOut, endOfInput); if (result.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) { result.throwException(); } utf16CodeUnitsOut.flip(); codePointBufferBuilder.append(utf16CodeUnitsOut); utf8BytesIn.compact(); utf16CodeUnitsOut.compact(); } // Handle any bytes at the end of the file which need to // be represented as errors or substitution characters. CoderResult flushResult = decoder.flush(utf16CodeUnitsOut); if (flushResult.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) { flushResult.throwException(); } utf16CodeUnitsOut.flip(); codePointBufferBuilder.append(utf16CodeUnitsOut); CodePointBuffer codePointBuffer = codePointBufferBuilder.build(); return CodePointCharStream.fromBuffer(codePointBuffer, sourceName); } finally { channel.close(); } } }