org.antlr.v4.runtime.CharStreams.java Source code

Introduction

Here is the source code for org.antlr.v4.runtime.CharStreams.java
Source

/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

package org.antlr.v4.runtime;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/** This class represents the primary interface for creating {@link CharStream}s
 *  from a variety of sources as of 4.7.  The motivation was to support
 *  Unicode code points > U+FFFF.  {@link ANTLRInputStream} and
 *  {@link ANTLRFileStream} are now deprecated in favor of the streams created
 *  by this interface.
 *
 *  DEPRECATED: {@code new ANTLRFileStream("myinputfile")}
 *  NEW:        {@code CharStreams.fromFileName("myinputfile")}
 *
 *  WARNING: If you use both the deprecated and the new streams, you will see
 *  a nontrivial performance degradation. This speed hit is because the
 *  {@link Lexer}'s internal code goes from a monomorphic to megamorphic
 *  dynamic dispatch to get characters from the input stream. Java's
 *  on-the-fly compiler (JIT) is unable to perform the same optimizations
 *  so stick with either the old or the new streams, if performance is
 *  a primary concern. See the extreme debugging and spelunking
 *  needed to identify this issue in our timing rig:
 *
 *      https://github.com/antlr/antlr4/pull/1781
 *
 *  The ANTLR character streams still buffer all the input when you create
 *  the stream, as they have done for ~20 years. If you need unbuffered
 *  access, please note that it becomes challenging to create
 *  parse trees. The parse tree has to point to tokens which will either
 *  point into a stale location in an unbuffered stream or you have to copy
 *  the characters out of the buffer into the token. That defeats the purpose
 *  of unbuffered input. Per the ANTLR book, unbuffered streams are primarily
 *  useful for processing infinite streams *during the parse.*
 *
 *  The new streams also use 8-bit buffers when possible so this new
 *  interface supports character streams that use half as much memory
 *  as the old {@link ANTLRFileStream}, which assumed 16-bit characters.
 *
 *  A big shout out to Ben Hamilton (github bhamiltoncx) for his superhuman
 *  efforts across all targets to get true Unicode 3.1 support for U+10FFFF.
 *
 *  @since 4.7
 */
public final class CharStreams {
    /** Default capacity of the intermediate byte/char buffers used while reading input. */
    private static final int DEFAULT_BUFFER_SIZE = 4096;

    // Utility class; do not construct.
    private CharStreams() {
    }

    /**
     * Creates a {@link CharStream} given a path to a UTF-8
     * encoded file on disk.
     *
     * Reads the entire contents of the file into the result before returning.
     *
     * @param path the file to read
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened, or read
     */
    public static CharStream fromPath(Path path) throws IOException {
        return fromPath(path, StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given a path to a file on disk and the
     * charset of the bytes contained in the file.
     *
     * Reads the entire contents of the file into the result before returning.
     * Malformed or unmappable input is decoded with
     * {@link CodingErrorAction#REPLACE}. The source name of the resulting
     * stream is {@code path.toString()}.
     *
     * @param path the file to read
     * @param charset the charset used to decode the file's bytes
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened, or read, or if
     *         its size exceeds {@link Integer#MAX_VALUE} bytes
     */
    public static CharStream fromPath(Path path, Charset charset) throws IOException {
        // The file size is passed along as the initial capacity hint for the result buffer.
        long size = Files.size(path);
        try (ReadableByteChannel channel = Files.newByteChannel(path)) {
            return fromChannel(channel, charset, DEFAULT_BUFFER_SIZE, CodingErrorAction.REPLACE, path.toString(),
                    size);
        }
    }

    /**
     * Creates a {@link CharStream} given a string containing a
     * path to a UTF-8 file on disk.
     *
     * Reads the entire contents of the file into the result before returning.
     *
     * @param fileName the path of the file to read
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened, or read
     */
    public static CharStream fromFileName(String fileName) throws IOException {
        return fromPath(Paths.get(fileName), StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given a string containing a
     * path to a file on disk and the charset of the bytes
     * contained in the file.
     *
     * Reads the entire contents of the file into the result before returning.
     *
     * @param fileName the path of the file to read
     * @param charset the charset used to decode the file's bytes
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened, or read
     */
    public static CharStream fromFileName(String fileName, Charset charset) throws IOException {
        return fromPath(Paths.get(fileName), charset);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link InputStream}
     * containing UTF-8 bytes.
     *
     * Reads the entire contents of the {@code InputStream} into
     * the result before returning, then closes the {@code InputStream}.
     *
     * @param is the stream to read and close
     * @return a stream over the decoded contents of {@code is}
     * @throws IOException if reading or closing the stream fails
     */
    public static CharStream fromStream(InputStream is) throws IOException {
        return fromStream(is, StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link InputStream} and the
     * charset of the bytes contained in the stream.
     *
     * Reads the entire contents of the {@code InputStream} into
     * the result before returning, then closes the {@code InputStream}.
     *
     * @param is the stream to read and close
     * @param charset the charset used to decode the stream's bytes
     * @return a stream over the decoded contents of {@code is}
     * @throws IOException if reading or closing the stream fails
     */
    public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
        return fromStream(is, charset, -1);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link InputStream}, the
     * charset of the bytes contained in the stream, and the expected size of
     * the input.
     *
     * Reads the entire contents of the {@code InputStream} into
     * the result before returning, then closes the {@code InputStream}.
     * Malformed or unmappable input is decoded with
     * {@link CodingErrorAction#REPLACE}, and the resulting stream is named
     * {@link IntStream#UNKNOWN_SOURCE_NAME}.
     *
     * @param is the stream to read and close
     * @param charset the charset used to decode the stream's bytes
     * @param inputSize the expected number of bytes in the stream, used as the
     *        initial capacity hint for the result, or {@code -1} if unknown
     * @return a stream over the decoded contents of {@code is}
     * @throws IOException if reading or closing the stream fails, or if
     *         {@code inputSize} exceeds {@link Integer#MAX_VALUE}
     */
    public static CharStream fromStream(InputStream is, Charset charset, long inputSize) throws IOException {
        try (ReadableByteChannel channel = Channels.newChannel(is)) {
            return fromChannel(channel, charset, DEFAULT_BUFFER_SIZE, CodingErrorAction.REPLACE,
                    IntStream.UNKNOWN_SOURCE_NAME, inputSize);
        }
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
     * containing UTF-8 bytes.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     *
     * @param channel the channel to read and close
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading or closing the channel fails
     */
    public static CharStream fromChannel(ReadableByteChannel channel) throws IOException {
        return fromChannel(channel, StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} and the
     * charset of the bytes contained in the channel.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     * Malformed or unmappable input is decoded with
     * {@link CodingErrorAction#REPLACE}, and the resulting stream is named
     * {@link IntStream#UNKNOWN_SOURCE_NAME}.
     *
     * @param channel the channel to read and close
     * @param charset the charset used to decode the channel's bytes
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading or closing the channel fails
     */
    public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
        // Delegate to the charset-aware overload; the overload without a
        // charset parameter always decodes as UTF-8 and would ignore `charset`.
        return fromChannel(channel, charset, DEFAULT_BUFFER_SIZE, CodingErrorAction.REPLACE,
                IntStream.UNKNOWN_SOURCE_NAME, -1);
    }

    /**
     * Creates a {@link CharStream} given a {@link Reader}. Closes
     * the reader before returning.
     *
     * The resulting stream is named {@link IntStream#UNKNOWN_SOURCE_NAME}.
     *
     * @param r the reader to drain and close
     * @return a stream over all characters read from {@code r}
     * @throws IOException if reading or closing the reader fails
     */
    public static CodePointCharStream fromReader(Reader r) throws IOException {
        return fromReader(r, IntStream.UNKNOWN_SOURCE_NAME);
    }

    /**
     * Creates a {@link CharStream} given a {@link Reader} and its
     * source name. Closes the reader before returning.
     *
     * @param r the reader to drain and close
     * @param sourceName the name to associate with the resulting stream
     * @return a stream over all characters read from {@code r}
     * @throws IOException if reading or closing the reader fails
     */
    public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
        // try-with-resources (rather than finally) so that a failure in close()
        // is recorded as suppressed instead of masking a read failure.
        try (Reader reader = r) {
            CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(DEFAULT_BUFFER_SIZE);
            CharBuffer charBuffer = CharBuffer.allocate(DEFAULT_BUFFER_SIZE);
            while (reader.read(charBuffer) != -1) {
                // Switch the buffer to draining mode, hand the chars to the
                // builder, then switch back to filling mode for the next read.
                charBuffer.flip();
                codePointBufferBuilder.append(charBuffer);
                charBuffer.compact();
            }
            return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName);
        }
    }

    /**
     * Creates a {@link CharStream} given a {@link String}.
     *
     * The resulting stream is named {@link IntStream#UNKNOWN_SOURCE_NAME}.
     *
     * @param s the text to expose as a stream
     * @return a stream over the contents of {@code s}
     */
    public static CodePointCharStream fromString(String s) {
        return fromString(s, IntStream.UNKNOWN_SOURCE_NAME);
    }

    /**
     * Creates a {@link CharStream} given a {@link String} and the {@code sourceName}
     * from which it came.
     *
     * @param s the text to expose as a stream
     * @param sourceName the name to associate with the resulting stream
     * @return a stream over the contents of {@code s}
     */
    public static CodePointCharStream fromString(String s, String sourceName) {
        // Initial guess assumes no code points > U+FFFF: one code
        // point for each code unit in the string
        CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(s.length());
        // TODO: CharBuffer.wrap(String) rightfully returns a read-only buffer
        // which doesn't expose its array, so we make a copy.
        CharBuffer cb = CharBuffer.allocate(s.length());
        cb.put(s);
        cb.flip();
        codePointBufferBuilder.append(cb);
        return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
     * containing UTF-8 bytes, with explicit control over the buffer size, the
     * decoding error policy, and the source name.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     *
     * @param channel the channel to read and close
     * @param bufferSize the capacity of the intermediate read/decode buffers;
     *        must be positive
     * @param decodingErrorAction how malformed or unmappable input is handled
     * @param sourceName the name to associate with the resulting stream
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading or closing the channel fails, or if
     *         decoding fails while {@code decodingErrorAction} is
     *         {@link CodingErrorAction#REPORT}
     * @throws IllegalArgumentException if {@code bufferSize} is not positive
     */
    public static CodePointCharStream fromChannel(ReadableByteChannel channel, int bufferSize,
            CodingErrorAction decodingErrorAction, String sourceName) throws IOException {
        return fromChannel(channel, StandardCharsets.UTF_8, bufferSize, decodingErrorAction, sourceName, -1);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel},
     * the charset of its bytes, and explicit control over the buffer size, the
     * decoding error policy, the source name, and the expected input size.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     *
     * @param channel the channel to read and close
     * @param charset the charset used to decode the channel's bytes
     * @param bufferSize the capacity of the intermediate read/decode buffers;
     *        must be positive, and should be large enough to hold at least one
     *        complete encoded character of {@code charset}
     * @param decodingErrorAction how malformed or unmappable input is handled
     * @param sourceName the name to associate with the resulting stream
     * @param inputSize the expected number of bytes in the channel, used as the
     *        initial capacity hint for the result, or {@code -1} if unknown (in
     *        which case {@code bufferSize} is used as the hint)
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading or closing the channel fails, if
     *         {@code inputSize} exceeds {@link Integer#MAX_VALUE}, or if
     *         decoding fails while {@code decodingErrorAction} is
     *         {@link CodingErrorAction#REPORT}
     * @throws IllegalArgumentException if {@code bufferSize} is not positive
     */
    public static CodePointCharStream fromChannel(ReadableByteChannel channel, Charset charset, int bufferSize,
            CodingErrorAction decodingErrorAction, String sourceName, long inputSize) throws IOException {
        // try-with-resources (rather than finally) so that a failure in close()
        // is recorded as suppressed instead of masking a read/decode failure.
        try (ReadableByteChannel in = channel) {
            // A zero-capacity buffer can never receive data, so the read loop
            // below would never observe end-of-input. Negative sizes are
            // already rejected by ByteBuffer.allocate with the same exception.
            if (bufferSize <= 0) {
                throw new IllegalArgumentException("bufferSize must be positive: " + bufferSize);
            }
            ByteBuffer utf8BytesIn = ByteBuffer.allocate(bufferSize);
            CharBuffer utf16CodeUnitsOut = CharBuffer.allocate(bufferSize);
            if (inputSize == -1) {
                inputSize = bufferSize;
            } else if (inputSize > Integer.MAX_VALUE) {
                // ByteBuffer et al don't support long sizes
                throw new IOException(
                        String.format("inputSize %d larger than max %d", inputSize, Integer.MAX_VALUE));
            }
            CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder((int) inputSize);
            CharsetDecoder decoder = charset.newDecoder().onMalformedInput(decodingErrorAction)
                    .onUnmappableCharacter(decodingErrorAction);

            boolean endOfInput = false;
            while (!endOfInput) {
                int bytesRead = in.read(utf8BytesIn);
                endOfInput = (bytesRead == -1);
                // Drain what was read through the decoder; the final pass
                // (endOfInput == true) lets the decoder treat any trailing
                // partial sequence as malformed input.
                utf8BytesIn.flip();
                CoderResult result = decoder.decode(utf8BytesIn, utf16CodeUnitsOut, endOfInput);
                if (result.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
                    result.throwException();
                }
                utf16CodeUnitsOut.flip();
                codePointBufferBuilder.append(utf16CodeUnitsOut);
                // Keep any undecoded bytes / unconsumed chars for the next pass.
                utf8BytesIn.compact();
                utf16CodeUnitsOut.compact();
            }
            // Handle any bytes at the end of the file which need to
            // be represented as errors or substitution characters.
            CoderResult flushResult = decoder.flush(utf16CodeUnitsOut);
            if (flushResult.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
                flushResult.throwException();
            }
            utf16CodeUnitsOut.flip();
            codePointBufferBuilder.append(utf16CodeUnitsOut);

            CodePointBuffer codePointBuffer = codePointBufferBuilder.build();
            return CodePointCharStream.fromBuffer(codePointBuffer, sourceName);
        }
    }
}