Source code

Java tutorial


Here is the source code for dk.statsbiblioteket.util.LineReader.


/* $Id:,v 1.3 2007/12/04 13:22:01 mke Exp $
 * $Revision: 1.3 $
 * $Date: 2007/12/04 13:22:01 $
 * $Author: mke $
 * The SB Util Library.
 * Copyright (C) 2005-2007  The State and University Library of Denmark
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * The State and University Library of Denmark
 * CVS:  $Id:,v 1.3 2007/12/04 13:22:01 mke Exp $
 */
package dk.statsbiblioteket.util;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Comparator;

/**
 * A Java NIO based high-performance, large file-size enabled, random seek
 * capable line reader. Use only for good.
 * The reader assumes UTF-8 encoding when performing String-related operations.
 * It is substantially faster than {@link RandomAccessFile} (about a factor 5
 * for most operations). It can be used as a replacement for RandomAccessFile.
 * Important: writeUTF is not supported. This is because the relevant converter
 * method {@link DataOutputStream#writeUTF(String, DataOutput)} is
 * package private.
 * This class is not synchronised.
 */
@QAInfo(state = QAInfo.State.QA_NEEDED, level = QAInfo.Level.NORMAL)
public class LineReader implements DataInput, DataOutput {
    private static Log log = LogFactory.getLog(LineReader.class);

    protected static final int BUFFER_SIZE = 8192; // TODO: Performance-tweak this

    private RandomAccessFile input;
    //    private FileInputStream input;
     * The channel that controls reads.
    private FileChannel channelIn;
     * Indicates whether the {@link #channelIn} is opened. All reading methods
     * automatically opens channelIn, if inOpen is false.
    private boolean inOpen = false;

    private RandomAccessFile output;
    //    private FileOutputStream output;
     * The channel that controls writes.
    private FileChannel channelOut;
     * Indicates whether the {@link #channelOut} is opened. All write methods
     * automatically opens channelOut, if outOpen is false.
    private boolean outOpen = false;

     * The buffer containes cached bytes, either read from the file or added by
     * write-calls.
    private ByteBuffer buffer;
     * The absolute position of the beginning of the buffer.
    private long bufferStart = -1;
     * The highest positined byte in the buffer that has been changed and waits
     * for {@link #flush}.
    private int maxBufferPos = 0;

     * The File that the Linereader works on.
    private File file;

     * The current position in the {@link #file}. Reads and writes will occur
     * from this position and forward.
     * @see #getPosition().
     * @see #seek(long).
    private long position = 0;

     * Stated whether the buffer has been changed and needs to be flushed.
    private boolean dirty = false;
     * The size of the file. Cached to avoid making system-calls for each
     * request for file size.
     * @see #length().
    private long fileSize = -1;

     * States whether write-operations are allowed or not.
    private boolean writable = false;

     * States whether all write operations should be automatically followed by
     * a flush.
     * Note: The current implementation always flushes. This is expected to
     * change.
    // TODO: Make synchronize make a difference
    @SuppressWarnings({ "UnusedDeclaration", "FieldCanBeLocal" })
    private boolean synchronize = false;
    private int bufferSize;

    /**
     * Connects to the given file with the given mode. This corresponds to RandomAccessFile(File, String).
     * @param file the file to connect to.
     * @param mode the mode to use. Valid values are
     *             "r": read-only.
     *             "rw": read and write.
     *             "rws": read and write and synchronize after each write.
     *             "rwd": read and write and synchronize after each write.
     * @throws IOException if the file could not be accessed.
     */
    public LineReader(File file, String mode) throws IOException {
        // A write mode on a missing file creates the file up front.
        if (mode != null && mode.contains("w") && !file.exists()) {
            log.trace("Creating file '" + file + "'");
            if (!file.createNewFile()) {
                throw new IllegalStateException(
                        "File '" + file + "' already exists even though it was checked that is wasn't. "
                                + "Possible cause is concurrent access to the same file");
            }
        }
        if (mode == null) {
            log.debug("Mode == null, defaulting to read-only");
        } else if (mode.equals("r")) {
            writable = false; // Just to make sure
        } else if (mode.equals("rw")) {
            writable = true;
            // TODO: Check what the difference is between rws and rwd
        } else if (mode.equals("rws")) {
            writable = true;
            synchronize = true;
        } else if (mode.equals("rwd")) {
            writable = true;
            synchronize = true;
        } else {
            throw new IllegalArgumentException(
                    "The mode '" + mode + "' is " + "illegal. Legal values are " + "'r', 'rw', 'rws' and 'rwd");
        }
        if (writable && !file.canWrite()) {
            throw new IOException("The file '" + file + "' is read-only");
        }
        this.file = file;
        // Allocate the buffer here so that reads and writes work without an
        // explicit setBufferSize call; the buffer is dereferenced unconditionally
        // by the read and write methods.
        this.bufferSize = BUFFER_SIZE;
        this.buffer = ByteBuffer.allocate(BUFFER_SIZE);
    }

    /**
     * The buffer size affects performance greatly.
     * Set this low (hundreds of bytes) if the file is large, the access very
     * random and the reads small.
     * Set this high (thousands of bytes) if the file is medium, the access
     * clustered and/or the reads are large.
     * Set this very high (the file size) if the file is small.
     * @param bufferSize the size of the buffer.
     * @throws IOException if an I/O exception occurred while changing the
     *                     buffer.
     */
    public void setBufferSize(int bufferSize) throws IOException {
        this.bufferSize = bufferSize;
        buffer = ByteBuffer.allocate(bufferSize);

    /**
     * @return the absolute position within the file.
     */
    public long getPosition() {
        return position;

    /**
     * An alias for {@link #getPosition}.
     * @return the absolute position within the file.
     */
    public long getFilePointer() {
        return getPosition();

    /**
     * Sets the absolute position within the file.
     * @param position the position in the given file. This must be equal to or
     *                 less than the file size.
     * @throws IOException if the position is not within the range of the file.
     */
    public void seek(long position) throws IOException {
        //log.trace("seek(" + position + ") called");
        if (position > length()) {
            //noinspection DuplicateStringLiteralInspection
            throw new EOFException(
                    "Cannot set position " + position + " as the file size is only " + length() + " bytes");
        if (position < 0) {
            throw new IllegalArgumentException("The position cannot be negative");
        if (bufferStart != -1) {
            if (position < bufferStart || position >= bufferStart + bufferSize) {
            } else {
                // The new position is inside the existing buffer
                try {
                    buffer.position((int) (position - bufferStart));
                } catch (IllegalArgumentException e) {
                    throw new IllegalArgumentException(
                            "Trying to set the buffer position to " + position + " - " + bufferStart + " = "
                                    + (position - bufferStart) + " with a buffer of size " + getBufferSize(),
        this.position = position;

    /**
     * The length of this file in bytes.
     * @return the length of this file.
     */
    public long length() {
        if (fileSize == -1) {
            fileSize = file.length();
        return Math.max(fileSize, dirty ? bufferStart + maxBufferPos : fileSize);

    /**
     * Reset the position in the file to 0 and free any open file handles.
     * Later access to the file is allowed, as it is automatically opened.
     * This is equivalent to {@link #close}.
     * @throws IOException if the file could not be closed.
     */
    public void reset() throws IOException {

    /**
     * Ensure that the {@link #channelIn} is ready for reading.
     * @throws IOException if channelIn could not be opened.
     */
    private void checkInputFile() throws IOException {
        if (inOpen) {
        log.trace("Opening input channel for '" + file + "'");
        //        input = new FileInputStream(file);
        input = new RandomAccessFile(file, "r");
        channelIn = input.getChannel();
        inOpen = true;

    /**
     * Ensure that the {@link #channelOut} is ready for writing.
     * @throws IOException           if channelOut could not be opened.
     * @throws IllegalStateException if the file was opened in read-only mode.
     */
    private void checkOutputFile() throws IOException {
        if (!writable) {
            throw new IllegalStateException(String.format("The file '%s' has been opened in read-only mode", file));
        if (outOpen) {
        log.trace("Opening output channel for '" + file + "'");
        output = new RandomAccessFile(file, "rw");
        //        output = new FileOutputStream(file, true);
        channelOut = output.getChannel();
        outOpen = true;

    /**
     * Reset the position in the file to 0 and free any open file handles.
     * Later access to the file is allowed, as it is automatically opened.
     * This is equivalent to {@link #reset}.
     * @throws IOException if the file could not be closed.
     */
    public void close() throws IOException {
        position = 0;

    /**
     * Free any open file handles, but do not reset the position.
     * @throws IOException if the file could not be closed.
     */
    private void closeNoReset() throws IOException {
        if (channelIn != null) {
        if (input != null) {
        inOpen = false;
        if (channelOut != null) {
        if (output != null) {
        outOpen = false;

     * Fill the buffer from the file at the current position, if it is not
     * already filled.
     * Note: This does not check whether the position of the buffer corresponds
     * to the global position.
     * @throws IOException if the buffer could not be filled.
    private void checkBuffer() throws IOException {
        if (bufferStart == -1) {
            if (dirty) {
                log.error("The buffer should not be dirty when bufferStart == -1");
            log.trace("checkBuffer: Seeking to position " + position);
            buffer.limit(buffer.capacity()); // Fill the buffer, please
            int readBytes =, position);
            log.trace("checkBuffer: mapped " + readBytes + " bytes to buffer");
            //            buffer.flip();
            //            buffer.limit(buffer.capacity());
            buffer.position(0); // Redundant?
            bufferStart = position;

    /**
     * Flush the contents of the buffer and mark it as invalid. Subsequent calls
     * to {@link #checkBuffer} will make the buffer valid again.
     * @throws IOException if the flushing failed.
     */
    private void invalidateBuffer() throws IOException {
        bufferStart = -1;

    private void flushIfNeeded() throws IOException {
        // TODO: Only flush if synchronize is true

     * Flush any pending updates to disk.
     * @throws IOException if the buffer could not be flushed.
    public void flush() throws IOException {
        if (dirty) {
            assert bufferStart != -1 : "When the buffer is dirty, bufferStart should be >= 0";
            log.trace("Storing the buffer to disk");
            //            System.out.println(maxBufferPos + " " + bufferStart);
            if (log.isTraceEnabled()) {
                //noinspection DuplicateStringLiteralInspection
                log.trace("flush: bufferStart=" + bufferStart + ", maxBufferPos=" + maxBufferPos + ", buffer.limit="
                        + buffer.limit() + ", position=" + position);
            buffer.position(maxBufferPos); // Limit instead?
            dirty = false;
            buffer.clear(); // Do we need to do this?
            bufferStart = -1; // Do we need to do this?
            fileSize = -1; // Can we avoid this?
            maxBufferPos = 0;

    public File getFile() {
        return file;

    public int getBufferSize() {
        return bufferSize;

    /**
     * @return true if the reader has reached End Of File. Note: It is possible
     *         to perform writes, even if EOF has been reached.
     */
    public boolean eof() {
        return position >= length();

    public int read() throws IOException {
        try {
            return readByte() & 0xFF;
        } catch (EOFException e) {
            return -1;

    /* ***************************** Readers ***********************************
     * These conform to the {@link DataInput} interface. JavaDocs are only     *
     * added where the behaviour is not as would be expected.                  *
     ************************************************************************ */

    public boolean readBoolean() throws IOException {
        return readByte() != 0;

    public int readUnsignedByte() throws IOException {
        return readByte() & 0xFF;

    public short readShort() throws IOException {
        return (short) (readByte() << 8 | readByte());

    public int readUnsignedShort() throws IOException {
        return readByte() << 8 | readByte();

    public char readChar() throws IOException {
        return (char) (readByte() << 8 | readByte());

    private byte[] readBuf = new byte[8];

    public int readInt() throws IOException {
        readFully(readBuf, 0, 4);
        return (readBuf[0] & 0xFF) << 24 | (readBuf[1] & 0xFF) << 16 | (readBuf[2] & 0xFF) << 8 | readBuf[3];

    public long readLong() throws IOException {
        readFully(readBuf, 0, 8);
        return (long) (readBuf[0] & 0xFF) << 56 | (long) (readBuf[1] & 0xFF) << 48
                | (long) (readBuf[2] & 0xFF) << 40 | (long) (readBuf[3] & 0xFF) << 32
                | (long) (readBuf[4] & 0xFF) << 24 | (long) (readBuf[5] & 0xFF) << 16
                | (long) (readBuf[6] & 0xFF) << 8 | readBuf[7];

    public float readFloat() throws IOException {
        return Float.intBitsToFloat(readInt());

    public double readDouble() throws IOException {
        return Double.longBitsToDouble(readInt());

    public byte readByte() throws IOException {
        //log.trace("readByte entered");
        if (eof()) {
            throw new EOFException("Attempted to read past EOF");
        byte b = buffer.get();
        if (position >= bufferStart + bufferSize) {
        return b;

    private ByteArrayOutputStream lineBuffer = new ByteArrayOutputStream(400);

     * Reads a line from the file, assuming UTF-8 and 0x0A as line break.
     * Note that this differs from {@link RandomAccessFile#readLine} with
     * regards to encoding and line breaks.
     * @return the text at the current position, until the next line break.
     *         If the line is the last in the file, all characters up to the
     *         end of the file will be returned.
     * @throws EOFException if no characters could be read.
     * @throws IOException  if a line could not be read.
    // TODO: Extend this to handle different line breaks
    public String readLine() throws IOException {
        //        log.trace("readLine entered");
        byte next;
        while (true) {
            try {
                next = readByte();
            } catch (EOFException e) {
                log.trace("Reached EOF in readLine()");
            if (next == 0x0A) {
                if (log.isTraceEnabled()) {
                    log.trace("Read " + lineBuffer.size() + " bytes in readLine");
        return lineBuffer.toString("utf-8");

    public String readUTF() throws IOException {
        return DataInputStream.readUTF(this);

    public void readFully(byte[] buf) throws IOException {
        readFully(buf, 0, buf.length);

    public void readFully(byte[] buf, int offset, int length) throws IOException {
        int got = read(buf, offset, length);
        if (got < length) {
            throw new EOFException("Reached end of file '" + file + "' at " + position + " with " + (length - got)
                    + " bytes yet to read");
        if (log.isTraceEnabled()) {
            log.trace("Read " + length + " bytes from file '" + file + "' from offset " + (position - length)
                    + " to " + position);

    public int skipBytes(int n) throws IOException {
        long skip = Math.min(n, length() - position);
        log.trace("Skipping " + skip + " bytes out of " + n + " wanted");
        seek(position + skip);
        return (int) skip;

    /**
     * Reads up to buf.length bytes into buf. If an EOF is reached before the
     * buf is filled, no exception is thrown.
     * @param buf the buffer to fill.
     * @return the amount of bytes read.
     * @throws EOFException if the End Of File was reached before any bytes
     *                      could be read.
     * @throws IOException  if an I/O error occurred.
     */
    public int read(byte[] buf) throws IOException {
        return read(buf, 0, buf.length);

    /**
     * Reads up to length bytes into buf.
     * @param buf    the buffer to fill.
     * @param offset where to start filling the buffer.
     * @param length the maximum amount of bytes to read.
     * @return the amount of bytes read.
     * @throws EOFException if the End Of File was reached before any bytes
     *                      could be read.
     * @throws IOException  if an I/O error occurred.
     */
    public int read(byte[] buf, int offset, int length) throws IOException {
        int read = 0;
        while (read < length) {
            if (eof()) {
                return read == 0 ? -1 : read;
            buf[offset++] = readByte();
        return read;

    /* **************************** Writers ************************************
     * These conform to the {@link DataOutput} interface. JavaDocs are only    *
     * added where the behaviour is not as would be expected.                  *
     ************************************************************************ */

    /**
     * Convert the given string to bytes in UTF-8 representation and write
     * this.
     * @param str the String to write to disk.
     * @throws IOException if the bytes could not be written.
     */
    public void write(String str) throws IOException {

    public void write(int value) throws IOException {
        buffer.put((byte) (value & 0xFF));
        dirty = true;
        maxBufferPos = Math.max(maxBufferPos, buffer.position());
        position += 1;

    public void write(byte[] buffer) throws IOException {
        write(buffer, 0, buffer.length);

    public void write(byte[] buf, int offset, int length) throws IOException {
        if (offset + length > buf.length) {
            throw new IllegalArgumentException(
                    "Out of bounds: buf.length=" + buf.length + " offset=" + offset + " length=" + length);
        log.trace("write: Writing " + (length - offset) + " bytes at position " + position);
        int left = length;
        while (left > 0) {
            int writeLength = Math.min(left, bufferSize - buffer.position());
            if (log.isTraceEnabled()) {
                //noinspection DuplicateStringLiteralInspection
                log.trace("write: buf.length=" + buf.length + ", offset=" + offset + ", length=" + length
                        + ", writeLength=" + writeLength + ", bufferStart=" + bufferStart + ", buffer.position()="
                        + buffer.position() + ", position=" + position);
            try {
                buffer.put(buf, offset, writeLength);
            } catch (IndexOutOfBoundsException e) {
                throw new IOException("Buffer break while writing " + writeLength + " bytes from offset " + offset
                        + " in a buf with length " + buf.length);
            left -= writeLength;
            offset += writeLength;
            maxBufferPos = Math.max(maxBufferPos, buffer.position());
            dirty = true;
            position += writeLength;
            fileSize += writeLength; // TODO: This seems wrong!
        if (log.isTraceEnabled()) {
            log.trace("write: Wrote " + length + " bytes to file '" + file + "'");

    public void writeBoolean(boolean v) throws IOException {
        write(v ? 1 : 0);

    public void writeByte(int v) throws IOException {

    private byte[] outBytes = new byte[8];

    public void writeShort(int v) throws IOException {
        outBytes[0] = (byte) (0xff & v >> 8);
        outBytes[1] = (byte) (0xff & v);
        write(outBytes, 0, 2);

    public void writeChar(int v) throws IOException {
        outBytes[0] = (byte) (0xff & v >> 8);
        outBytes[1] = (byte) (0xff & v);
        write(outBytes, 0, 2);

    public void writeInt(int v) throws IOException {
        outBytes[0] = (byte) (0xff & v >> 24);
        outBytes[1] = (byte) (0xff & v >> 16);
        outBytes[2] = (byte) (0xff & v >> 8);
        outBytes[3] = (byte) (0xff & v);
        write(outBytes, 0, 4);

    public void writeLong(long v) throws IOException {
        outBytes[0] = (byte) (0xff & v >> 56);
        outBytes[1] = (byte) (0xff & v >> 48);
        outBytes[2] = (byte) (0xff & v >> 40);
        outBytes[3] = (byte) (0xff & v >> 32);
        outBytes[4] = (byte) (0xff & v >> 24);
        outBytes[5] = (byte) (0xff & v >> 16);
        outBytes[6] = (byte) (0xff & v >> 8);
        outBytes[7] = (byte) (0xff & v);
        write(outBytes, 0, 8);

    public void writeFloat(float v) throws IOException {

    public void writeDouble(double v) throws IOException {

    public void writeBytes(String s) throws IOException {
        char[] cBuf = s.toCharArray();
        byte[] bBuf = new byte[s.length()];
        for (int i = 0; i < s.length(); i++) {
            bBuf[i] = (byte) (cBuf[i] & 0xFF);

    public void writeChars(String s) throws IOException {
        char[] cBuf = s.toCharArray();
        byte[] bBuf = new byte[s.length() * 2];
        for (int i = 0; i < s.length(); i++) {
            bBuf[i * 2] = (byte) (cBuf[i] >> 8 & 0xFF);
            bBuf[i * 2 + 1] = (byte) (cBuf[i] & 0xFF);

    public void writeUTF(String str) throws IOException {
        throw new UnsupportedEncodingException(
                "This is not supported as the " + "necessary util is package " + "private in DataOutputStream");

    /**
     * Find the start-position of a line matching the given query.
     * A binary-search is used, thus requiring the user of the LineReader to
     * maintain specific structure and a matching comparator.
     * The expected structure is UTF-8 with new-line {@code "\n"} as
     * line-delimiters. As the byte {@code 0x0A} for new-line is never part
     * of a valid multi-byte UTF-8 character this should pose no problems.
     * Searching for an empty line is not supported. Escaping of line breaks is
     * the responsibility of the user.
     * Recommendation: Call {@link #setBufferSize(int)} with an amount
     * corresponding to the line-length. Keep in mind that binary searching
     * often results in a lot of lookups around the same position at the end
     * of the search, so choosing the average length of a single line as the
     * buffer size is probably too small. If the lines are short (&lt; 20 chars),
     * use a value such as 400. If the lines are long (~100 chars), go for
     * 1000 or 2000. If the lines are very long (1000+), consider 4000 or 8000.
     * These are soft guidelines as the best values also depend on the
     * characteristics of the underlying storage: SSDs will normally benefit the
     * most from relatively small values, while conventional harddisks are
     * better off with larger values as they minimize seeks.
     * @param comparator used for the binary search. If the comparator is null, the default String.compareTo is used.
     *                   The comparator will be used with compare(query, line).
     * @param query      the element to look for. If comparator is null, this should be a full line.
     * @return the index of the query or {@code -(insertion point)-1} if it could not be found.
     * @throws IOException if reads of the underlying file failed.
     */
    public long binaryLineSearch(Comparator<String> comparator, String query) throws IOException {
        long low = 0;
        long high = length() - 1;

        while (low <= high) {
            long mid = (low + high) >>> 1;

            if (mid != 0) {
                //noinspection StatementWithEmptyBody
                while (!eof() && readByte() != '\n')
            if (eof()) {
                high = mid - 1;
                //return (-1 * getPosition()) - 1;

            // Remember the line start position to return if we have a match
            long lineStart = getPosition();
            String line = readLine();
            int cmp = comparator == null ? query.compareTo(line) :, line);

            // Halve or return
            if (cmp < 0) {
                high = mid - 1;
            } else if (cmp > 0) {
                low = mid + 1;
            } else {
                return lineStart;

        return -(low + 1); // TODO: Should this be based on lineStart?