alluxio.hadoop.HdfsFileInputStream.java Source code

Introduction

Here is the source code for alluxio.hadoop.HdfsFileInputStream.java. The class adapts Alluxio's FileInStream to Hadoop's input-stream interfaces (Seekable and PositionedReadable), and falls back to reading the file directly from HDFS when the Alluxio read fails.
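
The class is not normally constructed by hand; it comes back wrapped in an FSDataInputStream when a file is opened through Alluxio's Hadoop-compatible FileSystem. Below is a minimal usage sketch, assuming an Alluxio master at the default alluxio://localhost:19998 address and the alluxio:// scheme mapped to alluxio.hadoop.FileSystem; the path /data/input.txt is hypothetical.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadThroughAlluxio {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumption: map the alluxio:// scheme to the Alluxio Hadoop client.
        conf.set("fs.alluxio.impl", "alluxio.hadoop.FileSystem");
        FileSystem fs = FileSystem.get(URI.create("alluxio://localhost:19998/"), conf);
        // open() returns an FSDataInputStream; with the Alluxio client it wraps
        // an HdfsFileInputStream like the one listed below.
        try (FSDataInputStream in = fs.open(new Path("/data/input.txt"))) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = in.read(buf)) != -1) {
                System.out.write(buf, 0, n);
            }
            System.out.flush();
        }
    }
}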

Source

/*
 * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
 * (the "License"). You may not use this work except in compliance with the License, which is
 * available at www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied, as more fully set forth in the License.
 *
 * See the NOTICE file distributed with this work for information regarding copyright ownership.
 */

package alluxio.hadoop;

import alluxio.AlluxioURI;
import alluxio.Configuration;
import alluxio.Constants;
import alluxio.PropertyKey;
import alluxio.client.file.FileInStream;
import alluxio.client.file.FileSystem;
import alluxio.client.file.URIStatus;
import alluxio.client.file.options.OpenFileOptions;
import alluxio.exception.AlluxioException;
import alluxio.exception.ExceptionMessage;
import alluxio.exception.FileDoesNotExistException;
import alluxio.util.io.BufferUtils;

import com.google.common.primitives.Ints;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem.Statistics;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import javax.annotation.concurrent.NotThreadSafe;

/**
 * An input stream for reading a file through Alluxio. Reads are served from the Alluxio
 * {@link FileInStream} when possible, and fall back to reading the file directly from the
 * under storage system (HDFS) when the Alluxio read fails.
 */
@NotThreadSafe
public class HdfsFileInputStream extends InputStream implements Seekable, PositionedReadable {
    private static final Logger LOG = LoggerFactory.getLogger(Constants.LOGGER_TYPE);

    private long mCurrentPosition;
    private Path mHdfsPath;
    private org.apache.hadoop.conf.Configuration mHadoopConf;
    private int mHadoopBufferSize;
    private Statistics mStatistics;
    private URIStatus mFileInfo;

    private FSDataInputStream mHdfsInputStream = null;

    private FileInStream mAlluxioFileInputStream = null;

    private boolean mClosed = false;

    private int mBufferLimit = 0;
    private int mBufferPosition = 0;
    private byte[] mBuffer;

    /**
     * Constructs a new stream for reading a file from HDFS.
     *
     * @param uri the Alluxio file URI
     * @param conf Hadoop configuration
     * @param bufferSize the buffer size
     * @param stats filesystem statistics
     * @throws IOException if the underlying file does not exist or its stream cannot be created
     */
    public HdfsFileInputStream(AlluxioURI uri, org.apache.hadoop.conf.Configuration conf, int bufferSize,
            org.apache.hadoop.fs.FileSystem.Statistics stats) throws IOException {
        LOG.debug("HdfsFileInputStream({}, {}, {}, {}, {})", uri, conf, bufferSize, stats);
        long bufferBytes = Configuration.getBytes(PropertyKey.USER_FILE_BUFFER_BYTES);
        mBuffer = new byte[Ints.checkedCast(bufferBytes) * 4];
        mCurrentPosition = 0;
        FileSystem fs = FileSystem.Factory.get();
        mHadoopConf = conf;
        mHadoopBufferSize = bufferSize;
        mStatistics = stats;
        try {
            mFileInfo = fs.getStatus(uri);
            mHdfsPath = new Path(mFileInfo.getUfsPath());
            mAlluxioFileInputStream = fs.openFile(uri, OpenFileOptions.defaults());
        } catch (FileDoesNotExistException e) {
            throw new FileNotFoundException(ExceptionMessage.HDFS_FILE_NOT_FOUND.getMessage(mHdfsPath, uri));
        } catch (AlluxioException e) {
            throw new IOException(e);
        }
    }

    @Override
    public int available() throws IOException {
        if (mClosed) {
            throw new IOException("Cannot query available bytes from a closed stream.");
        }
        if (mAlluxioFileInputStream == null) {
            // The Alluxio stream has been abandoned after a read failure; the best available
            // estimate is the number of bytes remaining in the file.
            return (int) Math.min(Integer.MAX_VALUE, mFileInfo.getLength() - mCurrentPosition);
        }
        return (int) mAlluxioFileInputStream.remaining();
    }

    @Override
    public void close() throws IOException {
        if (mAlluxioFileInputStream != null) {
            mAlluxioFileInputStream.close();
        }
        if (mHdfsInputStream != null) {
            mHdfsInputStream.close();
        }
        mClosed = true;
    }

    /**
     * Sets {@link #mHdfsInputStream} to a stream from the under storage system with the stream
     * starting at {@link #mCurrentPosition}.
     *
     * @throws IOException if opening the file fails
     */
    // TODO(calvin): Consider removing this when the recovery logic is available in FileInStream
    private void getHdfsInputStream() throws IOException {
        if (mHdfsInputStream == null) {
            org.apache.hadoop.fs.FileSystem fs = mHdfsPath.getFileSystem(mHadoopConf);
            mHdfsInputStream = fs.open(mHdfsPath, mHadoopBufferSize);
            mHdfsInputStream.seek(mCurrentPosition);
        }
    }

    /**
     * Sets {@link #mHdfsInputStream} to a stream from the under storage system, positioned at
     * the given position. {@link #mCurrentPosition} is not updated.
     *
     * @param position the position in the file to seek to
     * @throws IOException if opening the file fails
     */
    private void getHdfsInputStream(long position) throws IOException {
        if (mHdfsInputStream == null) {
            org.apache.hadoop.fs.FileSystem fs = mHdfsPath.getFileSystem(mHadoopConf);
            mHdfsInputStream = fs.open(mHdfsPath, mHadoopBufferSize);
        }
        mHdfsInputStream.seek(position);
    }

    @Override
    public long getPos() throws IOException {
        return mCurrentPosition;
    }

    @Override
    public int read() throws IOException {
        if (mClosed) {
            throw new IOException("Cannot read from a closed stream.");
        }
        if (mAlluxioFileInputStream != null) {
            try {
                int ret = mAlluxioFileInputStream.read();
                if (ret != -1) {
                    if (mStatistics != null) {
                        mStatistics.incrementBytesRead(1);
                    }
                    // Only advance the position when a byte was actually read.
                    mCurrentPosition++;
                }
                return ret;
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                mAlluxioFileInputStream.close();
                mAlluxioFileInputStream = null;
            }
        }
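        // The Alluxio read failed (or the Alluxio stream was never available), so fall back
        // to reading the file directly from the under storage system.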
        getHdfsInputStream();
        return readFromHdfsBuffer();
    }

    @Override
    public int read(byte[] buffer) throws IOException {
        return read(buffer, 0, buffer.length);
    }

    @Override
    public int read(byte[] buffer, int offset, int length) throws IOException {
        if (mClosed) {
            throw new IOException("Cannot read from a closed stream.");
        }
        if (mAlluxioFileInputStream != null) {
            try {
                int ret = mAlluxioFileInputStream.read(buffer, offset, length);
                if (ret != -1) {
                    if (mStatistics != null) {
                        mStatistics.incrementBytesRead(ret);
                    }
                    // Only advance the position when bytes were actually read.
                    mCurrentPosition += ret;
                }
                return ret;
            } catch (IOException e) {
                LOG.error(e.getMessage(), e);
                mAlluxioFileInputStream.close();
                mAlluxioFileInputStream = null;
            }
        }

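        // The fallback path returns at most one byte per call: readFromHdfsBuffer() refills
        // mBuffer in bulk and each call drains a single byte from it. Returning fewer bytes
        // than requested is permitted by the InputStream contract.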
        getHdfsInputStream();
        int byteRead = readFromHdfsBuffer();
        // byteRead is an unsigned byte value (0-255); if it's -1 then we have hit EOF
        if (byteRead == -1) {
            return -1;
        }
        // Convert byteRead back to a signed byte
        buffer[offset] = (byte) byteRead;
        return 1;
    }

    @Override
    public synchronized int read(long position, byte[] buffer, int offset, int length) throws IOException {
        if (mClosed) {
            throw new IOException(ExceptionMessage.READ_CLOSED_STREAM.getMessage());
        }
        int ret = -1;
        long oldPos = getPos();
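        // The PositionedReadable contract requires that a positioned read leave the stream
        // position unchanged, so oldPos is restored in the finally blocks below.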
        if ((position < 0) || (position >= mFileInfo.getLength())) {
            return ret;
        }

        if (mAlluxioFileInputStream != null) {
            try {
                mAlluxioFileInputStream.seek(position);
                ret = mAlluxioFileInputStream.read(buffer, offset, length);
                if (mStatistics != null && ret != -1) {
                    mStatistics.incrementBytesRead(ret);
                }
                return ret;
            } finally {
                mAlluxioFileInputStream.seek(oldPos);
            }
        }

        try {
            getHdfsInputStream(position);
            ret = mHdfsInputStream.read(buffer, offset, length);
            if (mStatistics != null && ret != -1) {
                mStatistics.incrementBytesRead(ret);
            }
            return ret;
        } finally {
            if (mHdfsInputStream != null) {
                mHdfsInputStream.seek(oldPos);
            }
        }
    }

    /**
     * Similar to {@link #read()}: returns a single unsigned byte from the HDFS buffer, or -1
     * if there is no more data to be read. If the buffer is empty, it is first refilled from
     * the HDFS stream.
     *
     * @return the next value in the stream from 0 to 255, or -1 if there is no more data to read
     * @throws IOException if the bulk read from HDFS fails
     */
    private int readFromHdfsBuffer() throws IOException {
        if (mBufferPosition < mBufferLimit) {
            if (mStatistics != null) {
                mStatistics.incrementBytesRead(1);
            }
            mCurrentPosition++;
            return BufferUtils.byteToInt(mBuffer[mBufferPosition++]);
        }
        LOG.error("Reading from HDFS directly");
        while ((mBufferLimit = mHdfsInputStream.read(mBuffer)) == 0) {
            LOG.error("Read 0 bytes in readFromHdfsBuffer for {}", mHdfsPath);
        }
        if (mBufferLimit == -1) {
            return -1;
        }
        mBufferPosition = 0;
        if (mStatistics != null) {
            mStatistics.incrementBytesRead(1);
        }
        mCurrentPosition++;
        return BufferUtils.byteToInt(mBuffer[mBufferPosition++]);
    }

    @Override
    public void readFully(long position, byte[] buffer) throws IOException {
        readFully(position, buffer, 0, buffer.length);
    }

    @Override
    public void readFully(long position, byte[] buffer, int offset, int length) throws IOException {
        int n = 0; // total bytes read
        while (n < length) {
            int ret = read(position + n, buffer, offset + n, length - n);
            if (ret == -1) {
                throw new EOFException();
            }
            n += ret;
        }
    }

    /**
     * Seek to the given offset from the start of the file. The next {@link #read()} will be from that
     * location. Can't seek past the end of the file.
     *
     * @param pos the position to seek to
     * @throws IOException if the position is negative or exceeds the end of the file
     */
    @Override
    public void seek(long pos) throws IOException {
        if (pos == mCurrentPosition) {
            return;
        }

        if (pos < 0) {
            throw new IOException(ExceptionMessage.SEEK_NEGATIVE.getMessage(pos));
        }
        if (pos > mFileInfo.getLength()) {
            throw new IOException(ExceptionMessage.SEEK_PAST_EOF.getMessage(pos, mFileInfo.getLength()));
        }

        if (mAlluxioFileInputStream != null) {
            mAlluxioFileInputStream.seek(pos);
        } else {
            getHdfsInputStream(pos);
            // TODO(calvin): Optimize for the case when the data is still valid in the buffer
            // Invalidate buffer
            mBufferLimit = -1;
        }

        mCurrentPosition = pos;
    }

    /**
     * This method is not supported in {@link HdfsFileInputStream}.
     *
     * @param targetPos N/A
     * @return N/A
     * @throws IOException always
     */
    @Override
    public boolean seekToNewSource(long targetPos) throws IOException {
        throw new IOException(ExceptionMessage.NOT_SUPPORTED.getMessage());
    }
}
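
Because the stream implements PositionedReadable, callers can also issue stateless random-access reads that leave getPos() untouched. A brief sketch continuing from the example in the introduction; the 16-byte footer and its offset are hypothetical:

// Read a fixed-size footer at the end of the file without moving the
// sequential read position.
long fileLength = fs.getFileStatus(new Path("/data/input.txt")).getLen();
byte[] footer = new byte[16];
in.readFully(fileLength - 16, footer);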