Source code

Java tutorial


Here is the source code for HadoopFileSystemIntegrationHelper.java


/**
 * Copyright 2015 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.junit.Assert;

/**
 * Integration-test helper that carries out its file operations (rename, delete, mkdirs, read,
 * write, list) through a Hadoop {@link FileSystem} instance instead of talking to GCS directly.
 * GCS-style URIs and bucket/object names are first converted to Hadoop paths rooted at the file
 * system root reported by the {@code FileSystemDescriptor}.
 */
public class HadoopFileSystemIntegrationHelper extends GoogleCloudStorageFileSystemIntegrationHelper {

    // Hadoop file system under test; every helper operation in this class is issued against it.
    FileSystem ghfs;
    // Descriptor of the file system under test; this class reads its scheme (for statistics
    // lookups) and its file system root (for path conversion).
    FileSystemDescriptor ghfsFileSystemDescriptor;

    /** FS statistics mode. */
    /**
     * How the read/write helpers validate the byte counters reported by
     * {@code FileSystem.Statistics} after each operation.
     */
    public enum FileSystemStatistics {
        // No statistics available.
        NONE,

        // Statistics matches number of bytes written/read by caller.
        EXACT,

        // Statistics values reported are often greater than number of bytes
        // written/read by caller because of hidden underlying operations
        // involving check-summing.
        GREATER_OR_EQUAL,

        // We skip all FS statistics tests
        IGNORE
    }

    // FS statistics mode of the FS tested by this class. Starts as IGNORE, in which case the
    // read/write helpers make no assertions about the FileSystem.Statistics byte counters.
    FileSystemStatistics statistics = FileSystemStatistics.IGNORE;

    /**
     * Creates a helper whose file operations run against {@code hfs}.
     *
     * <p>The superclass is initialized with a {@code GoogleCloudStorageFileSystem} wrapping an
     * {@code InMemoryGoogleCloudStorage}.
     *
     * @param hfs Hadoop file system that the helper methods operate on.
     * @param ghfsFileSystemDescriptor descriptor supplying the scheme and file system root used
     *     for {@code hfs}.
     * @throws IOException declared by the signature; presumably raised while constructing the
     *     in-memory file system (confirm against the superclass).
     */
    public HadoopFileSystemIntegrationHelper(FileSystem hfs, FileSystemDescriptor ghfsFileSystemDescriptor)
            throws IOException {
        super(new GoogleCloudStorageFileSystem(new InMemoryGoogleCloudStorage()));
        this.ghfs = hfs;
        this.ghfsFileSystemDescriptor = ghfsFileSystemDescriptor;
    }

    /** Turn off statistics collection. */
    public void setIgnoreStatistics() {
        statistics = FileSystemStatistics.IGNORE;

    /** Renames src path to dst path. */
    protected boolean rename(URI src, URI dst) throws IOException {
        Path srcHadoopPath = castAsHadoopPath(src);
        Path dstHadoopPath = castAsHadoopPath(dst);

        return ghfs.rename(srcHadoopPath, dstHadoopPath);

    /** Deletes the given path. */
    protected boolean delete(URI path, boolean recursive) throws IOException {
        Path hadoopPath = castAsHadoopPath(path);
        if (recursive) {
            // Allows delete(URI) to be covered by test.
            // Note that delete(URI) calls delete(URI, true).
            return ghfs.delete(hadoopPath);
        } else {
            return ghfs.delete(hadoopPath, recursive);

    /** Creates the given directory and any non-existent parent directories. */
    protected boolean mkdirs(URI path) throws IOException {
        Path hadoopPath = castAsHadoopPath(path);
        return ghfs.mkdirs(hadoopPath);

    /** Indicates whether the given path exists. */
    protected boolean exists(URI path) throws IOException {
        Path hadoopPath = castAsHadoopPath(path);
        try {
            return true;
        } catch (FileNotFoundException e) {
            return false;

    /** Indicates whether the given path is a directory. */
    protected boolean isDirectory(URI path) throws IOException {
        Path hadoopPath = castAsHadoopPath(path);
        try {
            FileStatus status = ghfs.getFileStatus(hadoopPath);
            return status.isDir();
        } catch (FileNotFoundException e) {
            return false;

    /** Opens the given object for reading. */
    protected SeekableByteChannel open(String bucketName, String objectName) throws IOException {
        return null;

    /** Opens the given object for writing. */
    protected WritableByteChannel create(String bucketName, String objectName, CreateFileOptions options)
            throws IOException {
        return null;

    /**
     * Writes a file with the given buffer repeated numWrites times.
     *
     * @param bucketName Name of the bucket to create object in.
     * @param objectName Name of the object to create.
     * @param buffer Data to write
     * @param numWrites Number of times to repeat the data.
     * @return Number of bytes written.
     */
    protected int writeFile(String bucketName, String objectName, ByteBuffer buffer, int numWrites)
            throws IOException {
        Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, objectName);
        return writeFile(hadoopPath, buffer, numWrites, true);

    /** Helper which reads the entire file as a String. */
    protected String readTextFile(String bucketName, String objectName) throws IOException {
        Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, objectName);
        return readTextFile(hadoopPath);

    /** Helper which reads the entire file as a String. */
    protected String readTextFile(Path hadoopPath) throws IOException {
        FSDataInputStream readStream = null;
        byte[] readBuffer = new byte[1024];
        StringBuffer returnBuffer = new StringBuffer();

        try {
            readStream =, GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT);
            int numBytesRead =;
            while (numBytesRead > 0) {
                returnBuffer.append(new String(readBuffer, 0, numBytesRead, StandardCharsets.UTF_8));
                numBytesRead =;
        } finally {
            if (readStream != null) {
        return returnBuffer.toString();

    /**
     * Helper that reads text from the given bucket+object at the given offset
     * and returns it. If checkOverflow is true, it will make sure that
     * no more than 'len' bytes were read.
     */
    protected String readTextFile(String bucketName, String objectName, int offset, int len, boolean checkOverflow)
            throws IOException {
        Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, objectName);
        return readTextFile(hadoopPath, offset, len, checkOverflow);

    /**
     * Helper that reads text from the given file at the given offset
     * and returns it. If checkOverflow is true, it will make sure that
     * no more than 'len' bytes were read.
     */
    protected String readTextFile(Path hadoopPath, int offset, int len, boolean checkOverflow) throws IOException {
        String text = null;
        FSDataInputStream readStream = null;
        long fileSystemBytesRead = 0;
        FileSystem.Statistics stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(),
        if (stats != null) {
            // Let it be null in case no stats have been added for our scheme yet.
            fileSystemBytesRead = stats.getBytesRead();

        try {
            int bufferSize = len;
            bufferSize += checkOverflow ? 1 : 0;
            byte[] readBuffer = new byte[bufferSize];
            readStream =, GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT);
            int numBytesRead;
            if (offset > 0) {
                numBytesRead =, readBuffer, 0, bufferSize);
            } else {
                numBytesRead =;
            Assert.assertEquals(len, numBytesRead);
            text = new String(readBuffer, 0, numBytesRead, StandardCharsets.UTF_8);
        } finally {
            if (readStream != null) {

        // After the read, the stats better be non-null for our ghfs scheme.
        stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
        long endFileSystemBytesRead = stats.getBytesRead();
        int bytesReadStats = (int) (endFileSystemBytesRead - fileSystemBytesRead);
        if (statistics == FileSystemStatistics.EXACT) {
            Assert.assertEquals(String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
                    len, bytesReadStats);
        } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
            Assert.assertTrue(String.format("Expected %d <= %d", len, bytesReadStats), len <= bytesReadStats);
        } else if (statistics == FileSystemStatistics.NONE) {
            Assert.assertEquals("FS statistics expected to be 0", 0, fileSystemBytesRead);
            Assert.assertEquals("FS statistics expected to be 0", 0, endFileSystemBytesRead);
        } else if (statistics == FileSystemStatistics.IGNORE) {
            // NO-OP

        return text;

    /** Creates a directory. */
    protected void mkdir(String bucketName, String objectName) throws IOException {
        Path path = createSchemeCompatibleHadoopPath(bucketName, objectName);

    /** Creates a directory. */
    protected void mkdir(String bucketName) throws IOException {
        Path path = createSchemeCompatibleHadoopPath(bucketName, null);

    /** Deletes the given item. */
    protected void delete(String bucketName) throws IOException {
        Path path = createSchemeCompatibleHadoopPath(bucketName, null);
        ghfs.delete(path, false);

    /** Deletes the given object. */
    protected void delete(String bucketName, String objectName) throws IOException {
        Path path = createSchemeCompatibleHadoopPath(bucketName, objectName);
        ghfs.delete(path, false);

    /** Deletes all objects from the given bucket. */
    protected void clearBucket(String bucketName) throws IOException {
        Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, null);
        FileStatus[] statusList = null;
        try {
            // Hadoop1 returns null on listStatus FileNotFound, Hadoop2 throws:
            statusList = ghfs.listStatus(hadoopPath);
        } catch (IOException ioe) {
            // Ignored.

        if (statusList != null) {
            for (FileStatus status : statusList) {
                if (!ghfs.delete(status.getPath(), true)) {
                    System.err.println(String.format("Failed to delete path: '%s'", status.getPath()));

    // -----------------------------------------------------------------
    // Overridable methods added by this class.
    // -----------------------------------------------------------------

    /**
     * Gets a Hadoop path using bucketName and objectName as components of a GCS URI, then casting
     * to a no-authority Hadoop path which follows the scheme indicated by the
     * ghfsFileSystemDescriptor.
     */
    protected Path createSchemeCompatibleHadoopPath(String bucketName, String objectName) {
        URI gcsPath = getPath(bucketName, objectName);
        return castAsHadoopPath(gcsPath);

    /**
     * Synthesizes a Hadoop path for the given GCS path by casting straight into the scheme indicated
     * by the ghfsFileSystemDescriptor instance; if the URI contains an 'authority', the authority
     * is re-interpreted as the topmost path component of a URI sitting inside the fileSystemRoot
     * indicated by the ghfsFileSystemDescriptor.
     * <p>
     * Examples:
     *   gs:/// -> gsg:/
     *   gs://foo/bar -> gs://root-bucket/foo/bar
     *   gs://foo/bar -> hdfs:/foo/bar
     * <p>
     * Note that it cannot be generally assumed that GCS-based filesystems will "invert" this path
     * back into the same GCS path internally; for example, if a bucket-rooted filesystem is based
     * in 'my-system-bucket', then this method will convert:
     * <p>
     *   gs://foo/bar -> gs:/foo/bar
     * <p>
     * which will then be converted internally:
     * <p>
     *   gs:/foo/bar -> gs://my-system-bucket/foo/bar
     * <p>
     * when the bucket-rooted FileSystem creates actual data in the underlying GcsFs.
     */
    protected Path castAsHadoopPath(URI gcsPath) {
        String childPath = gcsPath.getRawPath();
        if (childPath != null && childPath.startsWith("/")) {
            childPath = childPath.substring(1);
        String authority = gcsPath.getAuthority();
        if (Strings.isNullOrEmpty(authority)) {
            if (Strings.isNullOrEmpty(childPath)) {
                return ghfsFileSystemDescriptor.getFileSystemRoot();
            } else {
                return new Path(ghfsFileSystemDescriptor.getFileSystemRoot(), childPath);
        } else {
            if (Strings.isNullOrEmpty(childPath)) {
                return new Path(ghfsFileSystemDescriptor.getFileSystemRoot(), authority);
            } else {
                return new Path(ghfsFileSystemDescriptor.getFileSystemRoot(), new Path(authority, childPath));

    /** Lists status of file(s) at the given path. */
    protected FileStatus[] listStatus(Path hadoopPath) throws IOException {
        return ghfs.listStatus(hadoopPath);

    // -----------------------------------------------------------------
    // Misc test helpers.
    // -----------------------------------------------------------------

    /**
     * Writes a file with the given text repeated numWrites times.
     *
     * @param hadoopPath Path of the file to create.
     * @param text Text data to write.
     * @param numWrites Number of times to repeat the data.
     * @param overwrite If true, overwrite any existing file.
     * @return Number of bytes written.
     */
    public int writeFile(Path hadoopPath, String text, int numWrites, boolean overwrite) throws IOException {
        return writeFile(hadoopPath, ByteBuffer.wrap(text.getBytes("UTF-8")), numWrites, overwrite);

    /**
     * Writes a file with the given buffer repeated numWrites times.
     *
     * @param hadoopPath Path of the file to create.
     * @param buffer Data to write.
     * @param numWrites Number of times to repeat the data.
     * @param overwrite If true, overwrite any existing file.
     * @return Number of bytes written.
     */
    public int writeFile(Path hadoopPath, ByteBuffer buffer, int numWrites, boolean overwrite) throws IOException {
        int numBytesWritten = -1;
        int totalBytesWritten = 0;

        long fileSystemBytesWritten = 0;
        FileSystem.Statistics stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(),
        if (stats != null) {
            // Let it be null in case no stats have been added for our scheme yet.
            fileSystemBytesWritten = stats.getBytesWritten();
        FSDataOutputStream writeStream = null;
        boolean allWritesSucceeded = false;

        try {
            writeStream = ghfs.create(hadoopPath, FsPermission.getDefault(), overwrite,
                    GoogleHadoopFileSystemBase.BLOCK_SIZE_DEFAULT, null); // progressable

            for (int i = 0; i < numWrites; i++) {
                writeStream.write(buffer.array(), 0, buffer.capacity());
                numBytesWritten = buffer.capacity();
                totalBytesWritten += numBytesWritten;
            allWritesSucceeded = true;
        } finally {
            if (writeStream != null) {
                try {
                } catch (IOException e) {
                    // Ignore IO exceptions while closing if write failed otherwise the
                    // exception that caused the write to fail gets superseded.
                    // On the other hand, if all writes succeeded then we need to know about the exception
                    // that was thrown during closing.
                    if (allWritesSucceeded) {
                        throw e;

        // After the write, the stats better be non-null for our ghfs scheme.
        stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
        long endFileSystemBytesWritten = stats.getBytesWritten();
        int bytesWrittenStats = (int) (endFileSystemBytesWritten - fileSystemBytesWritten);
        if (statistics == FileSystemStatistics.EXACT) {
            Assert.assertEquals(String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
                    totalBytesWritten, bytesWrittenStats);
        } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
            Assert.assertTrue(String.format("Expected %d <= %d", totalBytesWritten, bytesWrittenStats),
                    totalBytesWritten <= bytesWrittenStats);
        } else if (statistics == FileSystemStatistics.NONE) {
            // Do not perform any check because stats are either not maintained or are erratic.
        } else if (statistics == FileSystemStatistics.IGNORE) {
            // NO-OP

        return totalBytesWritten;

    public URI getPath(String bucketName, String objectName, boolean allowEmpty) {
        return gcsfs.getPathCodec().getPath(bucketName, objectName, allowEmpty);