Java tutorial
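This walkthrough looks at HadoopFileSystemIntegrationHelper, a test helper from Google's Cloud Storage connector for Hadoop (package com.google.cloud.hadoop.fs.gcs). It adapts the GCS-level integration helper to the Hadoop FileSystem API and adds utilities for reading and writing files and for verifying FileSystem.Statistics byte counts. The full source follows; a short usage sketch comes after it.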
/**
 * Copyright 2015 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.hadoop.fs.gcs;

import com.google.cloud.hadoop.gcsio.CreateFileOptions;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystem;
import com.google.cloud.hadoop.gcsio.GoogleCloudStorageFileSystemIntegrationHelper;
import com.google.cloud.hadoop.gcsio.testing.InMemoryGoogleCloudStorage;
import com.google.common.base.Strings;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.channels.SeekableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.junit.Assert;

public class HadoopFileSystemIntegrationHelper
    extends GoogleCloudStorageFileSystemIntegrationHelper {

  FileSystem ghfs;
  FileSystemDescriptor ghfsFileSystemDescriptor;

  /**
   * FS statistics mode.
   */
  public enum FileSystemStatistics {
    // No statistics available.
    NONE,

    // Statistics match the number of bytes written/read by the caller.
    EXACT,

    // Statistics values reported are often greater than the number of bytes
    // written/read by the caller because of hidden underlying operations
    // involving check-summing.
    GREATER_OR_EQUAL,

    // We skip all FS statistics tests.
    IGNORE,
  }

  // FS statistics mode of the FS tested by this class.
  FileSystemStatistics statistics = FileSystemStatistics.IGNORE;

  public HadoopFileSystemIntegrationHelper(
      FileSystem hfs, FileSystemDescriptor ghfsFileSystemDescriptor)
      throws IOException {
    super(new GoogleCloudStorageFileSystem(new InMemoryGoogleCloudStorage()));

    this.ghfs = hfs;
    this.ghfsFileSystemDescriptor = ghfsFileSystemDescriptor;
  }

  /**
   * Turns off statistics collection.
   */
  public void setIgnoreStatistics() {
    statistics = FileSystemStatistics.IGNORE;
  }

  /**
   * Renames src path to dst path.
   */
  @Override
  protected boolean rename(URI src, URI dst) throws IOException {
    Path srcHadoopPath = castAsHadoopPath(src);
    Path dstHadoopPath = castAsHadoopPath(dst);
    return ghfs.rename(srcHadoopPath, dstHadoopPath);
  }

  /**
   * Deletes the given path.
   */
  @Override
  protected boolean delete(URI path, boolean recursive) throws IOException {
    Path hadoopPath = castAsHadoopPath(path);
    if (recursive) {
      // Allows the single-argument delete(Path) to be covered by a test.
      // Note that delete(Path) calls delete(Path, true).
      return ghfs.delete(hadoopPath);
    } else {
      return ghfs.delete(hadoopPath, recursive);
    }
  }

  /**
   * Creates the given directory and any non-existent parent directories.
   */
  @Override
  protected boolean mkdirs(URI path) throws IOException {
    Path hadoopPath = castAsHadoopPath(path);
    return ghfs.mkdirs(hadoopPath);
  }

  /**
   * Indicates whether the given path exists.
   */
  @Override
  protected boolean exists(URI path) throws IOException {
    Path hadoopPath = castAsHadoopPath(path);
    try {
      ghfs.getFileStatus(hadoopPath);
      return true;
    } catch (FileNotFoundException e) {
      return false;
    }
  }

  /**
   * Indicates whether the given path is a directory.
   */
  @Override
  protected boolean isDirectory(URI path) throws IOException {
    Path hadoopPath = castAsHadoopPath(path);
    try {
      FileStatus status = ghfs.getFileStatus(hadoopPath);
      return status.isDir();
    } catch (FileNotFoundException e) {
      return false;
    }
  }

  /**
   * Opens the given object for reading.
   *
   * <p>Not implemented by this Hadoop-based helper; returns null.
   */
  @Override
  protected SeekableByteChannel open(String bucketName, String objectName)
      throws IOException {
    return null;
  }

  /**
   * Opens the given object for writing.
   *
   * <p>Not implemented by this Hadoop-based helper; returns null.
   */
  @Override
  protected WritableByteChannel create(
      String bucketName, String objectName, CreateFileOptions options)
      throws IOException {
    return null;
  }

  /**
   * Writes a file with the given buffer repeated numWrites times.
   *
   * @param bucketName Name of the bucket to create the object in.
   * @param objectName Name of the object to create.
   * @param buffer Data to write.
   * @param numWrites Number of times to repeat the data.
   * @return Number of bytes written.
   */
  @Override
  protected int writeFile(String bucketName, String objectName, ByteBuffer buffer, int numWrites)
      throws IOException {
    Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, objectName);
    return writeFile(hadoopPath, buffer, numWrites, true);
  }

  /**
   * Helper which reads the entire file as a String.
   */
  @Override
  protected String readTextFile(String bucketName, String objectName) throws IOException {
    Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, objectName);
    return readTextFile(hadoopPath);
  }

  /**
   * Helper which reads the entire file as a String.
   */
  protected String readTextFile(Path hadoopPath) throws IOException {
    FSDataInputStream readStream = null;
    byte[] readBuffer = new byte[1024];
    StringBuilder returnBuffer = new StringBuilder();

    try {
      readStream = ghfs.open(hadoopPath, GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT);
      int numBytesRead = readStream.read(readBuffer);
      while (numBytesRead > 0) {
        returnBuffer.append(new String(readBuffer, 0, numBytesRead, StandardCharsets.UTF_8));
        numBytesRead = readStream.read(readBuffer);
      }
    } finally {
      if (readStream != null) {
        readStream.close();
      }
    }
    return returnBuffer.toString();
  }

  /**
   * Helper that reads text from the given bucket+object at the given offset
   * and returns it. If checkOverflow is true, it will make sure that
   * no more than 'len' bytes were read.
   */
  @Override
  protected String readTextFile(
      String bucketName, String objectName, int offset, int len, boolean checkOverflow)
      throws IOException {
    Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, objectName);
    return readTextFile(hadoopPath, offset, len, checkOverflow);
  }

  /**
   * Helper that reads text from the given file at the given offset
   * and returns it. If checkOverflow is true, it will make sure that
   * no more than 'len' bytes were read.
   */
  protected String readTextFile(Path hadoopPath, int offset, int len, boolean checkOverflow)
      throws IOException {
    String text = null;
    FSDataInputStream readStream = null;
    long fileSystemBytesRead = 0;
    FileSystem.Statistics stats =
        FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
    if (stats != null) {
      // Let it be null in case no stats have been added for our scheme yet.
      fileSystemBytesRead = stats.getBytesRead();
    }

    try {
      // Read one extra byte when checking for overflow so that reading
      // past 'len' bytes can be detected.
      int bufferSize = len + (checkOverflow ? 1 : 0);
      byte[] readBuffer = new byte[bufferSize];
      readStream = ghfs.open(hadoopPath, GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT);
      int numBytesRead;
      if (offset > 0) {
        numBytesRead = readStream.read(offset, readBuffer, 0, bufferSize);
      } else {
        numBytesRead = readStream.read(readBuffer);
      }
      Assert.assertEquals(len, numBytesRead);
      text = new String(readBuffer, 0, numBytesRead, StandardCharsets.UTF_8);
    } finally {
      if (readStream != null) {
        readStream.close();
      }
    }

    // After the read, the stats better be non-null for our ghfs scheme.
    stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
    Assert.assertNotNull(stats);
    long endFileSystemBytesRead = stats.getBytesRead();
    int bytesReadStats = (int) (endFileSystemBytesRead - fileSystemBytesRead);

    if (statistics == FileSystemStatistics.EXACT) {
      Assert.assertEquals(
          String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
          len, bytesReadStats);
    } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
      Assert.assertTrue(
          String.format("Expected %d <= %d", len, bytesReadStats), len <= bytesReadStats);
    } else if (statistics == FileSystemStatistics.NONE) {
      Assert.assertEquals("FS statistics expected to be 0", 0, fileSystemBytesRead);
      Assert.assertEquals("FS statistics expected to be 0", 0, endFileSystemBytesRead);
    } else if (statistics == FileSystemStatistics.IGNORE) {
      // No-op.
    }

    return text;
  }

  /**
   * Creates a directory for the given object.
   */
  @Override
  protected void mkdir(String bucketName, String objectName) throws IOException {
    Path path = createSchemeCompatibleHadoopPath(bucketName, objectName);
    ghfs.mkdirs(path);
  }

  /**
   * Creates a directory for the given bucket.
   */
  @Override
  protected void mkdir(String bucketName) throws IOException {
    Path path = createSchemeCompatibleHadoopPath(bucketName, null);
    ghfs.mkdirs(path);
  }

  /**
   * Deletes the given bucket.
   */
  @Override
  protected void delete(String bucketName) throws IOException {
    Path path = createSchemeCompatibleHadoopPath(bucketName, null);
    ghfs.delete(path, false);
  }

  /**
   * Deletes the given object.
   */
  @Override
  protected void delete(String bucketName, String objectName) throws IOException {
    Path path = createSchemeCompatibleHadoopPath(bucketName, objectName);
    ghfs.delete(path, false);
  }

  /**
   * Deletes all objects from the given bucket.
   */
  @Override
  protected void clearBucket(String bucketName) throws IOException {
    Path hadoopPath = createSchemeCompatibleHadoopPath(bucketName, null);
    FileStatus[] statusList = null;
    try {
      // Hadoop 1 returns null on listStatus FileNotFound, Hadoop 2 throws:
      statusList = ghfs.listStatus(hadoopPath);
    } catch (IOException ioe) {
      // Ignored.
    }
    if (statusList != null) {
      for (FileStatus status : statusList) {
        if (!ghfs.delete(status.getPath(), true)) {
          System.err.println(String.format("Failed to delete path: '%s'", status.getPath()));
        }
      }
    }
  }

  // -----------------------------------------------------------------
  // Overridable methods added by this class.
  // -----------------------------------------------------------------

  /**
   * Gets a Hadoop path using bucketName and objectName as components of a GCS URI, then casts it
   * to a no-authority Hadoop path which follows the scheme indicated by the
   * ghfsFileSystemDescriptor.
   */
  protected Path createSchemeCompatibleHadoopPath(String bucketName, String objectName) {
    URI gcsPath = getPath(bucketName, objectName);
    return castAsHadoopPath(gcsPath);
  }

  /**
   * Synthesizes a Hadoop path for the given GCS path by casting straight into the scheme
   * indicated by the ghfsFileSystemDescriptor instance; if the URI contains an 'authority',
   * the authority is re-interpreted as the topmost path component of a URI sitting inside the
   * fileSystemRoot indicated by the ghfsFileSystemDescriptor.
   * <p>
   * Examples:
   *   gs:/// -> gs:/
   *   gs://foo/bar -> gs://root-bucket/foo/bar
   *   gs://foo/bar -> hdfs:/foo/bar
   * <p>
   * Note that it cannot be generally assumed that GCS-based filesystems will "invert" this path
   * back into the same GCS path internally; for example, if a bucket-rooted filesystem is based
   * in 'my-system-bucket', then this method will convert:
   * <p>
   *   gs://foo/bar -> gs:/foo/bar
   * <p>
   * which will then be converted internally:
   * <p>
   *   gs:/foo/bar -> gs://my-system-bucket/foo/bar
   * <p>
   * when the bucket-rooted FileSystem creates actual data in the underlying GcsFs.
   */
  protected Path castAsHadoopPath(URI gcsPath) {
    String childPath = gcsPath.getRawPath();
    if (childPath != null && childPath.startsWith("/")) {
      childPath = childPath.substring(1);
    }
    String authority = gcsPath.getAuthority();
    if (Strings.isNullOrEmpty(authority)) {
      if (Strings.isNullOrEmpty(childPath)) {
        return ghfsFileSystemDescriptor.getFileSystemRoot();
      } else {
        return new Path(ghfsFileSystemDescriptor.getFileSystemRoot(), childPath);
      }
    } else {
      if (Strings.isNullOrEmpty(childPath)) {
        return new Path(ghfsFileSystemDescriptor.getFileSystemRoot(), authority);
      } else {
        return new Path(
            ghfsFileSystemDescriptor.getFileSystemRoot(), new Path(authority, childPath));
      }
    }
  }
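  // A worked example of the mapping above (hypothetical root value): if
  // ghfsFileSystemDescriptor.getFileSystemRoot() returned gs://root-bucket/,
  // then for URI gs://foo/bar the authority is "foo" and the raw path, minus
  // its leading slash, is "bar", so castAsHadoopPath evaluates to:
  //   new Path(gs://root-bucket/, new Path("foo", "bar"))
  //     -> gs://root-bucket/foo/bar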
  /**
   * Lists status of file(s) at the given path.
   */
  protected FileStatus[] listStatus(Path hadoopPath) throws IOException {
    return ghfs.listStatus(hadoopPath);
  }

  // -----------------------------------------------------------------
  // Misc test helpers.
  // -----------------------------------------------------------------

  /**
   * Writes a file with the given text repeated numWrites times.
   *
   * @param hadoopPath Path of the file to create.
   * @param text Text data to write.
   * @param numWrites Number of times to repeat the data.
   * @param overwrite If true, overwrite any existing file.
   * @return Number of bytes written.
   */
  public int writeFile(Path hadoopPath, String text, int numWrites, boolean overwrite)
      throws IOException {
    return writeFile(
        hadoopPath, ByteBuffer.wrap(text.getBytes(StandardCharsets.UTF_8)), numWrites, overwrite);
  }

  /**
   * Writes a file with the given buffer repeated numWrites times.
   *
   * @param hadoopPath Path of the file to create.
   * @param buffer Data to write.
   * @param numWrites Number of times to repeat the data.
   * @param overwrite If true, overwrite any existing file.
   * @return Number of bytes written.
   */
  public int writeFile(Path hadoopPath, ByteBuffer buffer, int numWrites, boolean overwrite)
      throws IOException {
    int numBytesWritten = -1;
    int totalBytesWritten = 0;

    long fileSystemBytesWritten = 0;
    FileSystem.Statistics stats =
        FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
    if (stats != null) {
      // Let it be null in case no stats have been added for our scheme yet.
      fileSystemBytesWritten = stats.getBytesWritten();
    }

    FSDataOutputStream writeStream = null;
    boolean allWritesSucceeded = false;

    try {
      writeStream = ghfs.create(
          hadoopPath,
          FsPermission.getDefault(),
          overwrite,
          GoogleHadoopFileSystemBase.BUFFERSIZE_DEFAULT,
          GoogleHadoopFileSystemBase.REPLICATION_FACTOR_DEFAULT,
          GoogleHadoopFileSystemBase.BLOCK_SIZE_DEFAULT,
          null);  // progressable

      for (int i = 0; i < numWrites; i++) {
        buffer.clear();
        writeStream.write(buffer.array(), 0, buffer.capacity());
        numBytesWritten = buffer.capacity();
        totalBytesWritten += numBytesWritten;
      }
      allWritesSucceeded = true;
    } finally {
      if (writeStream != null) {
        try {
          writeStream.close();
        } catch (IOException e) {
          // Ignore IO exceptions while closing if the write failed; otherwise the
          // exception that caused the write to fail gets superseded.
          // On the other hand, if all writes succeeded then we need to know about
          // the exception that was thrown during closing.
          if (allWritesSucceeded) {
            throw e;
          }
        }
      }
    }

    // After the write, the stats better be non-null for our ghfs scheme.
    stats = FileSystem.getStatistics(ghfsFileSystemDescriptor.getScheme(), ghfs.getClass());
    Assert.assertNotNull(stats);
    long endFileSystemBytesWritten = stats.getBytesWritten();
    int bytesWrittenStats = (int) (endFileSystemBytesWritten - fileSystemBytesWritten);

    if (statistics == FileSystemStatistics.EXACT) {
      Assert.assertEquals(
          String.format("FS statistics mismatch fetched from class '%s'", ghfs.getClass()),
          totalBytesWritten, bytesWrittenStats);
    } else if (statistics == FileSystemStatistics.GREATER_OR_EQUAL) {
      Assert.assertTrue(
          String.format("Expected %d <= %d", totalBytesWritten, bytesWrittenStats),
          totalBytesWritten <= bytesWrittenStats);
    } else if (statistics == FileSystemStatistics.NONE) {
      // Do not perform any check because stats are either not maintained or are erratic.
    } else if (statistics == FileSystemStatistics.IGNORE) {
      // No-op.
    }

    return totalBytesWritten;
  }

  public URI getPath(String bucketName, String objectName, boolean allowEmpty) {
    return gcsfs.getPathCodec().getPath(bucketName, objectName, allowEmpty);
  }
}
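To see how the helper is typically driven, here is a minimal usage sketch, not a definitive recipe. The wiring is hypothetical: the bucket name is a placeholder, a real run needs project and credential configuration, and passing the same instance for both constructor arguments assumes that GoogleHadoopFileSystem implements FileSystemDescriptor (as GoogleHadoopFileSystemBase does in this era of the connector; verify against your version). The class lives in the connector's package so that the protected readTextFile helper is accessible.

package com.google.cloud.hadoop.fs.gcs;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class HelperUsageExample {
  public static void main(String[] args) throws Exception {
    // Requires fs.gs project/credential settings in a real run.
    GoogleHadoopFileSystem ghfs = new GoogleHadoopFileSystem();
    ghfs.initialize(URI.create("gs://my-test-bucket/"), new Configuration());

    // Same instance serves as FileSystem and FileSystemDescriptor (see caveat above).
    HadoopFileSystemIntegrationHelper helper =
        new HadoopFileSystemIntegrationHelper(ghfs, ghfs);
    helper.setIgnoreStatistics();  // Skip byte-count assertions in this sketch.

    // Write "hello" three times (15 bytes), then read the whole file back.
    Path file = new Path("gs://my-test-bucket/dir/hello.txt");
    int bytesWritten = helper.writeFile(file, "hello", 3, true);
    System.out.println(bytesWritten + ": " + helper.readTextFile(file));
  }
}

Note how setIgnoreStatistics() puts the helper into FileSystemStatistics.IGNORE mode; a test that cares about FileSystem.Statistics accounting would instead leave the statistics field at EXACT or GREATER_OR_EQUAL and let writeFile/readTextFile assert on the byte counters.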