Java tutorial: HadoopFileSystem, the Apache Beam adapter for Hadoop file systems

This tutorial walks through the source of org.apache.beam.sdk.io.hdfs.HadoopFileSystem, which adapts Hadoop FileSystem connectors (HDFS and other Hadoop-compatible file systems) so they can be used as Beam FileSystems. The source of the class follows.

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.hdfs;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.file.FileAlreadyExistsException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.beam.sdk.io.FileSystem;
import org.apache.beam.sdk.io.fs.CreateOptions;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.io.fs.MatchResult.Status;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Adapts {@link org.apache.hadoop.fs.FileSystem} connectors to be used as Apache Beam {@link
 * FileSystem FileSystems}.
 *
 * <p>The following HDFS FileSystems are known to be unsupported:
 *
 * <ul>
 *   <li>FTPFileSystem: missing seek support within FTPInputStream
 * </ul>
 *
 * <p>This implementation assumes that the underlying Hadoop {@link FileSystem} is seek efficient
 * when reading. The following {@link FSInputStream} implementations (as of Hadoop 2.7.1) provide
 * seek implementations in their source code:
 *
 * <ul>
 *   <li>HarFsInputStream
 *   <li>S3InputStream
 *   <li>DFSInputStream
 *   <li>SwiftNativeInputStream
 *   <li>NativeS3FsInputStream
 *   <li>LocalFSFileInputStream
 *   <li>NativeAzureFsInputStream
 *   <li>S3AInputStream
 * </ul>
 */
class HadoopFileSystem extends FileSystem<HadoopResourceId> {
  private static final Logger LOG = LoggerFactory.getLogger(HadoopFileSystem.class);

  @VisibleForTesting static final String LOG_CREATE_DIRECTORY = "Creating directory %s";
  @VisibleForTesting static final String LOG_DELETING_EXISTING_FILE = "Deleting existing file %s";

  private final String scheme;
  @VisibleForTesting final Configuration configuration;

  HadoopFileSystem(String scheme, Configuration configuration) {
    this.scheme = scheme;
    this.configuration = configuration;
  }
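
  // Tutorial note: instances of this class are not constructed directly in pipeline code. They
  // are typically created by the HadoopFileSystemRegistrar in this package, which reads the
  // Hadoop Configuration(s) supplied via HadoopFileSystemOptions and registers one
  // HadoopFileSystem per supported scheme (for example "hdfs").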

  @Override
  protected List<MatchResult> match(List<String> specs) {
    ImmutableList.Builder<MatchResult> resultsBuilder = ImmutableList.builder();
    for (String spec : specs) {
      try {
        final Set<Metadata> metadata = new HashSet<>();
        if (spec.contains("**")) {
          // recursive glob
          int index = spec.indexOf("**");
          metadata.addAll(
              matchRecursiveGlob(spec.substring(0, index + 1), spec.substring(index + 1)));
        } else {
          // normal glob
          final Path path = new Path(spec);
          final FileStatus[] fileStatuses = path.getFileSystem(configuration).globStatus(path);
          if (fileStatuses != null) {
            for (FileStatus fileStatus : fileStatuses) {
              metadata.add(toMetadata(fileStatus));
            }
          }
        }
        if (metadata.isEmpty()) {
          resultsBuilder.add(MatchResult.create(Status.NOT_FOUND, Collections.emptyList()));
        } else {
          resultsBuilder.add(MatchResult.create(Status.OK, new ArrayList<>(metadata)));
        }
      } catch (IOException e) {
        resultsBuilder.add(MatchResult.create(Status.ERROR, e));
      }
    }
    return resultsBuilder.build();
  }

  private Set<Metadata> matchRecursiveGlob(String directorySpec, String fileSpec)
      throws IOException {
    final org.apache.hadoop.fs.FileSystem fs =
        new Path(directorySpec).getFileSystem(configuration);
    Set<Metadata> metadata = new HashSet<>();
    if (directorySpec.contains("*")) {
      // An abstract directory with a wildcard is converted to concrete directories to search.
      FileStatus[] directoryStatuses = fs.globStatus(new Path(directorySpec));
      for (FileStatus directoryStatus : directoryStatuses) {
        if (directoryStatus.isDirectory()) {
          metadata.addAll(
              matchRecursiveGlob(directoryStatus.getPath().toUri().toString(), fileSpec));
        }
      }
    } else {
      // A concrete directory is searched.
      FileStatus[] fileStatuses = fs.globStatus(new Path(directorySpec + "/" + fileSpec));
      for (FileStatus fileStatus : fileStatuses) {
        if (fileStatus.isFile()) {
          metadata.add(toMetadata(fileStatus));
        }
      }

      // All sub-directories of a concrete directory are searched.
      FileStatus[] directoryStatuses = fs.globStatus(new Path(directorySpec + "/*"));
      for (FileStatus directoryStatus : directoryStatuses) {
        if (directoryStatus.isDirectory()) {
          metadata.addAll(
              matchRecursiveGlob(directoryStatus.getPath().toUri().toString(), fileSpec));
        }
      }

      // Handle additional instances of recursive globs.
      if (fileSpec.contains("**")) {
        int index = fileSpec.indexOf("**");
        metadata.addAll(
            matchRecursiveGlob(
                directorySpec + "/" + fileSpec.substring(0, index + 1),
                fileSpec.substring(index + 1)));
      }
    }
    return metadata;
  }
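
  // Worked example of the recursive-glob handling above: a spec such as "hdfs://data/**/part-*"
  // is cut at the first "**" into the directory spec "hdfs://data/*" and the file spec
  // "*/part-*". The wildcard directory spec is first expanded into concrete directories; each
  // concrete directory is then globbed for files matching the file spec, and all of its
  // sub-directories are searched recursively with the same file spec.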

  private Metadata toMetadata(FileStatus fileStatus) {
    URI uri = dropEmptyAuthority(fileStatus.getPath().toUri().toString());
    return Metadata.builder()
        .setResourceId(new HadoopResourceId(uri))
        .setIsReadSeekEfficient(true)
        .setSizeBytes(fileStatus.getLen())
        .setLastModifiedMillis(fileStatus.getModificationTime())
        .build();
  }

  @Override
  protected WritableByteChannel create(HadoopResourceId resourceId, CreateOptions createOptions)
      throws IOException {
    return Channels.newChannel(
        resourceId.toPath().getFileSystem(configuration).create(resourceId.toPath()));
  }
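
  // Tutorial note: create() delegates to Hadoop's FileSystem#create(Path), which overwrites an
  // existing file at the destination by default. The CreateOptions argument (for example, the
  // MIME type) is not consulted by this implementation.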

  @Override
  protected ReadableByteChannel open(HadoopResourceId resourceId) throws IOException {
    final org.apache.hadoop.fs.FileSystem fs = resourceId.toPath().getFileSystem(configuration);
    final FileStatus fileStatus = fs.getFileStatus(resourceId.toPath());
    return new HadoopSeekableByteChannel(fileStatus, fs.open(resourceId.toPath()));
  }

  @Override
  protected void copy(
      List<HadoopResourceId> srcResourceIds, List<HadoopResourceId> destResourceIds)
      throws IOException {
    for (int i = 0; i < srcResourceIds.size(); ++i) {
      // This enforces that the source and destination file systems match.
      final org.apache.hadoop.fs.FileSystem fs =
          srcResourceIds.get(i).toPath().getFileSystem(configuration);
      // Unfortunately Hadoop FileSystems don't support a native copy operation, so we are forced
      // to use the inefficient implementation found in FileUtil, which copies all the bytes
      // through the local machine.
      //
      // The Hadoop FileSystem API does define a concat method, but only DFSFileSystem appears to
      // implement it, and it deletes the sources afterwards, which is not what we want. Also, all
      // the other FileSystem implementations we looked at threw UnsupportedOperationException
      // within concat.
      final boolean success =
          FileUtil.copy(
              fs,
              srcResourceIds.get(i).toPath(),
              fs,
              destResourceIds.get(i).toPath(),
              false, // do not delete the source
              true, // overwrite the destination
              fs.getConf());
      if (!success) {
        // Defensive coding, as this should not happen in practice.
        throw new IOException(
            String.format(
                "Unable to copy resource %s to %s. No further information provided by underlying filesystem.",
                srcResourceIds.get(i).toPath(), destResourceIds.get(i).toPath()));
      }
    }
  }

  /**
   * Renames a {@link List} of file-like resources from one location to another.
   *
   * <p>The number of source resources must equal the number of destination resources. Destination
   * resources will be created recursively.
   *
   * @param srcResourceIds the references of the source resources
   * @param destResourceIds the references of the destination resources
   * @throws FileNotFoundException if the source resources are missing. When rename throws, the
   *     state of the resources is unknown but safe: for every (source, destination) pair of
   *     resources, the following are possible: a) source exists, b) destination exists, c) source
   *     and destination both exist. Thus no data is lost; however, duplicated resources are
   *     possible. In such scenarios, callers can use {@code match()} to determine the state of
   *     the resources.
   * @throws FileAlreadyExistsException if a target resource already exists and couldn't be
   *     overwritten.
   * @throws IOException if the underlying filesystem indicates the rename was not performed but
   *     no other errors were thrown.
   */
  @Override
  protected void rename(
      List<HadoopResourceId> srcResourceIds, List<HadoopResourceId> destResourceIds)
      throws IOException {
    for (int i = 0; i < srcResourceIds.size(); ++i) {
      final Path srcPath = srcResourceIds.get(i).toPath();
      final Path destPath = destResourceIds.get(i).toPath();

      // This enforces that the source and destination file systems match.
      final org.apache.hadoop.fs.FileSystem fs = srcPath.getFileSystem(configuration);

      // rename in HDFS requires the target directory to exist or it silently fails (BEAM-4861).
      mkdirs(destPath);

      boolean success = fs.rename(srcPath, destPath);

      // If the failure was due to the file already existing, delete and retry (BEAM-5036).
      // This should be the exceptional case, so handle it here rather than incur the overhead of
      // testing first.
      if (!success && fs.exists(srcPath) && fs.exists(destPath)) {
        LOG.debug(
            String.format(
                LOG_DELETING_EXISTING_FILE, Path.getPathWithoutSchemeAndAuthority(destPath)));
        fs.delete(destPath, false); // not recursive
        success = fs.rename(srcPath, destPath);
      }

      if (!success) {
        if (!fs.exists(srcPath)) {
          throw new FileNotFoundException(
              String.format(
                  "Unable to rename resource %s to %s as source not found.", srcPath, destPath));
        } else if (fs.exists(destPath)) {
          throw new FileAlreadyExistsException(
              String.format(
                  "Unable to rename resource %s to %s as destination already exists and couldn't be deleted.",
                  srcPath, destPath));
        } else {
          throw new IOException(
              String.format(
                  "Unable to rename resource %s to %s. No further information provided by underlying filesystem.",
                  srcPath, destPath));
        }
      }
    }
  }

  /** Ensures that the target directory exists for the given filePath. */
  private void mkdirs(Path filePath) throws IOException {
    final org.apache.hadoop.fs.FileSystem fs = filePath.getFileSystem(configuration);
    final Path targetDirectory = filePath.getParent();
    if (!fs.exists(targetDirectory)) {
      LOG.debug(
          String.format(
              LOG_CREATE_DIRECTORY, Path.getPathWithoutSchemeAndAuthority(targetDirectory)));
      if (!fs.mkdirs(targetDirectory)) {
        throw new IOException(
            String.format(
                "Unable to create target directory %s. No further information provided by underlying filesystem.",
                targetDirectory));
      }
    }
  }

  @Override
  protected void delete(Collection<HadoopResourceId> resourceIds) throws IOException {
    for (HadoopResourceId resourceId : resourceIds) {
      // Ignore the response, as issues are surfaced with an exception.
      final Path resourcePath = resourceId.toPath();
      resourcePath.getFileSystem(configuration).delete(resourcePath, false);
    }
  }

  @Override
  protected HadoopResourceId matchNewResource(String singleResourceSpec, boolean isDirectory) {
    if (singleResourceSpec.endsWith("/") && !isDirectory) {
      throw new IllegalArgumentException(
          String.format("Expected file path but received directory path %s", singleResourceSpec));
    }
    return !singleResourceSpec.endsWith("/") && isDirectory
        ? new HadoopResourceId(dropEmptyAuthority(singleResourceSpec + "/"))
        : new HadoopResourceId(dropEmptyAuthority(singleResourceSpec));
  }
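
  // Example (placeholder paths): matchNewResource("hdfs://namenode/tmp/output", true) yields a
  // resource id for the directory "hdfs://namenode/tmp/output/" (a trailing slash is appended),
  // while matchNewResource("hdfs://namenode/tmp/output/part-0", false) yields a resource id for
  // that file. A spec ending in "/" combined with isDirectory == false is rejected with an
  // IllegalArgumentException.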

  @Override
  protected String getScheme() {
    return scheme;
  }

  /** An adapter around {@link FSDataInputStream} that implements {@link SeekableByteChannel}. */
  private static class HadoopSeekableByteChannel implements SeekableByteChannel {
    private final FileStatus fileStatus;
    private final FSDataInputStream inputStream;
    private boolean closed;

    private HadoopSeekableByteChannel(FileStatus fileStatus, FSDataInputStream inputStream) {
      this.fileStatus = fileStatus;
      this.inputStream = inputStream;
      this.closed = false;
    }

    @Override
    public int read(ByteBuffer dst) throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      // Zero-length reads must be supported.
      int read = 0;
      // We avoid using the ByteBuffer-based read for Hadoop because some FSDataInputStream
      // implementations are not ByteBufferReadable.
      // See https://issues.apache.org/jira/browse/HADOOP-14603
      if (dst.hasArray()) {
        // This does the same as inputStream.read(dst): it stores up to dst.remaining() bytes into
        // dst.array() starting at dst.position(). But dst can have an offset within its backing
        // array, hence the + dst.arrayOffset().
        read = inputStream.read(dst.array(), dst.position() + dst.arrayOffset(), dst.remaining());
      } else {
        // TODO: Add support for off-heap ByteBuffers in case the underlying FSDataInputStream
        // does not support reading from a ByteBuffer.
        read = inputStream.read(dst);
      }
      if (read > 0) {
        dst.position(dst.position() + read);
      }
      return read;
    }

    @Override
    public int write(ByteBuffer src) {
      throw new UnsupportedOperationException();
    }

    @Override
    public long position() throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      return inputStream.getPos();
    }

    @Override
    public SeekableByteChannel position(long newPosition) throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      inputStream.seek(newPosition);
      return this;
    }

    @Override
    public long size() throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      return fileStatus.getLen();
    }

    @Override
    public SeekableByteChannel truncate(long size) {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean isOpen() {
      return !closed;
    }

    @Override
    public void close() throws IOException {
      closed = true;
      inputStream.close();
    }
  }

  private static URI dropEmptyAuthority(String uriStr) {
    URI uri = URI.create(uriStr);
    String prefix = uri.getScheme() + ":///";
    if (uriStr.startsWith(prefix)) {
      return URI.create(uri.getScheme() + ":/" + uriStr.substring(prefix.length()));
    } else {
      return uri;
    }
  }
}
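
To see how the adapter is reached from user code, the sketch below registers a Hadoop configuration through HadoopFileSystemOptions and then matches and reads files through Beam's public FileSystems API, which dispatches hdfs:// paths to the HadoopFileSystem shown above. This is a minimal illustration rather than part of the class: the class name HadoopFileSystemExample, the namenode address, and the glob are placeholders, and it assumes the beam-sdks-java-io-hadoop-file-system module is on the classpath and an HDFS cluster is reachable.

import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.util.Collections;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.hdfs.HadoopFileSystemOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.hadoop.conf.Configuration;

public class HadoopFileSystemExample {
  public static void main(String[] args) throws Exception {
    // Point Hadoop at the cluster; "hdfs://namenode:8020" is a placeholder address.
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://namenode:8020");

    // Register the Hadoop configuration so FileSystems resolves the "hdfs" scheme to the
    // HadoopFileSystem adapter.
    HadoopFileSystemOptions options = PipelineOptionsFactory.as(HadoopFileSystemOptions.class);
    options.setHdfsConfiguration(Collections.singletonList(conf));
    FileSystems.setDefaultPipelineOptions(options);

    // Glob for files; this exercises match() and matchRecursiveGlob() above.
    MatchResult result = FileSystems.match("hdfs://namenode:8020/data/**/part-*");
    for (MatchResult.Metadata metadata : result.metadata()) {
      // Open a seekable channel; this exercises open() and HadoopSeekableByteChannel.
      try (ReadableByteChannel channel = FileSystems.open(metadata.resourceId())) {
        ByteBuffer buffer = ByteBuffer.allocate(64 * 1024);
        while (channel.read(buffer) > 0) {
          buffer.clear(); // a real caller would process the bytes here
        }
      }
    }
  }
}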