gobblin.util.HadoopUtils.java Source code


Introduction

Here is the source code for gobblin.util.HadoopUtils.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collection;
import java.util.List;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Queue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Queues;
import com.google.common.io.BaseEncoding;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigValue;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.util.deprecation.DeprecationUtils;
import gobblin.util.executors.ScalingThreadPoolExecutor;
import gobblin.writer.DataWriter;

/**
 * A utility class for working with Hadoop.
 */
@Slf4j
public class HadoopUtils {

    public static final String HDFS_ILLEGAL_TOKEN_REGEX = "[\\s:\\\\]";

    /**
     * A {@link Collection} of all known {@link FileSystem} schemes that do not support atomic renames or copies.
     *
     * <p>
     *   The following important properties are useful to remember when writing code that is compatible with S3:
     *   <ul>
     *     <li>Renames are not atomic, and require copying the entire source file to the destination file</li>
 *     <li>Writes to S3 using {@link FileSystem#create(Path)} will first go to the local filesystem; when the stream
 *     is closed, the local file is uploaded to S3</li>
     *   </ul>
     * </p>
     */
    public static final Collection<String> FS_SCHEMES_NON_ATOMIC = ImmutableSortedSet
            .orderedBy(String.CASE_INSENSITIVE_ORDER).add("s3").add("s3a").add("s3n").build();
    public static final String MAX_FILESYSTEM_QPS = "filesystem.throttling.max.filesystem.qps";
    private static final List<String> DEPRECATED_KEYS = Lists.newArrayList("gobblin.copy.max.filesystem.qps");
    private static final int MAX_RENAME_TRIES = 3;
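
    // Illustrative usage sketch: deciding whether a FileSystem needs the non-atomic handling
    // described above. The helper name "requiresNonAtomicHandling" is hypothetical.
    private static boolean requiresNonAtomicHandling(FileSystem fs) {
        // The set is built with CASE_INSENSITIVE_ORDER, so the scheme's case does not matter.
        return FS_SCHEMES_NON_ATOMIC.contains(fs.getUri().getScheme());
    }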

    public static Configuration newConfiguration() {
        Configuration conf = new Configuration();

        // Explicitly check for S3 environment variables, so that Hadoop can access s3 and s3n URLs.
        // h/t https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
        String awsAccessKeyId = System.getenv("AWS_ACCESS_KEY_ID");
        String awsSecretAccessKey = System.getenv("AWS_SECRET_ACCESS_KEY");
        if (awsAccessKeyId != null && awsSecretAccessKey != null) {
            conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
            conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
            conf.set("fs.s3n.awsAccessKeyId", awsAccessKeyId);
            conf.set("fs.s3n.awsSecretAccessKey", awsSecretAccessKey);
        }

        // Add a new custom filesystem mapping
        conf.set("fs.sftp.impl", "gobblin.source.extractor.extract.sftp.SftpLightWeightFileSystem");
        conf.set("fs.sftp.impl.disable.cache", "true");
        return conf;
    }
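
    // Illustrative usage sketch: obtaining a FileSystem that picks up the S3 credentials and the
    // SFTP mapping set by newConfiguration(). The helper name and the path are hypothetical.
    private static FileSystem exampleNewConfigurationUsage() throws IOException {
        Configuration conf = newConfiguration();
        // Resolves the FileSystem for the path's scheme using the customized configuration.
        return new Path("/tmp/example").getFileSystem(conf);
    }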

    /**
     * @deprecated Use {@link FileListUtils#listFilesRecursively(FileSystem, Path)}.
     */
    @Deprecated
    public static List<FileStatus> listStatusRecursive(FileSystem fileSystem, Path path) throws IOException {
        List<FileStatus> results = Lists.newArrayList();
        walk(results, fileSystem, path);
        return results;
    }

    /**
     * Get the path as a string without scheme or authority.
     *
     * E.g. converts sftp://host/data/file.txt to /data/file.txt
     */
    public static String toUriPath(Path path) {
        return path.toUri().getPath();
    }

    /**
     * A wrapper around {@link FileSystem#delete(Path, boolean)} which throws an {@link IOException} if the given
     * {@link Path} exists and {@link FileSystem#delete(Path, boolean)} returns false.
     */
    public static void deletePath(FileSystem fs, Path f, boolean recursive) throws IOException {
        if (fs.exists(f) && !fs.delete(f, recursive)) {
            throw new IOException("Failed to delete: " + f);
        }
    }

    /**
     * A wrapper around {@link FileSystem#delete(Path, boolean)} that only deletes a given {@link Path} if it is present
     * on the given {@link FileSystem}.
     */
    public static void deleteIfExists(FileSystem fs, Path path, boolean recursive) throws IOException {
        if (fs.exists(path)) {
            deletePath(fs, path, recursive);
        }
    }

    public static void deletePathAndEmptyAncestors(FileSystem fs, Path f, boolean recursive) throws IOException {
        deletePath(fs, f, recursive);
        Path parent = f.getParent();
        while (parent != null) {
            if (fs.exists(parent) && fs.listStatus(parent).length == 0) {
                deletePath(fs, parent, true);
                parent = parent.getParent();
            } else {
                break;
            }
        }
    }

    /**
     * Renames a src {@link Path} on fs {@link FileSystem} to a dst {@link Path}. If fs is a {@link LocalFileSystem} and
     * src is a directory then {@link File#renameTo} is called directly to avoid a directory rename race condition where
     * {@link org.apache.hadoop.fs.RawLocalFileSystem#rename} copies the conflicting src directory into dst resulting in
     * an extra nested level, such as /root/a/b/c/e/e where e is repeated.
     *
     * @param fs the {@link FileSystem} where the src {@link Path} exists
     * @param src the source {@link Path} which will be renamed
     * @param dst the {@link Path} to rename to
     * @return true if rename succeeded, false if rename failed.
     * @throws IOException if rename failed for reasons other than target exists.
     */
    public static boolean renamePathHandleLocalFSRace(FileSystem fs, Path src, Path dst) throws IOException {
        if (DecoratorUtils.resolveUnderlyingObject(fs) instanceof LocalFileSystem && fs.isDirectory(src)) {
            LocalFileSystem localFs = (LocalFileSystem) DecoratorUtils.resolveUnderlyingObject(fs);
            File srcFile = localFs.pathToFile(src);
            File dstFile = localFs.pathToFile(dst);

            return srcFile.renameTo(dstFile);
        } else {
            return fs.rename(src, dst);
        }
    }

    /**
     * A wrapper around {@link FileSystem#rename(Path, Path)} which throws {@link IOException} if
     * {@link FileSystem#rename(Path, Path)} returns False.
     */
    public static void renamePath(FileSystem fs, Path oldName, Path newName) throws IOException {
        renamePath(fs, oldName, newName, false);
    }

    /**
     * A wrapper around {@link FileSystem#rename(Path, Path)} which throws {@link IOException} if
     * {@link FileSystem#rename(Path, Path)} returns False.
     */
    public static void renamePath(FileSystem fs, Path oldName, Path newName, boolean overwrite) throws IOException {
        if (!fs.exists(oldName)) {
            throw new FileNotFoundException(
                    String.format("Failed to rename %s to %s: src not found", oldName, newName));
        }
        if (fs.exists(newName)) {
            if (overwrite) {
                if (!fs.delete(newName, true)) {
                    throw new IOException(String.format("Failed to delete %s while renaming %s to %s", newName,
                            oldName, newName));
                }
            } else {
                throw new FileAlreadyExistsException(
                        String.format("Failed to rename %s to %s: dst already exists", oldName, newName));
            }
        }
        if (!fs.rename(oldName, newName)) {
            throw new IOException(String.format("Failed to rename %s to %s", oldName, newName));
        }
    }
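
    // Illustrative usage sketch: committing a staging directory over an existing output path.
    // The helper name and both paths are hypothetical.
    private static void exampleRenameWithOverwrite(FileSystem fs) throws IOException {
        Path staging = new Path("/tmp/example/staging");
        Path output = new Path("/tmp/example/output");
        // Deletes the existing output (if any), then renames staging onto it; an IOException is
        // thrown if the delete or the rename fails.
        renamePath(fs, staging, output, true);
    }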

    /**
     * Moves a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If
     * the srcFs and the dstFs have the same scheme, and neither of them is an S3 scheme, then the {@link Path} is simply
     * renamed. Otherwise, the data is copied from the src {@link Path} to the dst {@link Path}. So this method can
     * handle copying data between different {@link FileSystem} implementations.
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the source {@link Path} which will be moved
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to move data to
     */
    public static void movePath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Configuration conf)
            throws IOException {

        movePath(srcFs, src, dstFs, dst, false, conf);
    }

    /**
     * Moves a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If
     * the srcFs and the dstFs have the same scheme, and neither of them is an S3 scheme, then the {@link Path} is simply
     * renamed. Otherwise, the data is copied from the src {@link Path} to the dst {@link Path}. So this method can
     * handle copying data between different {@link FileSystem} implementations.
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the source {@link Path} which will be moved
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to move data to
     * @param overwrite true if the destination should be overwritten; otherwise, false
     */
    public static void movePath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite,
            Configuration conf) throws IOException {

        if (srcFs.getUri().getScheme().equals(dstFs.getUri().getScheme())
                && !FS_SCHEMES_NON_ATOMIC.contains(srcFs.getUri().getScheme())
                && !FS_SCHEMES_NON_ATOMIC.contains(dstFs.getUri().getScheme())) {
            renamePath(srcFs, src, dst);
        } else {
            copyPath(srcFs, src, dstFs, dst, true, overwrite, conf);
        }
    }
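
    // Illustrative usage sketch: moving a file from the local FileSystem to the default
    // FileSystem, assuming fs.defaultFS points at an HDFS cluster. The helper name and the
    // paths are hypothetical.
    private static void exampleMove(Configuration conf) throws IOException {
        FileSystem localFs = FileSystem.getLocal(conf);
        FileSystem dstFs = FileSystem.get(conf);
        // Different schemes (file vs. hdfs), so movePath copies the data and deletes the source
        // instead of renaming.
        movePath(localFs, new Path("/tmp/example/file.txt"), dstFs, new Path("/data/file.txt"), conf);
    }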

    /**
     * Copies data from a src {@link Path} to a dst {@link Path}.
     *
     * <p>
     *   This method should be used in preference to
     *   {@link FileUtil#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, Configuration)}, which does not handle
     *   clean up of incomplete files if there is an error while copying data.
     * </p>
     *
     * <p>
     *   TODO this method does not handle cleaning up any local files leftover by writing to S3.
     * </p>
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the {@link Path} to copy from the source {@link FileSystem}
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to copy data to
     */
    public static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Configuration conf)
            throws IOException {

        copyPath(srcFs, src, dstFs, dst, false, false, conf);
    }

    /**
     * Copies data from a src {@link Path} to a dst {@link Path}.
     *
     * <p>
     *   This method should be used in preference to
     *   {@link FileUtil#copy(FileSystem, Path, FileSystem, Path, boolean, boolean, Configuration)}, which does not handle
     *   clean up of incomplete files if there is an error while copying data.
     * </p>
     *
     * <p>
     *   TODO this method does not handle cleaning up any local files leftover by writing to S3.
     * </p>
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the {@link Path} to copy from the source {@link FileSystem}
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to copy data to
     * @param overwrite true if the destination should be overwritten; otherwise, false
     */
    public static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite,
            Configuration conf) throws IOException {

        copyPath(srcFs, src, dstFs, dst, false, overwrite, conf);
    }

    private static void copyPath(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean deleteSource,
            boolean overwrite, Configuration conf) throws IOException {

        Preconditions.checkArgument(srcFs.exists(src),
                String.format("Cannot copy from %s to %s because src does not exist", src, dst));
        Preconditions.checkArgument(overwrite || !dstFs.exists(dst),
                String.format("Cannot copy from %s to %s because dst exists", src, dst));

        try {
            boolean isSourceFileSystemLocal = srcFs instanceof LocalFileSystem
                    || srcFs instanceof RawLocalFileSystem;
            if (isSourceFileSystemLocal) {
                try {
                    dstFs.copyFromLocalFile(deleteSource, overwrite, src, dst);
                } catch (IOException e) {
                    throw new IOException(String.format("Failed to copy %s to %s", src, dst), e);
                }
            } else if (!FileUtil.copy(srcFs, src, dstFs, dst, deleteSource, overwrite, conf)) {
                throw new IOException(String.format("Failed to copy %s to %s", src, dst));
            }
        } catch (Throwable t1) {
            try {
                deleteIfExists(dstFs, dst, true);
            } catch (Throwable t2) {
                // Do nothing
            }
            throw t1;
        }
    }

    /**
     * Copies a src {@link Path} from a srcFs {@link FileSystem} to a dst {@link Path} on a dstFs {@link FileSystem}. If
     * either the srcFs or dstFs are S3 {@link FileSystem}s (as dictated by {@link #FS_SCHEMES_NON_ATOMIC}) then data is directly
     * copied from the src to the dst. Otherwise data is first copied to a tmp {@link Path}, which is then renamed to the
     * dst.
     *
     * @param srcFs the source {@link FileSystem} where the src {@link Path} exists
     * @param src the {@link Path} to copy from the source {@link FileSystem}
     * @param dstFs the destination {@link FileSystem} where the dst {@link Path} should be created
     * @param dst the {@link Path} to copy data to
     * @param tmp the temporary {@link Path} to use when copying data
     * @param overwriteDst true if the destination and tmp path should be overwritten, false otherwise
     */
    public static void copyFile(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, Path tmp,
            boolean overwriteDst, Configuration conf) throws IOException {

        Preconditions.checkArgument(srcFs.isFile(src),
                String.format("Cannot copy from %s to %s because src is not a file", src, dst));

        if (FS_SCHEMES_NON_ATOMIC.contains(srcFs.getUri().getScheme())
                || FS_SCHEMES_NON_ATOMIC.contains(dstFs.getUri().getScheme())) {
            copyFile(srcFs, src, dstFs, dst, overwriteDst, conf);
        } else {
            copyFile(srcFs, src, dstFs, tmp, overwriteDst, conf);
            try {
                boolean renamed = false;
                if (overwriteDst && dstFs.exists(dst)) {
                    try {
                        deletePath(dstFs, dst, true);
                    } finally {
                        renamePath(dstFs, tmp, dst);
                        renamed = true;
                    }
                }
                if (!renamed) {
                    renamePath(dstFs, tmp, dst);
                }
            } finally {
                deletePath(dstFs, tmp, true);
            }
        }
    }
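
    // Illustrative usage sketch: copying a file through a tmp path so that the destination only
    // ever appears fully written; for S3-style schemes the staged rename is skipped because it
    // would not be atomic anyway. The helper name and the paths are hypothetical.
    private static void exampleStagedCopy(FileSystem srcFs, FileSystem dstFs, Configuration conf)
            throws IOException {
        Path src = new Path("/data/input/part-0.avro");
        Path dst = new Path("/data/output/part-0.avro");
        Path tmp = new Path("/data/output/_tmp/part-0.avro");
        copyFile(srcFs, src, dstFs, dst, tmp, true, conf);
    }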

    /**
     * Copy a file from a srcFs {@link FileSystem} to a dstFs {@link FileSystem}. The src {@link Path} must be a file,
     * that is {@link FileSystem#isFile(Path)} must return true for src.
     *
     * <p>
     *   If overwrite is specified to true, this method may delete the dst directory even if the copy from src to dst fails.
     * </p>
     *
     * @param srcFs the src {@link FileSystem} to copy the file from
     * @param src the src {@link Path} to copy
     * @param dstFs the destination {@link FileSystem} to write to
     * @param dst the destination {@link Path} to write to
     * @param overwrite true if the dst {@link Path} should be overwritten, false otherwise
     */
    public static void copyFile(FileSystem srcFs, Path src, FileSystem dstFs, Path dst, boolean overwrite,
            Configuration conf) throws IOException {

        Preconditions.checkArgument(srcFs.isFile(src),
                String.format("Cannot copy from %s to %s because src is not a file", src, dst));
        Preconditions.checkArgument(overwrite || !dstFs.exists(dst),
                String.format("Cannot copy from %s to %s because dst exists", src, dst));

        try (InputStream in = srcFs.open(src); OutputStream out = dstFs.create(dst, overwrite)) {
            IOUtils.copyBytes(in, out, conf, false);
        } catch (Throwable t1) {
            try {
                deleteIfExists(dstFs, dst, true);
            } catch (Throwable t2) {
                // Do nothing
            }
            throw t1;
        }
    }

    private static void walk(List<FileStatus> results, FileSystem fileSystem, Path path) throws IOException {
        for (FileStatus status : fileSystem.listStatus(path)) {
            if (!status.isDirectory()) {
                results.add(status);
            } else {
                walk(results, fileSystem, status.getPath());
            }
        }
    }

    /**
     * This method is an additive implementation of the {@link FileSystem#rename(Path, Path)} method. It moves all the
     * files/directories under 'from' path to the 'to' path without overwriting existing directories in the 'to' path.
     *
     * <p>
     * The rename operation happens at the first non-existent sub-directory. If a directory at the destination path
     * already exists, it recursively tries to move its sub-directories. If all the sub-directories also exist at the
     * destination, a file-level move is done.
     * </p>
     *
     * @param fileSystem the {@link FileSystem} on which the data needs to be moved
     * @param from source path of the data to be moved
     * @param to destination path for the data
     */
    public static void renameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException {

        log.info(String.format("Recursively renaming %s in %s to %s.", from, fileSystem.getUri(), to));

        FileSystem throttledFS = getOptionallyThrottledFileSystem(fileSystem, 10000);

        ExecutorService executorService = ScalingThreadPoolExecutor.newScalingThreadPool(1, 100, 100,
                ExecutorsUtils.newThreadFactory(Optional.of(log), Optional.of("rename-thread-%d")));
        Queue<Future<?>> futures = Queues.newConcurrentLinkedQueue();

        try {
            if (!fileSystem.exists(from)) {
                throw new IOException("Trying to rename a path that does not exist! " + from);
            }

            futures.add(executorService.submit(new RenameRecursively(throttledFS, fileSystem.getFileStatus(from),
                    to, executorService, futures)));
            int futuresUsed = 0;
            while (!futures.isEmpty()) {
                try {
                    futures.poll().get();
                    futuresUsed++;
                } catch (ExecutionException | InterruptedException ee) {
                    throw new IOException(ee.getCause());
                }
            }

            log.info(String.format("Recursive renaming of %s to %s. (details: used %d futures)", from, to,
                    futuresUsed));

        } finally {
            ExecutorsUtils.shutdownExecutorService(executorService, Optional.of(log), 1, TimeUnit.SECONDS);
        }
    }
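
    // Illustrative usage sketch: additively renaming a staging tree into a final location;
    // directories that already exist under the destination are descended into rather than
    // overwritten. The helper name and the paths are hypothetical.
    private static void exampleRenameRecursively(FileSystem fs) throws IOException {
        renameRecursively(fs, new Path("/data/staging/run-1"), new Path("/data/final"));
    }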

    /**
     * Calls {@link #getOptionallyThrottledFileSystem(FileSystem, int)} parsing the qps from the input {@link State}
     * at key {@link #MAX_FILESYSTEM_QPS}.
     * @throws IOException
     */
    public static FileSystem getOptionallyThrottledFileSystem(FileSystem fs, State state) throws IOException {
        DeprecationUtils.renameDeprecatedKeys(state, MAX_FILESYSTEM_QPS, DEPRECATED_KEYS);

        if (state.contains(MAX_FILESYSTEM_QPS)) {
            return getOptionallyThrottledFileSystem(fs, state.getPropAsInt(MAX_FILESYSTEM_QPS));
        }
        return fs;
    }

    /**
     * Get a throttled {@link FileSystem} that limits the number of queries per second to a {@link FileSystem}. If
     * the input qps is <= 0, no such throttling will be performed.
     * @throws IOException
     */
    public static FileSystem getOptionallyThrottledFileSystem(FileSystem fs, int qpsLimit) throws IOException {
        if (fs instanceof Decorator) {
            for (Object obj : DecoratorUtils.getDecoratorLineage(fs)) {
                if (obj instanceof RateControlledFileSystem) {
                    // Already rate controlled
                    return fs;
                }
            }
        }

        if (qpsLimit > 0) {
            try {
                RateControlledFileSystem newFS = new RateControlledFileSystem(fs, qpsLimit);
                newFS.startRateControl();
                return newFS;
            } catch (ExecutionException ee) {
                throw new IOException("Could not create throttled FileSystem.", ee);
            }
        }
        return fs;
    }
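
    // Illustrative usage sketch: capping a FileSystem at roughly 100 queries per second. A
    // non-positive limit returns the FileSystem unchanged, and an already rate-controlled
    // FileSystem is never wrapped twice. The helper name and the limit are hypothetical.
    private static FileSystem exampleThrottledFileSystem(FileSystem fs) throws IOException {
        return getOptionallyThrottledFileSystem(fs, 100);
    }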

    @AllArgsConstructor
    private static class RenameRecursively implements Runnable {

        private final FileSystem fileSystem;
        private final FileStatus from;
        private final Path to;
        private final ExecutorService executorService;
        private final Queue<Future<?>> futures;

        @Override
        public void run() {
            try {

                // Attempt to move safely if directory, unsafely if file (for performance, files are much less likely to collide on target)
                boolean moveSuccessful = this.from.isDirectory()
                        ? safeRenameIfNotExists(this.fileSystem, this.from.getPath(), this.to)
                        : unsafeRenameIfNotExists(this.fileSystem, this.from.getPath(), this.to);

                if (!moveSuccessful) {
                    if (this.from.isDirectory()) {
                        for (FileStatus fromFile : this.fileSystem.listStatus(this.from.getPath())) {
                            Path relativeFilePath = new Path(
                                    StringUtils.substringAfter(fromFile.getPath().toString(),
                                            this.from.getPath().toString() + Path.SEPARATOR));
                            Path toFilePath = new Path(this.to, relativeFilePath);
                            this.futures.add(this.executorService.submit(new RenameRecursively(this.fileSystem,
                                    fromFile, toFilePath, this.executorService, this.futures)));
                        }
                    } else {
                        log.info(String.format("File already exists %s. Will not rewrite", this.to));
                    }

                }

            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        }
    }

    /**
     * Renames {@code from} to {@code to} if {@code to} doesn't exist, in a thread-safe way. This method is necessary because
     * {@link FileSystem#rename} is inconsistent across file system implementations, e.g. in some of them rename(foo, bar)
     * will create bar/foo if bar already existed, but it will only create bar if it didn't.
     *
     * <p>
     *   The thread-safety is only guaranteed among calls to this method. An external modification to the relevant
     *   target directory could still cause unexpected results in the renaming.
     * </p>
     *
     * @param fs filesystem where rename will be executed.
     * @param from origin {@link Path}.
     * @param to target {@link Path}.
     * @return true if rename succeeded, false if the target already exists.
     * @throws IOException if rename failed for reasons other than target exists.
     */
    public synchronized static boolean safeRenameIfNotExists(FileSystem fs, Path from, Path to) throws IOException {
        return unsafeRenameIfNotExists(fs, from, to);
    }

    /**
     * Renames {@code from} to {@code to} if {@code to} doesn't exist, in a non-thread-safe way.
     *
     * @param fs filesystem where rename will be executed.
     * @param from origin {@link Path}.
     * @param to target {@link Path}.
     * @return true if rename succeeded, false if the target already exists.
     * @throws IOException if rename failed for reasons other than target exists.
     */
    public static boolean unsafeRenameIfNotExists(FileSystem fs, Path from, Path to) throws IOException {
        if (!fs.exists(to)) {
            if (!fs.exists(to.getParent())) {
                fs.mkdirs(to.getParent());
            }

            if (!renamePathHandleLocalFSRace(fs, from, to)) {
                if (!fs.exists(to)) {
                    throw new IOException(String.format("Failed to rename %s to %s.", from, to));
                }

                return false;
            }
            return true;
        }
        return false;
    }

    /**
     * A thread-safe variation of {@link #renamePath(FileSystem, Path, Path)} which can be used in a
     * multi-threaded/multi-mapper environment. The rename operation always happens at the file level, hence
     * directories under the 'to' path are not overwritten.
     *
     * <p>
     * If the contents of the destination 'to' path are not expected to be modified concurrently, use
     * {@link #renamePath(FileSystem, Path, Path)}, which is faster and more optimized.
     * </p>
     *
     * <b>NOTE: This does not seem to work for all {@link FileSystem} implementations. Use
     * {@link #renameRecursively(FileSystem, Path, Path)}</b>
     *
     * @param fileSystem the {@link FileSystem} on which the data needs to be moved
     * @param from source path of the data to be moved
     * @param to destination path for the data
     *
     */
    public static void safeRenameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException {

        for (FileStatus fromFile : FileListUtils.listFilesRecursively(fileSystem, from)) {

            Path relativeFilePath = new Path(
                    StringUtils.substringAfter(fromFile.getPath().toString(), from.toString() + Path.SEPARATOR));

            Path toFilePath = new Path(to, relativeFilePath);

            if (!fileSystem.exists(toFilePath)) {
                boolean renamed = false;

                // underlying file open can fail with file not found error due to some race condition
                // when the parent directory is created in another thread, so retry a few times
                for (int i = 0; !renamed && i < MAX_RENAME_TRIES; i++) {
                    try {
                        renamed = fileSystem.rename(fromFile.getPath(), toFilePath);
                        break;
                    } catch (FileNotFoundException e) {
                        if (i + 1 >= MAX_RENAME_TRIES) {
                            throw e;
                        }
                    }
                }

                if (!renamed) {
                    throw new IOException(
                            String.format("Failed to rename %s to %s.", fromFile.getPath(), toFilePath));
                }
                log.info(String.format("Renamed %s to %s", fromFile.getPath(), toFilePath));
            } else {
                log.info(String.format("File already exists %s. Will not rewrite", toFilePath));
            }
        }
    }

    public static Configuration getConfFromState(State state) {
        return getConfFromState(state, Optional.<String>absent());
    }

    /**
     * Provides a Hadoop configuration for the given state.
     * It also supports decrypting values under "encryptedPath".
     * Note that the encryptedPath prefix will be removed from the full path of each config key, leaving only the child
     * path on the key(s). If the same config path already exists as a child path, the stripped one will have higher priority.
     *
     * e.g.:
     * - encryptedPath: writer.fs.encrypted
     *   before: writer.fs.encrypted.secret
     *   after: secret
     *
     * Common use case for encryptedPath:
     *   When there is an encrypted credential in the job properties but you'd like the FileSystem to get the decrypted value.
     *
     * @param state source state.
     * @param encryptedPath Optional. If provided, config that is on this path will be decrypted. @see ConfigUtils.resolveEncrypted
     *                      Note that config on the encryptedPath will be included in the end result even if it's not part of includeOnlyPath
     * @return Hadoop Configuration.
     */
    public static Configuration getConfFromState(State state, Optional<String> encryptedPath) {
        Config config = ConfigFactory.parseProperties(state.getProperties());
        if (encryptedPath.isPresent()) {
            config = ConfigUtils.resolveEncrypted(config, encryptedPath);
        }
        Configuration conf = newConfiguration();

        for (Entry<String, ConfigValue> entry : config.entrySet()) {
            conf.set(entry.getKey(), entry.getValue().unwrapped().toString());
        }
        return conf;
    }
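
    // Illustrative usage sketch: building a Hadoop Configuration from job state while decrypting
    // keys under the prefix from the javadoc example above, so that "writer.fs.encrypted.secret"
    // ends up in the Configuration as "secret". The helper name is hypothetical.
    private static Configuration exampleConfWithEncryptedPath(State state) {
        return getConfFromState(state, Optional.of("writer.fs.encrypted"));
    }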

    public static Configuration getConfFromProperties(Properties properties) {
        Configuration conf = newConfiguration();
        for (String propName : properties.stringPropertyNames()) {
            conf.set(propName, properties.getProperty(propName));
        }
        return conf;
    }

    public static State getStateFromConf(Configuration conf) {
        State state = new State();
        for (Entry<String, String> entry : conf) {
            state.setProp(entry.getKey(), entry.getValue());
        }
        return state;
    }

    /**
     * Set the group associated with a given path.
     *
     * @param fs the {@link FileSystem} instance used to perform the file operation
     * @param path the given path
     * @param group the group associated with the path
     * @throws IOException
     */
    public static void setGroup(FileSystem fs, Path path, String group) throws IOException {
        fs.setOwner(path, fs.getFileStatus(path).getOwner(), group);
    }

    /**
     * Serialize a {@link Writable} object into a string.
     *
     * @param writable the {@link Writable} object to be serialized
     * @return a string serialized from the {@link Writable} object
     * @throws IOException if there's something wrong with the serialization
     */
    public static String serializeToString(Writable writable) throws IOException {
        try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
                DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream)) {
            writable.write(dataOutputStream);
            return BaseEncoding.base64().encode(byteArrayOutputStream.toByteArray());
        }
    }

    /**
     * Deserialize a {@link Writable} object from a string.
     *
     * @param writableClass the {@link Writable} implementation class
     * @param serializedWritableStr the string containing a serialized {@link Writable} object
     * @return a {@link Writable} deserialized from the string
     * @throws IOException if there's something wrong with the deserialization
     */
    public static Writable deserializeFromString(Class<? extends Writable> writableClass,
            String serializedWritableStr) throws IOException {
        return deserializeFromString(writableClass, serializedWritableStr, new Configuration());
    }

    /**
     * Deserialize a {@link Writable} object from a string.
     *
     * @param writableClass the {@link Writable} implementation class
     * @param serializedWritableStr the string containing a serialized {@link Writable} object
     * @param configuration a {@link Configuration} object containing Hadoop configuration properties
     * @return a {@link Writable} deserialized from the string
     * @throws IOException if there's something wrong with the deserialization
     */
    public static Writable deserializeFromString(Class<? extends Writable> writableClass,
            String serializedWritableStr, Configuration configuration) throws IOException {
        byte[] writableBytes = BaseEncoding.base64().decode(serializedWritableStr);

        try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(writableBytes);
                DataInputStream dataInputStream = new DataInputStream(byteArrayInputStream)) {
            Writable writable = ReflectionUtils.newInstance(writableClass, configuration);
            writable.readFields(dataInputStream);
            return writable;
        }
    }
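
    // Illustrative usage sketch: Base64 round trip of a Writable, using org.apache.hadoop.io.Text
    // purely as an example Writable implementation. The helper name is hypothetical.
    private static void exampleWritableRoundTrip() throws IOException {
        org.apache.hadoop.io.Text original = new org.apache.hadoop.io.Text("hello");
        String encoded = serializeToString(original);
        org.apache.hadoop.io.Text decoded = (org.apache.hadoop.io.Text) deserializeFromString(
                org.apache.hadoop.io.Text.class, encoded);
        Preconditions.checkState(original.equals(decoded));
    }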

    /**
     * Given an {@link FsPermission} object, set a key, value pair in the given {@link State} for the writer to
     * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}.
     */
    public static void serializeWriterFilePermissions(State state, int numBranches, int branchId,
            FsPermission fsPermissions) {
        serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(
                ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), fsPermissions);
    }

    /**
     * Given an {@link FsPermission} object, set a key, value pair in the given {@link State} for the writer to
     * use when creating directories. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}.
     */
    public static void serializeWriterDirPermissions(State state, int numBranches, int branchId,
            FsPermission fsPermissions) {
        serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(
                ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), fsPermissions);
    }

    /**
     * Helper method that serializes a {@link FsPermission} object.
     */
    private static void serializeFsPermissions(State state, String key, FsPermission fsPermissions) {
        state.setProp(key, String.format("%04o", fsPermissions.toShort()));
    }

    /**
     * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to
     * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}.
     */
    public static void setWriterFileOctalPermissions(State state, int numBranches, int branchId,
            String octalPermissions) {
        state.setProp(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS,
                numBranches, branchId), octalPermissions);
    }

    /**
     * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to
     * use when creating directories. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}.
     */
    public static void setWriterDirOctalPermissions(State state, int numBranches, int branchId,
            String octalPermissions) {
        state.setProp(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS,
                numBranches, branchId), octalPermissions);
    }

    /**
     * Deserializes an {@link FsPermission} object that should be used when a {@link DataWriter} is writing a file.
     */
    public static FsPermission deserializeWriterFilePermissions(State state, int numBranches, int branchId) {
        return new FsPermission(state.getPropAsShortWithRadix(
                ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches,
                        branchId),
                FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX));
    }

    /**
     * Deserializes an {@link FsPermission} object that should be used when a {@link DataWriter} is creating directories.
     */
    public static FsPermission deserializeWriterDirPermissions(State state, int numBranches, int branchId) {
        return new FsPermission(state.getPropAsShortWithRadix(
                ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches,
                        branchId),
                FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX));
    }
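
    // Illustrative usage sketch: storing writer file permissions in octal notation and reading
    // them back as an FsPermission. The helper name and the branch indices are hypothetical.
    private static FsPermission examplePermissionRoundTrip(State state) {
        setWriterFileOctalPermissions(state, 1, 0, "0755");
        return deserializeWriterFilePermissions(state, 1, 0);
    }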

    /**
     * Get {@link FsPermission} from a {@link State} object.
     *
     * @param props A {@link State} containing properties.
     * @param propName The property name for the permission. If not contained in the given state,
     * defaultPermission will be used.
     * @param defaultPermission default permission if propName is not contained in props.
     * @return An {@link FsPermission} object.
     */
    public static FsPermission deserializeFsPermission(State props, String propName,
            FsPermission defaultPermission) {
        short mode = props.getPropAsShortWithRadix(propName, defaultPermission.toShort(),
                ConfigurationKeys.PERMISSION_PARSING_RADIX);
        return new FsPermission(mode);
    }

    /**
     * Remove illegal HDFS path characters from the given path. Illegal characters will be replaced
     * with the given substitute.
     */
    public static String sanitizePath(String path, String substitute) {
        Preconditions.checkArgument(substitute.replaceAll(HDFS_ILLEGAL_TOKEN_REGEX, "").equals(substitute),
                "substitute contains illegal characters: " + substitute);

        return path.replaceAll(HDFS_ILLEGAL_TOKEN_REGEX, substitute);
    }

    /**
     * Remove illegal HDFS path characters from the given path. Illegal characters will be replaced
     * with the given substitute.
     */
    public static Path sanitizePath(Path path, String substitute) {
        return new Path(sanitizePath(path.toString(), substitute));
    }
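
    // Illustrative usage sketch: HDFS_ILLEGAL_TOKEN_REGEX matches whitespace, ':' and '\', so the
    // timestamp below becomes "2017-01-01_12_00_00". The helper name and the input are hypothetical.
    private static String exampleSanitizedPath() {
        return sanitizePath("2017-01-01 12:00:00", "_");
    }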

    /**
     * Try to set owner and permissions for the path. Will not throw exception.
     */
    public static void setPermissions(Path location, Optional<String> owner, Optional<String> group, FileSystem fs,
            FsPermission permission) {
        try {
            if (!owner.isPresent()) {
                return;
            }
            if (!group.isPresent()) {
                return;
            }
            fs.setOwner(location, owner.get(), group.get());
            fs.setPermission(location, permission);
            if (!fs.isDirectory(location)) {
                return;
            }
            for (FileStatus fileStatus : fs.listStatus(location)) {
                setPermissions(fileStatus.getPath(), owner, group, fs, permission);
            }
        } catch (IOException e) {
            log.warn("Exception occurred while trying to change permissions : " + e.getMessage());
        }
    }

    public static boolean hasContent(FileSystem fs, Path path) throws IOException {
        if (!fs.isDirectory(path)) {
            return true;
        }
        boolean content = false;
        for (FileStatus fileStatus : fs.listStatus(path)) {
            content = content || hasContent(fs, fileStatus.getPath());
            if (content) {
                break;
            }
        }
        return content;
    }

    /**
     * Add "gobblin-site.xml" as a {@link Configuration} resource.
     */
    public static void addGobblinSite() {
        Configuration.addDefaultResource("gobblin-site.xml");
    }
}