org.apache.hadoop.tools.DistCp.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.tools.DistCp.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools;

import java.io.IOException;
import java.util.Random;

import com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.tools.CopyListing.*;
import org.apache.hadoop.tools.mapred.CopyMapper;
import org.apache.hadoop.tools.mapred.CopyOutputFormat;
import org.apache.hadoop.tools.util.DistCpUtils;
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.annotations.VisibleForTesting;

/**
 * DistCp is the main driver-class for DistCpV2.
 * For command-line use, DistCp::main() orchestrates the parsing of command-line
 * parameters and the launch of the DistCp job.
 * For programmatic use, a DistCp object can be constructed by specifying
 * options (in a DistCpOptions object), and DistCp::execute() may be used to
 * launch the copy-job. DistCp may alternatively be sub-classed to fine-tune
 * behaviour.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class DistCp extends Configured implements Tool {

    /**
     * Priority of the shutdown hook.
     * Passed to the ShutdownHookManager when main() registers the Cleanup hook.
     */
    static final int SHUTDOWN_HOOK_PRIORITY = 30;

    /** Logger shared by this class. */
    static final Logger LOG = LoggerFactory.getLogger(DistCp.class);

    /**
     * Options/state for the current copy. Set by the public constructor (when
     * options are supplied) or by run() after parsing the command line.
     */
    @VisibleForTesting
    DistCpContext context;

    /**
     * Per-job meta folder computed by createMetaFolderPath(). It is the parent
     * of the copy listing ("fileList.seq") and of the default "_logs" output
     * path, and is what cleanup() deletes.
     */
    private Path metaFolder;

    /** Name prefix of the meta folder created under the staging directory. */
    private static final String PREFIX = "_distcp";
    /** Name prefix of the temporary work directory used for atomic commits. */
    private static final String WIP_PREFIX = "._WIP_";
    /** Configuration resources added on top of the base configuration. */
    private static final String DISTCP_DEFAULT_XML = "distcp-default.xml";
    private static final String DISTCP_SITE_XML = "distcp-site.xml";
    /** Source of the random suffixes appended to meta-folder and work-directory names. */
    static final Random rand = new Random();

    /** Set to true once job.submit() has returned; cleanup is skipped from then on. */
    private boolean submitted;
    /** File system of {@link #metaFolder}; used by cleanup() to delete it. */
    private FileSystem jobFS;

    /**
     * Builds the copy listing for the given job, choosing between the regular
     * listing and the snapshot-diff based listing.
     *
     * @param job the job whose configuration and credentials the listing uses
     * @throws Exception if the snapshot sync fails or the listing cannot be built
     */
    private void prepareFileListing(Job job) throws Exception {
        if (!context.shouldUseSnapshotDiff()) {
            // Neither "-diff" nor "-rdiff" was passed: build the listing
            // in the regular way.
            createInputFileListing(job);
            return;
        }

        // "-diff" or "-rdiff" was passed: sync() has to succeed first, and only
        // then is the listing derived from the snapshot diff.
        final DistCpSync snapshotSync = new DistCpSync(context, getConf());
        if (!snapshotSync.sync()) {
            throw new Exception("DistCp sync failed, input options: " + context);
        }
        createInputFileListingWithDiff(job, snapshotSync);
    }

    /**
     * Public Constructor. Creates DistCp object with specified input-parameters.
     * (E.g. source-paths, target-location, etc.)
     * @param configuration configuration against which the Copy-mapper must run
     * @param inputOptions Immutable options
     * @throws Exception
     */
    public DistCp(Configuration configuration, DistCpOptions inputOptions) throws Exception {
        Configuration config = new Configuration(configuration);
        config.addResource(DISTCP_DEFAULT_XML);
        config.addResource(DISTCP_SITE_XML);
        setConf(config);
        if (inputOptions != null) {
            this.context = new DistCpContext(inputOptions);
        }
        this.metaFolder = createMetaFolderPath();
    }

    /**
     * To be used with the ToolRunner. Not for public consumption.
     * <p>
     * Creates no DistCpContext and computes no meta folder; run() builds the
     * context from the command-line arguments, and main() supplies the
     * configuration through the ToolRunner.
     */
    @VisibleForTesting
    DistCp() {
    }

    /**
     * Implementation of Tool::run(). Orchestrates the copy of source file(s)
     * to target location, by:
     *  1. Parsing and validating the command-line arguments.
     *  2. Creating a list of files to be copied to target.
     *  3. Launching a Map-only job to copy the files. (Delegates to execute().)
     * @param argv List of arguments passed to DistCp, from the ToolRunner.
     * @return {@link DistCpConstants#SUCCESS} on success. Otherwise one of the
     *         DistCpConstants error codes: INVALID_ARGUMENT (no arguments, a
     *         failure while parsing/validating them, or invalid input),
     *         DUPLICATE_INPUT, ACLS_NOT_SUPPORTED, XATTRS_NOT_SUPPORTED, or
     *         UNKNOWN_ERROR for any other exception.
     */
    @Override
    public int run(String[] argv) {
        if (argv.length < 1) {
            OptionsParser.usage();
            return DistCpConstants.INVALID_ARGUMENT;
        }

        try {
            context = new DistCpContext(OptionsParser.parse(argv));
            checkSplitLargeFile();
            setTargetPathExists();
            LOG.info("Input Options: {}", context);
        } catch (Throwable e) {
            // NOTE: Throwable is caught here, so every failure in the
            // parse/validate phase is reported as INVALID_ARGUMENT.
            LOG.error("Invalid arguments: ", e);
            System.err.println("Invalid arguments: " + e.getMessage());
            OptionsParser.usage();
            return DistCpConstants.INVALID_ARGUMENT;
        }

        try {
            execute();
        } catch (InvalidInputException e) {
            LOG.error("Invalid input: ", e);
            return DistCpConstants.INVALID_ARGUMENT;
        } catch (DuplicateFileException e) {
            LOG.error("Duplicate files in input path: ", e);
            return DistCpConstants.DUPLICATE_INPUT;
        } catch (AclsNotSupportedException e) {
            LOG.error("ACLs not supported on at least one file system: ", e);
            return DistCpConstants.ACLS_NOT_SUPPORTED;
        } catch (XAttrsNotSupportedException e) {
            LOG.error("XAttrs not supported on at least one file system: ", e);
            return DistCpConstants.XATTRS_NOT_SUPPORTED;
        } catch (Exception e) {
            LOG.error("Exception encountered ", e);
            return DistCpConstants.UNKNOWN_ERROR;
        }
        return DistCpConstants.SUCCESS;
    }

    /**
     * Core execution: creates and submits the copy job and, when the context
     * asks for blocking behaviour, waits for it to finish.
     *
     * @return handle of the submitted job
     * @throws IllegalStateException if no DistCpContext has been created yet
     * @throws Exception if job creation, submission or execution fails
     */
    public Job execute() throws Exception {
        Preconditions.checkState(context != null,
                "The DistCpContext should have been created before running DistCp!");

        final Job submittedJob = createAndSubmitJob();
        final boolean blocking = context.shouldBlock();
        if (blocking) {
            waitForJobCompletion(submittedJob);
        }
        return submittedJob;
    }

    /**
     * Create and submit the mapreduce job.
     * <p>
     * Computes a fresh meta folder, creates the job, builds the file listing
     * and submits the job. If anything fails before submission has been
     * recorded, the meta folder is cleaned up.
     *
     * @return The mapreduce job object that has been submitted
     * @throws IllegalStateException if the context or the configuration is not set
     * @throws Exception if job creation, listing creation or submission fails
     */
    public Job createAndSubmitJob() throws Exception {
        // Explicit checks instead of "assert": assertions are disabled by default,
        // and execute() validates the context the same way.
        Preconditions.checkState(context != null,
                "The DistCpContext should have been created before running DistCp!");
        Preconditions.checkState(getConf() != null,
                "The Configuration should have been set before running DistCp!");
        Job job = null;
        try {
            synchronized (this) {
                // Don't cleanup while we are setting up: cleanup() synchronizes
                // on the same monitor.
                metaFolder = createMetaFolderPath();
                jobFS = metaFolder.getFileSystem(getConf());
                job = createJob();
            }
            prepareFileListing(job);
            job.submit();
            submitted = true;
        } finally {
            // NOTE(review): "submitted" is never reset, so on a reused instance a
            // failure in a later call skips this cleanup - confirm whether reuse
            // of a DistCp object is supported.
            if (!submitted) {
                cleanup();
            }
        }

        String jobID = job.getJobID().toString();
        job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);
        LOG.info("DistCp job-id: " + jobID);

        return job;
    }

    /**
     * Blocks until the given job finishes.
     *
     * @param job the given mapreduce job that has already been submitted
     * @throws IOException if the job reports that it did not complete
     *                     successfully; the message carries the job id and the
     *                     failure info from the job status
     * @throws Exception if waiting for the job fails
     */
    public void waitForJobCompletion(Job job) throws Exception {
        assert job != null;
        final boolean succeeded = job.waitForCompletion(true);
        if (succeeded) {
            return;
        }
        throw new IOException(
                "DistCp failure: Job " + job.getJobID() + " has failed: " + job.getStatus().getFailureInfo());
    }

    /**
     * Checks whether the target path currently exists and records the answer
     * both in the context and in the configuration (the original note says this
     * is for the benefit of CopyCommitter).
     *
     * @throws IOException if the target file system cannot be obtained or queried
     */
    private void setTargetPathExists() throws IOException {
        final Path targetPath = context.getTargetPath();
        final boolean exists = targetPath.getFileSystem(getConf()).exists(targetPath);
        context.setTargetPathExists(exists);
        getConf().setBoolean(DistCpConstants.CONF_LABEL_TARGET_PATH_EXISTS, exists);
    }

    /**
     * When splitting of large files was requested, verifies that the target
     * file system supports concat and turns off randomization of the simple
     * file listing. Does nothing when splitting was not requested.
     *
     * @throws IOException if the target file system cannot be obtained
     * @throws UnsupportedOperationException if the target file system reports
     *         that concat is not supported
     */
    private void checkSplitLargeFile() throws IOException {
        if (!context.splitLargeFile()) {
            return;
        }

        final Path targetPath = context.getTargetPath();
        final FileSystem fs = targetPath.getFileSystem(getConf());
        try {
            // Probe only: concat is invoked with null arguments. An
            // UnsupportedOperationException is taken to mean "no concat support".
            final Path[] noSources = null;
            final Path noTarget = null;
            fs.concat(noTarget, noSources);
        } catch (UnsupportedOperationException unsupported) {
            throw new UnsupportedOperationException(DistCpOptionSwitch.BLOCKS_PER_CHUNK.getSwitch()
                    + " is not supported since the target file system doesn't" + " support concat.", unsupported);
        } catch (Exception ignored) {
            // Any other failure of the null-argument probe is intentionally ignored.
        }

        LOG.info("Set " + DistCpConstants.CONF_LABEL_SIMPLE_LISTING_RANDOMIZE_FILES + " to false since "
                + DistCpOptionSwitch.BLOCKS_PER_CHUNK.getSwitch() + " is passed.");
        getConf().setBoolean(DistCpConstants.CONF_LABEL_SIMPLE_LISTING_RANDOMIZE_FILES, false);
    }

    /**
     * Builds the (not yet submitted) job with all DistCp settings applied: job
     * name, input/output formats, the copy mapper, zero reducers, speculative
     * map execution disabled and the number of maps taken from the context.
     *
     * @return Reference to job object.
     * @throws IOException - Exception if any
     */
    private Job createJob() throws IOException {
        // A user-chosen job name, when present, is appended to the "distcp" prefix.
        final String userChosenName = getConf().get(JobContext.JOB_NAME);
        final String jobName = (userChosenName == null) ? "distcp" : "distcp" + ": " + userChosenName;

        final Job job = Job.getInstance(getConf());
        job.setJobName(jobName);
        job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), context));
        job.setJarByClass(CopyMapper.class);
        configureOutputFormat(job);

        job.setMapperClass(CopyMapper.class);
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputFormatClass(CopyOutputFormat.class);

        final Configuration jobConf = job.getConfiguration();
        jobConf.set(JobContext.MAP_SPECULATIVE, "false");
        jobConf.set(JobContext.NUM_MAPS, String.valueOf(context.getMaxMaps()));
        context.appendToConf(jobConf);
        return job;
    }

    /**
     * Configures the output side of the job: working directory, commit
     * directory and log (output) path.
     * <p>
     * With atomic commit the working directory is a randomly suffixed
     * "._WIP_" directory, placed under the configured atomic work path or,
     * when none is configured, next to the target; it must be on the same file
     * system as the target. Without atomic commit the working directory is the
     * target itself.
     *
     * @param job - Job handle
     * @throws IOException - Exception if any
     * @throws IllegalArgumentException if the atomic work path and the target
     *         path are on different file systems
     */
    private void configureOutputFormat(Job job) throws IOException {
        final Configuration jobConf = job.getConfiguration();
        final Path rawTarget = context.getTargetPath();
        final FileSystem targetFS = rawTarget.getFileSystem(jobConf);
        final Path qualifiedTarget = rawTarget.makeQualified(targetFS.getUri(), targetFS.getWorkingDirectory());

        // Fail early if attributes that must be preserved are not supported by the target.
        if (context.shouldPreserve(DistCpOptions.FileAttribute.ACL)) {
            DistCpUtils.checkFileSystemAclSupport(targetFS);
        }
        if (context.shouldPreserve(DistCpOptions.FileAttribute.XATTR)) {
            DistCpUtils.checkFileSystemXAttrSupport(targetFS);
        }

        final Path workingDir;
        if (context.shouldAtomicCommit()) {
            final Path configuredWorkPath = context.getAtomicWorkPath();
            final Path workParent = (configuredWorkPath != null) ? configuredWorkPath : qualifiedTarget.getParent();
            workingDir = new Path(workParent, WIP_PREFIX + qualifiedTarget.getName() + rand.nextInt());
            final FileSystem workFS = workingDir.getFileSystem(jobConf);
            if (!FileUtil.compareFs(targetFS, workFS)) {
                throw new IllegalArgumentException("Work path " + workingDir + " and target path " + qualifiedTarget
                        + " are in different file system");
            }
        } else {
            workingDir = qualifiedTarget;
        }
        CopyOutputFormat.setWorkingDirectory(job, workingDir);
        CopyOutputFormat.setCommitDirectory(job, qualifiedTarget);

        // Logs go to the user-supplied path, or to "_logs" inside the meta folder.
        final Path configuredLogPath = context.getLogPath();
        final Path logPath;
        if (configuredLogPath != null) {
            LOG.info("DistCp job log path: " + configuredLogPath);
            logPath = configuredLogPath;
        } else {
            logPath = new Path(metaFolder, "_logs");
        }
        CopyOutputFormat.setOutputPath(job, logPath);
    }

    /**
     * Creates the input listing by delegating to the copy-listing
     * implementation selected for the job's configuration. The job's
     * credentials are handed to that implementation (per the original note,
     * so that delegation tokens for each path end up in the job's credential
     * store).
     *
     * @param job - Handle to job
     * @return Returns the path where the copy listing is created
     * @throws IOException - If any
     */
    protected Path createInputFileListing(Job job) throws IOException {
        final Path listingPath = getFileListingPath();
        CopyListing.getCopyListing(job.getConfiguration(), job.getCredentials(), context)
                .buildListing(listingPath, context);
        return listingPath;
    }

    /**
     * Creates the input listing from a snapshot diff, using a
     * SimpleCopyListing constructed around the given sync object.
     *
     * @param job - Handle to job
     * @param distCpSync the class wraps the snapshot diff report
     * @return Returns the path where the copy listing is created
     * @throws IOException - If any
     */
    private Path createInputFileListingWithDiff(Job job, DistCpSync distCpSync) throws IOException {
        final Path listingPath = getFileListingPath();
        final CopyListing diffListing =
                new SimpleCopyListing(job.getConfiguration(), job.getCredentials(), distCpSync);
        diffListing.buildListing(listingPath, context);
        return listingPath;
    }

    /**
     * Returns the default location of the copy listing file: "fileList.seq"
     * inside the meta folder, with the resulting URI normalized.
     *
     * @return - Path where the copy listing file has to be saved
     * @throws IOException - Exception if any
     */
    protected Path getFileListingPath() throws IOException {
        final Path rawListing = new Path(metaFolder + "/fileList.seq");
        final String normalized = rawListing.toUri().normalize().toString();
        return new Path(normalized);
    }

    /**
     * Computes the path of a default working (meta) folder for the job, under
     * the job staging directory, and records it in the configuration under
     * {@link DistCpConstants#CONF_LABEL_META_FOLDER}.
     * <p>
     * The folder name is {@code "_distcp"} followed by a random integer. This
     * method only builds the path; it does not create the directory itself.
     *
     * @return Returns the working folder information
     * @throws Exception - Exception if any
     */
    private Path createMetaFolderPath() throws Exception {
        final Configuration configuration = getConf();

        // The Cluster handle is needed only to resolve the staging directory;
        // close it afterwards so that repeated calls do not leak cluster clients.
        final Cluster cluster = new Cluster(configuration);
        final Path stagingDir;
        try {
            stagingDir = JobSubmissionFiles.getStagingDir(cluster, configuration);
        } finally {
            try {
                cluster.close();
            } catch (IOException e) {
                // Best effort: a failed close must not hide the lookup result or
                // mask an exception thrown by the lookup itself.
                LOG.warn("Unable to close the cluster handle used to resolve the staging directory", e);
            }
        }

        final Path metaFolderPath = new Path(stagingDir, PREFIX + rand.nextInt());
        LOG.debug("Meta folder location: {}", metaFolderPath);
        configuration.set(DistCpConstants.CONF_LABEL_META_FOLDER, metaFolderPath.toString());
        return metaFolderPath;
    }

    /**
     * Entry point of the DistCp program. Registers a shutdown hook that cleans
     * up the meta folder, then hands the arguments to DistCp::run() through the
     * ToolRunner and exits the JVM with the resulting code.
     * @param argv Command-line arguments sent to DistCp.
     */
    public static void main(String[] argv) {
        int exitCode;
        try {
            final DistCp tool = new DistCp();
            final Cleanup cleanupHook = new Cleanup(tool);

            ShutdownHookManager.get().addShutdownHook(cleanupHook, SHUTDOWN_HOOK_PRIORITY);
            exitCode = ToolRunner.run(getDefaultConf(), tool, argv);
        } catch (Exception e) {
            LOG.error("Couldn't complete DistCp operation: ", e);
            exitCode = DistCpConstants.UNKNOWN_ERROR;
        }
        System.exit(exitCode);
    }

    /**
     * Creates a fresh configuration and adds the distcp-default.xml and
     * distcp-site.xml resources to it.
     * @return Configuration which includes properties from distcp-default.xml
     *         and distcp-site.xml
     */
    private static Configuration getDefaultConf() {
        final Configuration defaults = new Configuration();
        defaults.addResource(DISTCP_DEFAULT_XML);
        defaults.addResource(DISTCP_SITE_XML);
        return defaults;
    }

    /**
     * Deletes the meta folder (recursively) through the job file system, if
     * both are known, and then forgets the meta folder. A failure to delete is
     * logged and not propagated; in that case the meta folder reference is kept.
     * Synchronized on this object, like the setup section of
     * createAndSubmitJob().
     */
    private synchronized void cleanup() {
        if (metaFolder == null) {
            // Nothing to clean up (never set, or already cleaned).
            return;
        }
        try {
            if (jobFS != null) {
                jobFS.delete(metaFolder, true);
            }
            metaFolder = null;
        } catch (IOException e) {
            LOG.error("Unable to cleanup meta folder: " + metaFolder, e);
        }
    }

    /**
     * Tells whether the job has been submitted, i.e. whether
     * createAndSubmitJob() got past job.submit(). Consulted by the Cleanup
     * shutdown hook to decide whether the meta folder must be left in place.
     * NOTE(review): the flag is a plain (non-volatile) field read without
     * synchronization - confirm visibility from the shutdown-hook thread.
     *
     * @return true once the job has been submitted
     */
    private boolean isSubmitted() {
        return submitted;
    }

    /**
     * Shutdown hook registered by main(): removes the meta folder of a DistCp
     * instance whose job was never submitted.
     */
    private static class Cleanup implements Runnable {
        private final DistCp distCp;

        Cleanup(DistCp distCp) {
            this.distCp = distCp;
        }

        /** Cleans up unless the job has already been submitted. */
        @Override
        public void run() {
            if (!distCp.isSubmitted()) {
                distCp.cleanup();
            }
        }
    }
}