org.apache.hadoop.tools.contract.AbstractContractDistCpTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.tools.contract.AbstractContractDistCpTest.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools.contract;

import static org.apache.hadoop.fs.contract.ContractTestUtils.*;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.contract.AbstractFSContractTestBase;
import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.tools.CopyListingFileStatus;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpConstants;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.mapred.CopyMapper;

import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Contract test suite covering a file system's integration with DistCp.  The
 * tests coordinate two file system instances: one "local", which is the local
 * file system, and the other "remote", which is the file system implementation
 * under test.  The tests in the suite cover both copying from local to remote
 * (e.g. a backup use case) and copying from remote to local (e.g. a restore use
 * case).
 */
public abstract class AbstractContractDistCpTest extends AbstractFSContractTestBase {

    /** Logger shared by the tests and helpers in this suite. */
    private static final Logger LOG = LoggerFactory.getLogger(AbstractContractDistCpTest.class);

    /**
     * Configuration key read by the large-file tests to choose the base
     * size, in kilobytes, of the files they generate.
     */
    public static final String SCALE_TEST_DISTCP_FILE_SIZE_KB = "scale.test.distcp.file.size.kb";

    /**
     * Value used for {@link #SCALE_TEST_DISTCP_FILE_SIZE_KB} when the
     * configuration does not set it.
     */
    public static final int DEFAULT_DISTCP_SIZE_KB = 1024;

    /** Number of bytes in a megabyte; used to size the large-file datasets. */
    protected static final int MB = 1024 * 1024;

    /**
     * JUnit rule exposing the name of the running test method; the name is
     * used to build per-test directories and in log output.
     */
    @Rule
    public TestName testName = new TestName();

    /**
     * The timeout value is extended over the default so that large updates
     * are allowed to take time, especially to remote stores.
     * @return the test timeout in milliseconds (15 minutes)
     */
    @Override
    protected int getTestTimeoutMillis() {
        // 15 minutes, expressed in milliseconds
        return 15 * 60 * 1000;
    }

    // Configuration taken from the contract in setup(); used for the local
    // filesystem, the DistCp jobs and the sequence file readers.
    private Configuration conf;
    // localFS is the local filesystem; remoteFS is the filesystem under test.
    private FileSystem localFS, remoteFS;
    // Per-test working directories on each filesystem, assigned in setup().
    private Path localDir, remoteDir;

    // Source tree paths, assigned by initInputFields().
    private Path inputDir;

    private Path inputSubDir1;

    private Path inputSubDir2;

    private Path inputSubDir4;

    private Path inputFile1;

    private Path inputFile2;

    private Path inputFile3;

    private Path inputFile4;

    private Path inputFile5;

    // Destination tree paths, assigned by initOutputFields().
    private Path outputDir;

    private Path outputSubDir1;

    private Path outputSubDir2;

    private Path outputSubDir4;

    private Path outputFile1;

    private Path outputFile2;

    private Path outputFile3;

    private Path outputFile4;

    private Path outputFile5;

    // The "inputDir" directory beneath outputDir; the output file and
    // subdirectory fields are all resolved against it.
    private Path inputDirUnderOutputDir;

    /**
     * Build the configuration for the test run, with the
     * {@code mapred.job.tracker} option set to {@code local}.
     * @return a freshly created configuration
     */
    @Override
    protected Configuration createConfiguration() {
        final Configuration jobConf = new Configuration();
        jobConf.set("mapred.job.tracker", "local");
        return jobConf;
    }

    /**
     * Bind to the local and remote filesystems and work out the per-test
     * directories on each of them. Both directories are created and then
     * recursively deleted, so every test case starts without them.
     * @throws Exception on any setup failure
     */
    @Before
    @Override
    public void setup() throws Exception {
        super.setup();
        conf = getContract().getConf();
        localFS = FileSystem.getLocal(conf);
        remoteFS = getFileSystem();

        // Each test gets its own directory, keyed by the concrete subclass
        // name and the test method name. Paths are always fully qualified
        // with their scheme rather than relying on the default file system,
        // so failure messages show which paths are local and which remote.
        final String perTestDir = getClass().getSimpleName() + "/" + testName.getMethodName();

        final Path localTestRoot = new Path(GenericTestUtils.getTestDir().toURI());
        localDir = localFS.makeQualified(new Path(localTestRoot, perTestDir + "/local"));
        mkdirs(localFS, localDir);

        remoteDir = path(perTestDir + "/remote");
        mkdirs(remoteFS, remoteDir);

        // Teardown normally cleans up, but a test debugged from an IDE may
        // never reach teardown; deleting here guarantees a clean start.
        remoteFS.delete(remoteDir, true);
        localFS.delete(localDir, true);
    }

    /**
     * Initialize every input and output path field in one call.
     * @param src root of the source tree
     * @param dest root of the destination tree
     */
    protected void initPathFields(final Path src, final Path dest) {
        // the two groups of fields are independent of each other
        initOutputFields(dest);
        initInputFields(src);
    }

    /**
     * Initialize the destination-side path fields.
     * The output layout mirrors the input layout, nested under
     * {@code outputDir/inputDir} beneath the supplied path.
     * @param path directory under which the output tree is placed
     */
    protected void initOutputFields(final Path path) {
        // directories
        outputDir = new Path(path, "outputDir");
        inputDirUnderOutputDir = new Path(outputDir, "inputDir");
        outputSubDir1 = new Path(inputDirUnderOutputDir, "subDir1");
        outputSubDir2 = new Path(inputDirUnderOutputDir, "subDir2/subDir2");
        outputSubDir4 = new Path(inputDirUnderOutputDir, "subDir4/subDir4");

        // files within those directories
        outputFile1 = new Path(inputDirUnderOutputDir, "file1");
        outputFile2 = new Path(outputSubDir1, "file2");
        outputFile3 = new Path(outputSubDir2, "file3");
        outputFile4 = new Path(outputSubDir4, "file4");
        outputFile5 = new Path(outputSubDir4, "file5");
    }

    /**
     * Initialize the source-side path fields. The same layout is shared by
     * the copy, update and track tests, which is why the paths are fields.
     * @param srcDir directory under which the input tree is placed
     */
    protected void initInputFields(final Path srcDir) {
        // directories
        inputDir = new Path(srcDir, "inputDir");
        inputSubDir1 = new Path(inputDir, "subDir1");
        inputSubDir2 = new Path(inputDir, "subDir2/subDir2");
        inputSubDir4 = new Path(inputDir, "subDir4/subDir4");

        // files within those directories
        inputFile1 = new Path(inputDir, "file1");
        inputFile2 = new Path(inputSubDir1, "file2");
        inputFile3 = new Path(inputSubDir2, "file3");
        inputFile4 = new Path(inputSubDir4, "file4");
        inputFile5 = new Path(inputSubDir4, "file5");
    }

    /**
     * Get the local filesystem bound in {@link #setup()}.
     * @return the local filesystem
     */
    protected FileSystem getLocalFS() {
        return this.localFS;
    }

    /**
     * Get the filesystem under test, bound in {@link #setup()}.
     * @return the remote filesystem
     */
    protected FileSystem getRemoteFS() {
        return this.remoteFS;
    }

    /**
     * Get the per-test directory on the local filesystem.
     * @return the local test directory
     */
    protected Path getLocalDir() {
        return this.localDir;
    }

    /**
     * Get the per-test directory on the remote filesystem.
     * @return the remote test directory
     */
    protected Path getRemoteDir() {
        return this.remoteDir;
    }

    /**
     * Copy a deep directory tree from local to remote, then run an
     * incremental update against the copied tree.
     * @throws Exception on any failure
     */
    @Test
    public void testUpdateDeepDirectoryStructureToRemote() throws Exception {
        describe("update a deep directory structure from local to remote");
        distCpDeepDirectoryStructure(localFS, localDir, remoteFS, remoteDir);
        // the path fields are only valid once the initial copy has run
        final Path updateTarget = inputDirUnderOutputDir;
        distCpUpdateDeepDirectoryStructure(updateTarget);
    }

    /**
     * Copy a deep directory tree from local to remote, then update it
     * without changing the source: at least one file must be skipped and
     * no bytes copied.
     * @throws Exception on any failure
     */
    @Test
    public void testUpdateDeepDirectoryStructureNoChange() throws Exception {
        describe("update an unchanged directory structure from local to remote; expect no copy");
        final Path copiedTo = distCpDeepDirectoryStructure(localFS, localDir, remoteFS, remoteDir);

        describe("\nExecuting Update\n");
        final Job updateJob = distCpUpdate(localDir, copiedTo);

        // something was skipped (no upper bound) and nothing was copied
        assertCounterInRange(updateJob, CopyMapper.Counter.SKIP, 1, -1);
        assertCounterInRange(updateJob, CopyMapper.Counter.BYTESCOPIED, 0, 0);
    }

    /**
     * Assert that a counter is in a range; min and max values are inclusive.
     * A negative bound disables the corresponding check.
     * @param job job to query
     * @param counter counter to examine
     * @param min min value, if negative "no minimum"
     * @param max max value, if negative "no maximum"
     * @throws IOException IO problem
     */
    void assertCounterInRange(Job job, Enum<?> counter, long min, long max) throws IOException {
        Counter c = job.getCounters().findCounter(counter);
        long value = c.getValue();
        // prefix for failure messages: the counter's display name and value
        String description = String.format("%s value %s", c.getDisplayName(), value);

        if (min >= 0) {
            assertTrue(description + " below minimum " + min, value >= min);
        }
        if (max >= 0) {
            assertTrue(description + " above maximum " + max, value <= max);
        }
    }

    /**
     * Run an incremental distcp from the local source tree to the
     * destination filesystem after modifying the source.
     * This is executed as part of
     * {@link #testUpdateDeepDirectoryStructureToRemote()}; it is designed to
     * be overridden or wrapped by subclasses which wish to add more
     * assertions.
     *
     * Life is complicated here by the way that the src/dest paths
     * of a distcp differ when -update is used.
     * @param destDir output directory used by the initial distcp
     * @return the distcp job
     * @throws Exception on any failure
     */
    protected Job distCpUpdateDeepDirectoryStructure(final Path destDir) throws Exception {
        describe("Now do an incremental update with deletion of missing files");
        final Path updateSource = inputDir;
        LOG.info("Source directory = {}, dest={}", updateSource, destDir);

        // sanity check: the source tree must be complete before it is modified
        ContractTestUtils.assertPathsExist(localFS, "Paths for test are wrong",
                inputFile1, inputFile2, inputFile3, inputFile4, inputFile5);

        modifySourceDirectories();

        final Job updateJob = distCpUpdate(updateSource, destDir);

        lsR("Updated Remote", remoteFS, destDir);

        // the file deleted from the source is gone from the destination
        ContractTestUtils.assertPathDoesNotExist(remoteFS, " deleted from " + inputFile1, outputFile1);
        // the file added to the source has been copied over
        final Path copiedNewFile = new Path(outputSubDir2, "newfile1");
        ContractTestUtils.assertIsFile(remoteFS, copiedNewFile);
        // everything else removed from the source is gone too
        ContractTestUtils.assertPathsDoNotExist(remoteFS, "DistCP should have deleted",
                outputFile3, outputFile4, outputSubDir4);

        // exactly one file copied; at least one skipped
        assertCounterInRange(updateJob, CopyMapper.Counter.COPY, 1, 1);
        assertCounterInRange(updateJob, CopyMapper.Counter.SKIP, 1, -1);
        return updateJob;
    }

    /**
     * Run {@code distcp -update srcDir destDir} with deletion of missing
     * files enabled, after logging both trees.
     * @param srcDir local source directory
     * @param destDir remote destination directory
     * @return the completed job
     * @throws Exception any failure
     */
    private Job distCpUpdate(final Path srcDir, final Path destDir) throws Exception {
        describe("\nDistcp -update from " + srcDir + " to " + destDir);
        lsR("Local to update", localFS, srcDir);
        lsR("Remote before update", remoteFS, destDir);

        final DistCpOptions.Builder updateOptions =
                new DistCpOptions.Builder(Collections.singletonList(srcDir), destDir)
                        .withDeleteMissing(true)
                        .withSyncFolder(true)
                        .withCRC(true)
                        .withOverwrite(false);
        return runDistCp(buildWithStandardOptions(updateOptions));
    }

    /**
     * Change the local source tree in the way the update and track tests
     * expect: delete file1, file3 and the whole of subDir4, then add one
     * new, empty file.
     * @return the path to the newly created file
     * @throws IOException IO failure
     */
    private Path modifySourceDirectories() throws IOException {
        // individual files
        localFS.delete(inputFile1, false);
        localFS.delete(inputFile3, false);
        // removing subdir4 recursively takes input files 4 and 5 with it
        localFS.delete(inputSubDir4, true);

        // the single addition to the source tree
        final Path addedFile = new Path(inputSubDir2, "newfile1");
        ContractTestUtils.touch(localFS, addedFile);
        return addedFile;
    }

    /**
     * Copy a deep directory tree from local to remote, modify the source,
     * then run an incremental update which tracks missing files but does
     * not delete them. Verifies that the new file is copied, that the files
     * removed from the source are retained at the destination, and that the
     * sorted source and target listings saved to the tracking directory
     * contain the expected entries.
     * @throws Exception on any failure
     */
    @Test
    public void testTrackDeepDirectoryStructureToRemote() throws Exception {
        describe("copy a deep directory structure from local to remote");

        Path destDir = distCpDeepDirectoryStructure(localFS, localDir, remoteFS, remoteDir);
        ContractTestUtils.assertIsDirectory(remoteFS, destDir);

        describe("Now do an incremental update and save of missing files");
        Path srcDir = inputDir;
        // the tracking directory lives on the local filesystem
        Path trackDir = new Path(localDir, "trackDir");

        describe("\nDirectories\n");
        lsR("Local to update", localFS, srcDir);
        lsR("Remote before update", remoteFS, destDir);

        ContractTestUtils.assertPathsExist(localFS, "Paths for test are wrong", inputFile2, inputFile3, inputFile4,
                inputFile5);

        Path inputFileNew1 = modifySourceDirectories();

        // Distcp set to track but not delete
        runDistCp(buildWithStandardOptions(
                new DistCpOptions.Builder(Collections.singletonList(srcDir), inputDirUnderOutputDir)
                        .withTrackMissing(trackDir).withSyncFolder(true).withOverwrite(false)));

        lsR("tracked update", remoteFS, destDir);
        // new file went over
        Path outputFileNew1 = new Path(outputSubDir2, "newfile1");
        ContractTestUtils.assertIsFile(remoteFS, outputFileNew1);

        ContractTestUtils.assertPathExists(localFS, "tracking directory", trackDir);

        // both sorted listings must have been saved to the tracking directory
        Path sortedSourceListing = new Path(trackDir, DistCpConstants.SOURCE_SORTED_FILE);
        ContractTestUtils.assertIsFile(localFS, sortedSourceListing);
        Path sortedTargetListing = new Path(trackDir, DistCpConstants.TARGET_SORTED_FILE);
        ContractTestUtils.assertIsFile(localFS, sortedTargetListing);
        // deletion didn't happen
        ContractTestUtils.assertPathsExist(remoteFS, "DistCP should have retained", outputFile2, outputFile3,
                outputFile4, outputSubDir4);

        // now read in the listings and see that things are there.
        Map<String, Path> sourceFiles = readSortedListing(sortedSourceListing);
        Map<String, Path> targetFiles = readSortedListing(sortedTargetListing);

        // the new file is in both lists
        assertTrue("No " + inputFileNew1 + " in source listing", sourceFiles.containsValue(inputFileNew1));
        assertTrue("No " + outputFileNew1 + " in target listing", targetFiles.containsValue(outputFileNew1));
        // the deleted source directory is still listed at the target, but
        // no longer in the source
        assertTrue("No " + outputSubDir4 + " in target listing", targetFiles.containsValue(outputSubDir4));
        assertFalse("Found " + inputSubDir4 + " in source listing", sourceFiles.containsValue(inputSubDir4));
    }

    /**
     * Read a sorted copy listing from the local configuration's filesystem
     * view into a map, logging every entry.
     * @param listing path of the sequence file holding the listing
     * @return map of listing key to the path recorded for that entry
     * @throws IOException failure to open or read the listing
     */
    private Map<String, Path> readSortedListing(final Path listing) throws IOException {
        Map<String, Path> entries = new HashMap<>(10);
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(listing))) {
            CopyListingFileStatus copyStatus = new CopyListingFileStatus();
            Text name = new Text();
            while (reader.next(name, copyStatus)) {
                String key = name.toString();
                Path path = copyStatus.getPath();
                LOG.info("{}: {}", key, path);
                entries.put(key, path);
            }
        }
        return entries;
    }

    /**
     * Log a recursive listing of the files under a directory, one entry
     * per line, in a single log event.
     * @param description text to log ahead of the listing
     * @param fs filesystem to list
     * @param dir directory to list recursively
     * @throws IOException failure to list the directory
     */
    public void lsR(final String description, final FileSystem fs, final Path dir) throws IOException {
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(dir, true);
        LOG.info("{}: {}:", description, dir);
        StringBuilder sb = new StringBuilder();
        while (files.hasNext()) {
            LocatedFileStatus status = files.next();
            // each entry starts on its own line so the listing stays readable
            sb.append(String.format("%n  %s; type=%s; length=%d", status.getPath(),
                    status.isDirectory() ? "dir" : "file", status.getLen()));
        }
        LOG.info("{}", sb);
    }

    /**
     * Copy multiple large files from the local filesystem to the remote one.
     * @throws Exception on any failure
     */
    @Test
    public void largeFilesToRemote() throws Exception {
        describe("copy multiple large files from local to remote");
        // source: local; destination: remote
        largeFiles(localFS, localDir, remoteFS, remoteDir);
    }

    /**
     * Copy a deep directory tree from the remote filesystem to the local one.
     * @throws Exception on any failure
     */
    @Test
    public void testDeepDirectoryStructureFromRemote() throws Exception {
        describe("copy a deep directory structure from remote to local");
        // source: remote; destination: local
        distCpDeepDirectoryStructure(remoteFS, remoteDir, localFS, localDir);
    }

    /**
     * Copy multiple large files from the remote filesystem to the local one.
     * @throws Exception on any failure
     */
    @Test
    public void testLargeFilesFromRemote() throws Exception {
        describe("copy multiple large files from remote to local");
        // source: remote; destination: local
        largeFiles(remoteFS, remoteDir, localFS, localDir);
    }

    /**
     * Runs a DistCp over a file system sub-tree with multiple nesting levels
     * and checks the contents of the first three files at the destination.
     * The path fields are (re)initialized from the supplied directories
     * before the source tree is created.
     *
     * @param srcFS source FileSystem
     * @param srcDir source directory
     * @param dstFS destination FileSystem
     * @param dstDir destination directory
     * @return the target directory of the copy
     * @throws Exception if there is a failure
     */
    private Path distCpDeepDirectoryStructure(FileSystem srcFS, Path srcDir, FileSystem dstFS, Path dstDir)
            throws Exception {
        initPathFields(srcDir, dstDir);

        mkdirs(srcFS, inputSubDir1);
        mkdirs(srcFS, inputSubDir2);

        // contents of the three files which are verified after the copy
        final byte[] contents1 = dataset(100, 33, 43);
        final byte[] contents2 = dataset(200, 43, 53);
        final byte[] contents3 = dataset(300, 53, 63);
        createFile(srcFS, inputFile1, true, contents1);
        createFile(srcFS, inputFile2, true, contents2);
        createFile(srcFS, inputFile3, true, contents3);
        // files 4 and 5 are created but their contents are not checked here
        createFile(srcFS, inputFile4, true, dataset(400, 53, 63));
        createFile(srcFS, inputFile5, true, dataset(500, 53, 63));

        final Path target = new Path(dstDir, "outputDir");
        runDistCp(inputDir, target);
        ContractTestUtils.assertIsDirectory(dstFS, target);
        lsR("Destination tree after distcp", dstFS, target);

        // the source directory itself is copied beneath the target
        final Path copiedTree = new Path(target, "inputDir");
        verifyFileContents(dstFS, new Path(copiedTree, "file1"), contents1);
        verifyFileContents(dstFS, new Path(copiedTree, "subDir1/file2"), contents2);
        verifyFileContents(dstFS, new Path(copiedTree, "subDir2/subDir2/file3"), contents3);
        return target;
    }

    /**
     * Runs a DistCp of three large files and verifies their contents at
     * the destination. The base size comes from the
     * {@link #SCALE_TEST_DISTCP_FILE_SIZE_KB} option, truncated to whole
     * megabytes; the files are one, two and three megabytes larger than that.
     *
     * @param srcFS source FileSystem
     * @param srcDir source directory
     * @param dstFS destination FileSystem
     * @param dstDir destination directory
     * @throws Exception if there is a failure
     */
    private void largeFiles(FileSystem srcFS, Path srcDir, FileSystem dstFS, Path dstDir) throws Exception {
        initPathFields(srcDir, dstDir);
        final Path source1 = new Path(inputDir, "file1");
        final Path source2 = new Path(inputDir, "file2");
        final Path source3 = new Path(inputDir, "file3");
        mkdirs(srcFS, inputDir);

        final int sizeKb = conf.getInt(SCALE_TEST_DISTCP_FILE_SIZE_KB, DEFAULT_DISTCP_SIZE_KB);
        // integer division: any fraction of a megabyte is dropped
        final int sizeMb = sizeKb / 1024;
        getLogger().info("{} with file size {}", testName.getMethodName(), sizeMb);

        final byte[] payload1 = dataset((sizeMb + 1) * MB, 33, 43);
        createFile(srcFS, source1, true, payload1);
        final byte[] payload2 = dataset((sizeMb + 2) * MB, 43, 53);
        createFile(srcFS, source2, true, payload2);
        final byte[] payload3 = dataset((sizeMb + 3) * MB, 53, 63);
        createFile(srcFS, source3, true, payload3);

        final Path target = new Path(dstDir, "outputDir");
        runDistCp(inputDir, target);
        ContractTestUtils.assertIsDirectory(dstFS, target);

        // the source directory itself is copied beneath the target
        final Path copiedTree = new Path(target, "inputDir");
        verifyFileContents(dstFS, new Path(copiedTree, "file1"), payload1);
        verifyFileContents(dstFS, new Path(copiedTree, "file2"), payload2);
        verifyFileContents(dstFS, new Path(copiedTree, "file3"), payload3);
    }

    /**
     * Runs DistCp from a single source path with only the standard options
     * and asserts that the job finished successfully.
     *
     * @param src source path
     * @param dst destination path
     * @throws Exception if there is a failure
     */
    private void runDistCp(Path src, Path dst) throws Exception {
        final DistCpOptions.Builder plainCopy =
                new DistCpOptions.Builder(Collections.singletonList(src), dst);
        runDistCp(buildWithStandardOptions(plainCopy));
    }

    /**
     * Execute a distcp job and assert that it completed successfully.
     * @param options distcp options
     * @return the job, which has already completed
     * @throws Exception failure
     */
    private Job runDistCp(final DistCpOptions options) throws Exception {
        final DistCp distCp = new DistCp(conf, options);
        final Job finished = distCp.execute();
        assertNotNull("Unexpected null job returned from DistCp execution.", finished);
        assertTrue("DistCp job did not complete.", finished.isComplete());
        assertTrue("DistCp job did not complete successfully.", finished.isSuccessful());
        return finished;
    }

    /**
     * Apply the options common to every distcp run in this suite, then
     * build: the number of list-status threads is set to the maximum
     * allowed.
     * @param builder DistCp option builder
     * @return the built options
     */
    private DistCpOptions buildWithStandardOptions(DistCpOptions.Builder builder) {
        final DistCpOptions.Builder withStandard =
                builder.withNumListstatusThreads(DistCpOptions.MAX_NUM_LISTSTATUS_THREADS);
        return withStandard.build();
    }

    /**
     * Creates a directory, along with any missing ancestors, and asserts
     * that the filesystem reported success.
     *
     * @param fs FileSystem in which to create directories
     * @param dir path of directory to create
     * @throws Exception if there is a failure
     */
    private static void mkdirs(FileSystem fs, Path dir) throws Exception {
        final boolean created = fs.mkdirs(dir);
        assertTrue("Failed to mkdir " + dir, created);
    }
}