com.inmobi.conduit.distcp.tools.CopyListing.java Source code

Java tutorial

Introduction

Here is the source code for com.inmobi.conduit.distcp.tools.CopyListing.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.inmobi.conduit.distcp.tools;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;

import com.inmobi.conduit.distcp.tools.util.DistCpUtils;

import java.io.IOException;

/**
 * Base abstraction describing how DistCp assembles the list of sources and
 * targets for its copy function.
 * <p>
 * The copy-listing is expected to be a {@code SequenceFile<Text, FileStatus>}
 * stored at the path handed to {@link #buildListing(Path, DistCpOptions)}.
 * Each record is a pair of (source relative path, source file status), all
 * the paths being fully qualified.
 */
public abstract class CopyListing extends Configured {

    /** Store on which FS delegation tokens are cached; may be null. */
    private Credentials credentials;

    /**
     * Protected constructor that records the configuration and the credentials store.
     *
     * @param configuration the input configuration, with which the
     *                      source/target FileSystems may be accessed
     * @param credentials   Credentials object on which the FS delegation tokens
     *                      are cached; if null, delegation token caching is skipped
     */
    protected CopyListing(Configuration configuration, Credentials credentials) {
        setConf(configuration);
        setCredentials(credentials);
    }

    /**
     * Public factory method that selects the CopyListing implementation matching the options.
     * <p>
     * A simple-file listing is chosen when the options ask for it; otherwise a
     * file-based listing is chosen when a source file listing is set, and a
     * globbed listing when it is not.
     *
     * @param configuration the input configuration
     * @param credentials   Credentials object on which the FS delegation tokens are cached
     * @param options       the input options used to choose the implementation
     * @return an instance of the appropriate CopyListing implementation
     */
    public static CopyListing getCopyListing(Configuration configuration, Credentials credentials,
            DistCpOptions options) {
        if (options.isUseSimpleFileListing()) {
            return new SimpleFileBasedCopyListing(configuration, credentials);
        }
        if (options.getSourceFileListing() != null) {
            return new FileBasedCopyListing(configuration, credentials);
        }
        return new GlobbedCopyListing(configuration, credentials);
    }

    /**
     * Creates the input listing that distcp uses to perform the copy.
     * <p>
     * The steps run in this order: the paths are validated, the listing is
     * written by {@link #doBuildListing(Path, DistCpOptions)}, the listing
     * path together with the byte and record totals is stored in the
     * configuration, and finally the listing is checked for duplicates.
     * <p>
     * The listing is a sequence file that has the relative path of a file in
     * the key and the file status information of the source file in the value.
     * For instance, if the source path is /tmp/data and the traversed path is
     * /tmp/data/dir1/dir2/file1, then the sequence file would contain
     * <pre>
     * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
     * </pre>
     * The file also contains directory entries. Meaning, if
     * /tmp/data/dir1/dir2/file1 is the only file under /tmp/data, the
     * resulting sequence file would contain the following entries
     * <pre>
     * key: /dir1 and value: FileStatus(/tmp/data/dir1)
     * key: /dir1/dir2 and value: FileStatus(/tmp/data/dir1/dir2)
     * key: /dir1/dir2/file1 and value: FileStatus(/tmp/data/dir1/dir2/file1)
     * </pre>
     * Cases requiring special handling: if the source path is a file
     * (/tmp/file1), the contents of the listing will be as follows
     * <pre>
     * TARGET DOES NOT EXIST: Key-"", Value-FileStatus(/tmp/file1)
     * TARGET IS FILE       : Key-"", Value-FileStatus(/tmp/file1)
     * TARGET IS DIR        : Key-"/file1", Value-FileStatus(/tmp/file1)
     * </pre>
     *
     * @param pathToListFile output file where the listing would be stored
     * @param options        input options to distcp
     * @throws IOException exception if any
     */
    public final void buildListing(Path pathToListFile, DistCpOptions options) throws IOException {
        validatePaths(options);
        doBuildListing(pathToListFile, options);

        // Fetch the configuration only after the listing has been built, then
        // record where the listing lives and how much work it represents.
        final Configuration conf = getConf();
        conf.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, pathToListFile.toString());
        conf.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, getBytesToCopy());
        conf.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, getNumberOfPaths());

        checkForDuplicates(pathToListFile);
    }

    /**
     * Validates the input and output paths.
     *
     * @param options input options
     * @throws InvalidInputException if inputs are invalid
     * @throws IOException           on any exception with the FS
     */
    protected abstract void validatePaths(DistCpOptions options) throws IOException, InvalidInputException;

    /**
     * Hook implemented by sub-classes to create the source/target file listing.
     *
     * @param pathToListFile path on HDFS where the listing file is written
     * @param options        input options for DistCp (indicating source/target paths)
     * @throws IOException thrown on failure to create the listing file
     */
    protected abstract void doBuildListing(Path pathToListFile, DistCpOptions options) throws IOException;

    /**
     * Returns the total bytes that distcp should copy for the source paths.
     * This does not consider whether a file is the same and should be skipped
     * during copy.
     *
     * @return total bytes to copy
     */
    protected abstract long getBytesToCopy();

    /**
     * Returns the total number of paths to distcp, directories included.
     * This does not consider whether a file/dir is already present and should
     * be skipped during copy.
     *
     * @return total number of paths to distcp
     */
    protected abstract long getNumberOfPaths();

    /**
     * Validates the final path listing to see if there are any duplicate entries.
     * <p>
     * The listing is first sorted through {@code DistCpUtils.sortListing}; the
     * sorted file is then read sequentially and every key is compared with the
     * key of the record read just before it.
     *
     * @param pathToListFile path listing built by doBuildListing
     * @throws IOException            on any issue while checking for duplicates
     * @throws DuplicateFileException if there are duplicates
     */
    protected void checkForDuplicates(Path pathToListFile) throws DuplicateFileException, IOException {

        final Configuration conf = getConf();
        final FileSystem listingFs = pathToListFile.getFileSystem(conf);
        final Path sortedListing = DistCpUtils.sortListing(listingFs, conf, pathToListFile);

        final SequenceFile.Reader sortedReader = new SequenceFile.Reader(listingFs, sortedListing, conf);
        try {
            // Sentinel for "no previous record": source relative path can never hold *
            final Text previousKey = new Text("*");
            final FileStatus previousStatus = new FileStatus();
            final Text key = new Text();

            while (sortedReader.next(key)) {
                if (!key.equals(previousKey)) {
                    // New key: remember this record for the next comparison.
                    sortedReader.getCurrentValue(previousStatus);
                    previousKey.set(key);
                    continue;
                }

                // Same key as the preceding record: report both source paths.
                final FileStatus duplicateStatus = new FileStatus();
                sortedReader.getCurrentValue(duplicateStatus);
                throw new DuplicateFileException("File " + previousStatus.getPath() + " and "
                        + duplicateStatus.getPath() + " would cause duplicates. Aborting");
            }
        } finally {
            IOUtils.closeStream(sortedReader);
        }
    }

    /**
     * Sets the Credentials store on which FS delegation tokens will be cached.
     *
     * @param credentials Credentials object
     */
    protected void setCredentials(Credentials credentials) {
        this.credentials = credentials;
    }

    /**
     * Gets the credentials used to update the delegation tokens for accessed FS objects.
     *
     * @return Credentials object
     */
    protected Credentials getCredentials() {
        return credentials;
    }

    /** Thrown by {@link #checkForDuplicates(Path)} when two listing records share a key. */
    static class DuplicateFileException extends RuntimeException {
        public DuplicateFileException(String message) {
            super(message);
        }
    }

    /** Declared by {@link #validatePaths(DistCpOptions)} for inputs that are invalid. */
    static class InvalidInputException extends RuntimeException {
        public InvalidInputException(String message) {
            super(message);
        }
    }
}