org.pentaho.di.job.entries.hadooptransjobexecutor.DistributedCacheUtil.java Source code

Introduction

Here is the source code for org.pentaho.di.job.entries.hadooptransjobexecutor.DistributedCacheUtil.java
Source

/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2012 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.job.entries.hadooptransjobexecutor;

import org.apache.commons.vfs.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.VersionInfo;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.plugins.PluginFolder;
import org.pentaho.di.core.plugins.PluginFolderInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;

import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

/**
 * Utility to work with Hadoop's Distributed Cache. It stages a Kettle environment (engine, libraries and plugins)
 * into a Hadoop file system and registers the staged files with a job {@link Configuration}.
 */
public class DistributedCacheUtil {
    /**
     * Default buffer size when compressing/uncompressing files.
     */
    private static final int DEFAULT_BUFFER_SIZE = 8192;

    /**
     * Pattern to match files ending in ".jar"
     */
    private static final Pattern JAR_FILES = Pattern.compile(".*\\.jar$");
    /**
     * Pattern to match all files that are not in the lib/ directory. Matches any string that does not contain /lib
     */
    private static final Pattern NOT_LIB_FILES = Pattern.compile("^((?!/lib).)*$");

    /**
     * Default permission for cached files
     * <p/>
     * Not using FsPermission.createImmutable due to EOFExceptions when using it with Hadoop 0.20.2
     */
    private static final FsPermission CACHED_FILE_PERMISSION = new FsPermission((short) 0755);

    /**
     * Name of the Big Data Plugin folder
     */
    public static final String PENTAHO_BIG_DATA_PLUGIN_FOLDER_NAME = "pentaho-big-data-plugin";

    /**
     * Creates the path to a lock file within the provided directory
     *
     * @param dir Directory to generate lock file path within
     * @return Path to lock file (named ".lock") within {@code dir}
     */
    public Path getLockFileAt(Path dir) {
        return new Path(dir, ".lock");
    }

    /**
     * This validates that the Kettle Environment is installed. "Installed" means the directories {@code lib},
     * {@code plugins} and {@code plugins/pentaho-big-data-plugin} exist below {@code root} and no lock file
     * (see {@link #getLockFileAt(Path)}) is present there.
     *
     * @param fs   File System to check for the Kettle Environment in
     * @param root Root path the Kettle Environment should reside within
     * @return True if the Kettle Environment is installed at {@code root}.
     * @throws IOException Error investigating installation
     */
    public boolean isKettleEnvironmentInstalledAt(FileSystem fs, Path root) throws IOException {
        Path pluginsDir = new Path(root, "plugins");
        // These directories must exist
        Path[] requiredDirectories = new Path[] { new Path(root, "lib"), pluginsDir,
                new Path(pluginsDir, PENTAHO_BIG_DATA_PLUGIN_FOLDER_NAME) };
        for (Path dir : requiredDirectories) {
            if (!fs.exists(dir) || !fs.getFileStatus(dir).isDir()) {
                return false;
            }
        }
        // A lock file marks an installation that is in progress or failed part way through
        return !fs.exists(getLockFileAt(root));
    }

    /**
     * Installs the Kettle environment into {@code destination}: the archive is extracted to a local temporary
     * directory and staged into {@code destination}, then the Big Data plugin and any additional plugin directories
     * are staged into {@code destination/plugins}. Existing content at {@code destination} is overwritten.
     * <p/>
     * A lock file is kept in {@code destination} while the installation runs and is only removed on success, so
     * {@link #isKettleEnvironmentInstalledAt(FileSystem, Path)} reports a failed (partial) installation as not
     * installed.
     *
     * @param pmrArchive                  Zip archive containing the environment to install. Required.
     * @param fs                          File system to install into
     * @param destination                 Directory within {@code fs} to install into. Required.
     * @param bigDataPlugin               Local folder of the Big Data plugin. Required.
     * @param additionalPluginDirectories Additional local plugin folders to stage. May be {@code null} or empty.
     * @throws NullPointerException when {@code pmrArchive}, {@code destination} or {@code bigDataPlugin} is null
     * @throws IOException          Error writing to the file system
     * @throws KettleFileException  Error extracting the archive or locating a plugin directory
     */
    public void installKettleEnvironment(FileObject pmrArchive, FileSystem fs, Path destination,
            FileObject bigDataPlugin, List<FileObject> additionalPluginDirectories)
            throws IOException, KettleFileException {
        if (pmrArchive == null) {
            throw new NullPointerException("pmrArchive is required");
        }
        if (destination == null) {
            throw new NullPointerException("destination is required");
        }
        if (bigDataPlugin == null) {
            throw new NullPointerException("big data plugin required");
        }

        FileObject extracted = extractToTemp(pmrArchive);
        try {
            // Write lock file while we're installing. Close the returned stream right away; only the file's
            // existence matters.
            Path lockFile = getLockFileAt(destination);
            fs.create(lockFile, true).close();

            stageForCache(extracted, fs, destination, true);

            // stageForCache(..., overwrite = true) deletes destination recursively, which takes the lock file
            // with it. Put the lock back so a failure while staging plugins still leaves the installation locked.
            fs.create(lockFile, true).close();

            List<FileObject> pluginsDirectories = new ArrayList<FileObject>();
            pluginsDirectories.add(bigDataPlugin);
            if (additionalPluginDirectories != null && !additionalPluginDirectories.isEmpty()) {
                pluginsDirectories.addAll(additionalPluginDirectories);
            }

            stagePluginsForCache(fs, new Path(destination, "plugins"), true, pluginsDirectories);

            // Delete the lock file now that we're done. It is intentional that we're not doing this in a
            // try/finally. If the staging fails for some reason we require the user to forcibly overwrite the
            // (partial) installation
            fs.delete(lockFile, true);
        } finally {
            // The extracted copy is only needed while staging; remove it so repeated installs do not pile up
            // in the temp directory.
            try {
                deleteDirectory(extracted);
            } catch (FileSystemException ignored) {
                // Best effort: failing to remove a temp directory must not hide the outcome of the installation
            }
        }
    }

    /**
     * Stages each local plugin directory into {@code pluginsDir}, creating {@code pluginsDir} if necessary. Every
     * plugin is copied to a directory named after the base name of its local folder.
     *
     * @param fs                File system to stage the plugins into
     * @param pluginsDir        Directory within {@code fs} that receives the plugin directories
     * @param overwrite         If false, plugins whose target directory already exists are skipped. If true, they
     *                          are replaced.
     * @param pluginDirectories Local plugin folders to stage. Required.
     * @throws IllegalArgumentException when {@code pluginDirectories} is null
     * @throws KettleFileException      A local plugin directory does not exist
     * @throws IOException              Error writing to the file system
     */
    public void stagePluginsForCache(FileSystem fs, Path pluginsDir, boolean overwrite,
            List<FileObject> pluginDirectories) throws KettleFileException, IOException {
        if (pluginDirectories == null) {
            throw new IllegalArgumentException("plugins required");
        }
        if (!fs.exists(pluginsDir)) {
            fs.mkdirs(pluginsDir);
        }
        for (FileObject localPluginDir : pluginDirectories) {
            if (!localPluginDir.exists()) {
                throw new KettleFileException(BaseMessages.getString(DistributedCacheUtil.class,
                        "DistributedCacheUtil.PluginDirectoryNotFound", localPluginDir));
            }
            Path pluginDir = new Path(pluginsDir, localPluginDir.getName().getBaseName());
            if (!overwrite && fs.exists(pluginDir)) {
                // skip installing this plugin, it already exists
                continue;
            }
            stageForCache(localPluginDir, fs, pluginDir, true);
        }
    }

    /**
     * Configure the provided configuration to use the Distributed Cache and include all files in {@code kettleInstallDir}.
     * All jar files in lib/ will be added to the classpath.
     *
     * @param conf             Configuration to update
     * @param fs               File system to load Kettle Environment installation from
     * @param kettleInstallDir Directory that contains the Kettle installation to use in the file system provided
     * @throws KettleFileException
     * @throws IOException
     */
    public void configureWithKettleEnvironment(Configuration conf, FileSystem fs, Path kettleInstallDir)
            throws KettleFileException, IOException {
        Path libDir = new Path(kettleInstallDir, "lib");
        List<Path> libraryJars = findFiles(fs, libDir, JAR_FILES);
        addCachedFilesToClasspath(libraryJars, conf);

        List<Path> nonLibFiles = new ArrayList<Path>();
        for (Path child : findFiles(fs, kettleInstallDir, null)) {
            // Apply the pattern only to the part of the path below the install directory. Matching the complete
            // path would reject every entry whenever the install directory (or the host name in a qualified
            // path) happens to contain "/lib".
            if (NOT_LIB_FILES.matcher("/" + child.getName()).matches()) {
                nonLibFiles.add(child);
            }
        }
        addCachedFiles(nonLibFiles, conf);
    }

    /**
     * Register a list of files from a Hadoop file system to be available and placed on the classpath when the configuration
     * is used to submit Hadoop jobs
     *
     * @param files Paths to add to the classpath of the configuration provided
     * @param conf  Configuration to modify
     * @throws IOException
     */
    public void addCachedFilesToClasspath(List<Path> files, Configuration conf) throws IOException {
        DistributedCache.createSymlink(conf);
        for (Path file : files) {
            // We need to disqualify the path so Distributed Cache in 0.20.2 can properly add the resources to
            // the classpath: https://issues.apache.org/jira/browse/MAPREDUCE-752
            addFileToClassPath(disqualifyPath(file), conf);
        }
    }

    /**
     * Add a file path to the current set of classpath entries. It adds the file
     * to cache as well.
     *
     * This is copied from Hadoop 0.20.2 o.a.h.filecache.DistributedCache so we can inject the correct path separator
     * for the environment the cluster is executing in. See {@link #getClusterPathSeparator()}.
     *
     * @param file Path of the file to be added
     * @param conf Configuration that contains the classpath setting
     * @throws IOException Error obtaining the file system for {@code conf}
     */
    public void addFileToClassPath(Path file, Configuration conf) throws IOException {

        // TODO Replace this with a Hadoop shim if we end up having version-specific implementations scattered around
        if (VersionInfo.getVersion().contains("0.21")) {
            DistributedCache.addFileToClassPath(file, conf);
        } else {
            // Append to the existing classpath entries using the cluster's separator rather than the local one
            String classpath = conf.get("mapred.job.classpath.files");
            conf.set("mapred.job.classpath.files",
                    classpath == null ? file.toString() : classpath + getClusterPathSeparator() + file.toString());
            FileSystem fs = FileSystem.get(conf);
            URI uri = fs.makeQualified(file).toUri();

            DistributedCache.addCacheFile(uri, conf);
        }
    }

    /**
     * Register a list of paths from a Hadoop file system to be available when the configuration is used to submit Hadoop jobs
     *
     * @param paths Paths to add to the list of cached paths for the configuration provided
     * @param conf  Configuration to modify
     * @throws IOException
     */
    public void addCachedFiles(List<Path> paths, Configuration conf) throws IOException {
        DistributedCache.createSymlink(conf);
        for (Path path : paths) {
            // Build a URI and set the path's short name in the fragment so the file is copied properly
            DistributedCache.addCacheFile(URI.create(path.toUri() + "#" + path.getName()), conf);
        }
    }

    /**
     * Removes the schema, host, and authentication portion of a path in its URI.
     *
     * @param path Path to cleanse
     * @return New path relative to the root of the filesystem
     */
    public Path disqualifyPath(Path path) {
        return new Path(path.toUri().getPath());
    }

    /**
     * Stages the source file or folder to a Hadoop file system and sets the permission and replication value of
     * {@code dest} to be used with the Distributed Cache. WARNING: When {@code overwrite} is true this will delete
     * the contents of dest before staging.
     *
     * @param source    File or folder to copy to the file system. If it is a folder all contents will be copied into dest.
     * @param fs        Hadoop file system to store the contents of the archive in
     * @param dest      Destination to copy source into. If source is a file, the new file name will be exactly dest. If source
     *                  is a folder its contents will be copied into dest. For more info see
     *                  {@link FileSystem#copyFromLocalFile(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path)}.
     * @param overwrite Should an existing file or folder be overwritten? If not an exception will be thrown.
     * @throws IOException         Error accessing the file system
     * @throws KettleFileException Source does not exist or destination exists and overwrite is false.
     */
    public void stageForCache(FileObject source, FileSystem fs, Path dest, boolean overwrite)
            throws IOException, KettleFileException {
        if (!source.exists()) {
            throw new KettleFileException(BaseMessages.getString(DistributedCacheUtil.class,
                    "DistributedCacheUtil.SourceDoesNotExist", source));
        }

        if (fs.exists(dest)) {
            if (!overwrite) {
                throw new KettleFileException(BaseMessages.getString(DistributedCacheUtil.class,
                        "DistributedCacheUtil.DestinationExists", dest.toUri().getPath()));
            }
            // Clear out whatever is there (recursively) before copying
            fs.delete(dest, true);
        }

        // Use the same replication we'd use for submitting jobs
        short replication = (short) fs.getConf().getInt("mapred.submit.replication", 10);

        Path local = new Path(source.getURL().getPath());
        fs.copyFromLocalFile(local, dest);
        fs.setPermission(dest, CACHED_FILE_PERMISSION);
        fs.setReplication(dest, replication);
    }

    /**
     * Recursively searches for all files starting at the directory provided with the extension provided. If no extension
     * is provided everything the search visits is returned.
     *
     * @param root      Directory to start the search for files in
     * @param extension File extension to search for. If null no filtering by extension is applied.
     * @return List of absolute path names to all files found in {@code root} and its subdirectories. Empty if the
     *         search yields nothing.
     * @throws FileSystemException Error searching {@code root} or converting a result to a path
     */
    public List<String> findFiles(FileObject root, final String extension) throws FileSystemException {
        FileObject[] files = root.findFiles(new FileSelector() {
            @Override
            public boolean includeFile(FileSelectInfo fileSelectInfo) throws Exception {
                return extension == null || extension.equals(fileSelectInfo.getFile().getName().getExtension());
            }

            @Override
            public boolean traverseDescendents(FileSelectInfo fileSelectInfo) throws Exception {
                return FileType.FOLDER.equals(fileSelectInfo.getFile().getType());
            }
        });

        if (files == null) {
            return Collections.<String>emptyList();
        }

        List<String> paths = new ArrayList<String>(files.length);
        for (FileObject file : files) {
            try {
                paths.add(file.getURL().toURI().getPath());
            } catch (URISyntaxException ex) {
                // Keep the original exception as the cause so the offending URL syntax is not lost
                throw new FileSystemException("Error getting URI of file: " + file.getURL().getPath(), ex);
            }
        }
        return paths;
    }

    /**
     * Looks for all files in the path within the given file system that match the pattern provided. Only the direct
     * descendants of {@code path} will be evaluated; this is not recursive.
     * <p/>
     * Note that the pattern is matched against the complete string form of each entry's path, not just its name.
     *
     * @param fs File system to search within
     * @param path Path to search in
     * @param fileNamePattern Pattern to match. If {@code null}, all files will be matched.
     * @return All {@link Path}s that match the provided pattern. Empty if the file system returns no listing for
     *         {@code path}.
     * @throws IOException Error retrieving listing status of a path from the file system
     */
    public List<Path> findFiles(FileSystem fs, Path path, Pattern fileNamePattern) throws IOException {
        FileStatus[] files = fs.listStatus(path);
        if (files == null) {
            // Guard against file systems that answer with null instead of an (empty) array, e.g. for a missing path
            return new ArrayList<Path>();
        }
        List<Path> found = new ArrayList<Path>(files.length);
        for (FileStatus file : files) {
            if (fileNamePattern == null || fileNamePattern.matcher(file.getPath().toString()).matches()) {
                found.add(file.getPath());
            }
        }
        return found;
    }

    /**
     * Delete a directory and all of its contents
     *
     * @param dir Directory to delete
     * @return True if the directory no longer exists after the delete
     * @throws FileSystemException Error deleting the directory or checking for its existence
     */
    public boolean deleteDirectory(FileObject dir) throws FileSystemException {
        dir.delete(new AllFileSelector());
        return !dir.exists();
    }

    /**
     * Extract a zip archive to a temp directory.
     *
     * @param archive Zip archive to extract
     * @return Directory the zip was extracted into
     * @throws NullPointerException when {@code archive} is null
     * @throws IOException
     * @throws KettleFileException
     * @see DistributedCacheUtil#extract(org.apache.commons.vfs.FileObject, org.apache.commons.vfs.FileObject)
     */
    public FileObject extractToTemp(FileObject archive) throws IOException, KettleFileException {
        if (archive == null) {
            throw new NullPointerException("archive is required");
        }
        // Ask KettleVFS for a temporary file name without extension and use that as our temporary folder to extract into
        FileObject dest = KettleVFS.createTempFile("", "", System.getProperty("java.io.tmpdir"));
        return extract(archive, dest);
    }

    /**
     * Extract a zip archive to a directory. If anything goes wrong while extracting, the destination directory is
     * removed again before the error is reported.
     *
     * @param archive Zip archive to extract
     * @param dest    Destination directory. This must not exist!
     * @return Directory the zip was extracted into
     * @throws IllegalArgumentException when the archive file does not exist or the destination directory already exists
     * @throws IOException
     * @throws KettleFileException Error extracting the archive, including an entry whose name contains a ".." segment
     */
    public FileObject extract(FileObject archive, FileObject dest) throws IOException, KettleFileException {
        if (!archive.exists()) {
            throw new IllegalArgumentException("archive does not exist: " + archive.getURL().getPath());
        }

        if (dest.exists()) {
            throw new IllegalArgumentException("destination already exists");
        }
        dest.createFolder();

        try {
            byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
            ZipInputStream zis = new ZipInputStream(archive.getContent().getInputStream());
            try {
                ZipEntry ze;
                while ((ze = zis.getNextEntry()) != null) {
                    // Refuse entries that could resolve to a location outside of dest ("zip slip")
                    if (containsParentSegment(ze.getName())) {
                        throw new IOException("Zip entry would be extracted outside of the destination: "
                                + ze.getName());
                    }
                    FileObject entry = KettleVFS.getFileObject(dest + Const.FILE_SEPARATOR + ze.getName());

                    if (ze.isDirectory()) {
                        entry.createFolder();
                        continue;
                    }

                    OutputStream os = KettleVFS.getOutputStream(entry, false);
                    try {
                        int len;
                        while ((len = zis.read(buffer)) > 0) {
                            os.write(buffer, 0, len);
                        }
                    } finally {
                        os.close();
                    }
                }
            } finally {
                zis.close();
            }
        } catch (Exception ex) {
            // Try to clean up the temp directory and all files. A failure during clean up must not replace the
            // original cause, so it is folded into the "could not clean up" case.
            boolean cleanedUp;
            try {
                cleanedUp = deleteDirectory(dest);
            } catch (FileSystemException cleanupFailure) {
                cleanedUp = false;
            }
            if (!cleanedUp) {
                throw new KettleFileException("Could not clean up temp dir after error extracting", ex);
            }
            throw new KettleFileException("error extracting archive", ex);
        }

        return dest;
    }

    /**
     * Checks whether a zip entry name contains a ".." path segment, using either "/" or "\" as separator.
     *
     * @param entryName Name of the zip entry
     * @return True if any segment of {@code entryName} is exactly ".."
     */
    private static boolean containsParentSegment(String entryName) {
        for (String segment : entryName.split("[/\\\\]")) {
            if ("..".equals(segment)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Attempts to find a plugin's installation folder on disk within all known plugin folder locations
     *
     * @param pluginFolderName Name of plugin folder
     * @return Location of the first plugin folder found as a direct descendant of one of the known plugin folder
     *         locations, or {@code null} if none is found
     * @throws KettleFileException Error getting plugin folders
     */
    public FileObject findPluginFolder(final String pluginFolderName) throws KettleFileException {
        List<PluginFolderInterface> pluginFolders = PluginFolder.populateFolders(null);
        if (pluginFolders != null) {
            for (PluginFolderInterface pluginFolder : pluginFolders) {
                FileObject folder = KettleVFS.getFileObject(pluginFolder.getFolder());

                try {
                    if (folder.exists()) {
                        FileObject[] files = folder.findFiles(new FileSelector() {
                            @Override
                            public boolean includeFile(FileSelectInfo fileSelectInfo) throws Exception {
                                if (fileSelectInfo.getFile().equals(fileSelectInfo.getBaseFolder())) {
                                    // Do not consider the base folders
                                    return false;
                                }
                                // Determine relative name to compare: strip the base folder's path plus the
                                // separator that follows it
                                int baseNameLength = fileSelectInfo.getBaseFolder().getName().getPath().length()
                                        + 1;
                                String relativeName = fileSelectInfo.getFile().getName().getPath()
                                        .substring(baseNameLength);
                                // Compare plugin folder name with the relative name
                                return pluginFolderName.equals(relativeName);
                            }

                            @Override
                            public boolean traverseDescendents(FileSelectInfo fileSelectInfo) throws Exception {
                                return true;
                            }
                        });
                        if (files != null && files.length > 0) {
                            return files[0]; // Return the first match
                        }
                    }
                } catch (FileSystemException ex) {
                    throw new KettleFileException("Error searching for folder '" + pluginFolderName + "'", ex);
                }
            }
        }
        return null;
    }

    /**
     * Determine the class path separator of the cluster. For now there is no way to determine the remote cluster's path
     * separator nor would we want to ultimately do that. This can be configured externally with the system property
     * "hadoop.cluster.path.separator". This will default to ":" if the system property is not set.
     *
     * This is not necessary for Hadoop 0.21.x. See https://issues.apache.org/jira/browse/HADOOP-4864.
     *
     * @return Path separator to use when building up the classpath to use for the Distributed Cache
     */
    public String getClusterPathSeparator() {
        return System.getProperty("hadoop.cluster.path.separator", ":");
    }
}