com.cloudera.cdk.data.filesystem.FileSystemMetadataProvider.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.cdk.data.filesystem.FileSystemMetadataProvider.java

Source

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetExistsException;
import com.cloudera.cdk.data.MetadataProvider;
import com.cloudera.cdk.data.MetadataProviderException;
import com.cloudera.cdk.data.impl.Accessor;
import com.cloudera.cdk.data.spi.AbstractMetadataProvider;
import com.google.common.base.Charsets;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import java.io.FileNotFoundException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;

/**
 * <p>
 * A {@link MetadataProvider} that stores dataset metadata in a Hadoop
 * {@link FileSystem}.
 * </p>
 * <p>
 * When configured with a root directory, this implementation serializes the
 * information within a {@link com.cloudera.cdk.data.DatasetDescriptor} on the provided
 * {@link FileSystem}. The descriptor is stored in a {@code .metadata}
 * directory beneath a directory named after the dataset name. For example, if
 * the dataset name is {@code logs}, the directory
 * {@code rootDirectory/logs/.metadata/} will be created when the dataset is
 * created, and the descriptor will be stored there in two files: the schema
 * in {@code schema.avsc}, and the format, location, partition expression and
 * custom properties in {@code descriptor.properties}.
 * </p>
 */
public class FileSystemMetadataProvider extends AbstractMetadataProvider {

    // Class-wide SLF4J logger.
    private static final Logger logger = LoggerFactory.getLogger(FileSystemMetadataProvider.class);

    // Directory, beneath each dataset's own directory, that holds the two
    // metadata files named below.
    private static final String METADATA_DIRECTORY = ".metadata";
    // File in the metadata directory that receives the descriptor's schema as text.
    private static final String SCHEMA_FILE_NAME = "schema.avsc";
    // java.util.Properties file in the metadata directory that holds everything
    // other than the schema.
    private static final String DESCRIPTOR_FILE_NAME = "descriptor.properties";
    // Keys used inside the descriptor properties file.
    private static final String PARTITION_EXPRESSION_FIELD_NAME = "partitionExpression";
    private static final String VERSION_FIELD_NAME = "version";
    // Value stored under VERSION_FIELD_NAME whenever a descriptor is written.
    private static final String METADATA_VERSION = "1";
    private static final String FORMAT_FIELD_NAME = "format";
    private static final String LOCATION_FIELD_NAME = "location";

    // Keys owned by this provider; when loading, any key NOT in this set is
    // handed back to the descriptor as a custom property.
    private static final Set<String> RESERVED_PROPERTIES = Sets.newHashSet(PARTITION_EXPRESSION_FIELD_NAME,
            VERSION_FIELD_NAME, FORMAT_FIELD_NAME, LOCATION_FIELD_NAME);

    // Hadoop configuration used to look up the root directory's FileSystem.
    private final Configuration conf;
    // Root under which per-dataset directories live, qualified against rootFileSystem.
    private final Path rootDirectory;

    // cache the rootDirectory's FileSystem to avoid multiple lookups
    // NOTE(review): 'transient' only matters if this class is serialized, which
    // is not visible from this file - confirm whether it is still needed.
    private transient final FileSystem rootFileSystem;

    public FileSystemMetadataProvider(Configuration conf, Path rootDirectory) {
        Preconditions.checkArgument(conf != null, "Configuration cannot be null");
        Preconditions.checkArgument(rootDirectory != null, "Root cannot be null");

        this.conf = conf;
        try {
            this.rootFileSystem = rootDirectory.getFileSystem(conf);
            this.rootDirectory = rootFileSystem.makeQualified(rootDirectory);
        } catch (IOException ex) {
            throw new MetadataProviderException("Cannot get FileSystem for root path", ex);
        }
    }

    /**
     * Rebuilds the {@link DatasetDescriptor} stored for the named dataset.
     * <p>
     * The format, partition expression, location and custom properties are
     * read from the descriptor properties file; the schema is referenced by the
     * URI of the schema file in the same metadata directory. When no location
     * was stored, the default dataset path under the root directory is used.
     * </p>
     *
     * @param name the dataset name, must not be {@code null}
     * @return the descriptor assembled from the stored metadata
     * @throws IllegalArgumentException   if {@code name} is {@code null}
     * @throws com.cloudera.cdk.data.NoSuchDatasetException if the metadata
     *                                    directory does not exist
     * @throws MetadataProviderException  if the descriptor or schema file
     *                                    cannot be read
     */
    @Override
    public DatasetDescriptor load(String name) {
        Preconditions.checkArgument(name != null, "Name cannot be null");

        logger.debug("Loading dataset metadata name:{}", name);

        final Path metadataDir = pathForMetadata(name);
        checkExists(rootFileSystem, metadataDir);

        final Properties stored = new Properties();
        final DatasetDescriptor.Builder result = new DatasetDescriptor.Builder();
        final Path descriptorFile = new Path(metadataDir, DESCRIPTOR_FILE_NAME);

        // Read the properties file; a close() failure is only swallowed when
        // the read itself has already failed.
        InputStream in = null;
        boolean failed = true;
        try {
            in = rootFileSystem.open(descriptorFile);
            stored.load(in);
            failed = false;
        } catch (IOException e) {
            throw new MetadataProviderException(
                    "Unable to load descriptor file:" + descriptorFile + " for dataset:" + name, e);
        } finally {
            try {
                Closeables.close(in, failed);
            } catch (IOException e) {
                throw new MetadataProviderException(e);
            }
        }

        final String formatName = stored.getProperty(FORMAT_FIELD_NAME);
        if (formatName != null) {
            result.format(Accessor.getDefault().newFormat(formatName));
        }

        final String partitionExpression = stored.getProperty(PARTITION_EXPRESSION_FIELD_NAME);
        if (partitionExpression != null) {
            result.partitionStrategy(Accessor.getDefault().fromExpression(partitionExpression));
        }

        final Path schemaFile = new Path(metadataDir, SCHEMA_FILE_NAME);
        try {
            final URI schemaUri = rootFileSystem.makeQualified(schemaFile).toUri();
            result.schemaUri(schemaUri);
        } catch (IOException e) {
            throw new MetadataProviderException("Unable to load schema file:" + schemaFile + " for dataset:" + name,
                    e);
        }

        // The location is normally written (and validated) by this library when
        // the descriptor is created; descriptors from older versions lack it,
        // in which case the default dataset path is used.
        final String storedLocation = stored.getProperty(LOCATION_FIELD_NAME);
        final Path location = (storedLocation != null) ? new Path(storedLocation) : pathForDataset(name);
        result.location(location);

        // Everything that is not one of this provider's own keys is a custom property.
        for (String key : stored.stringPropertyNames()) {
            if (RESERVED_PROPERTIES.contains(key)) {
                continue;
            }
            result.property(key, stored.getProperty(key));
        }

        return result.build();
    }

    /**
     * Stores metadata for a new dataset.
     * <p>
     * The returned descriptor is a copy of {@code descriptor} whose location is
     * always set: the descriptor's own location when it has one, otherwise the
     * default dataset path under the root directory.
     * </p>
     *
     * @param name        the dataset name, must not be {@code null}
     * @param descriptor  the descriptor to store, must not be {@code null}
     * @return the descriptor that was written, with its location set
     * @throws IllegalArgumentException   if either argument is {@code null}
     * @throws DatasetExistsException     if the metadata directory already
     *                                    exists
     * @throws MetadataProviderException  if the metadata directory cannot be
     *                                    created or the metadata files cannot
     *                                    be written
     */
    @Override
    public DatasetDescriptor create(String name, DatasetDescriptor descriptor) {
        Preconditions.checkArgument(name != null, "Name cannot be null");
        Preconditions.checkArgument(descriptor != null, "Descriptor cannot be null");

        logger.debug("Saving dataset metadata name:{} descriptor:{}", name, descriptor);

        final Path dataLocation;

        // If the descriptor has a location, use it.
        if (descriptor.getLocation() != null) {
            dataLocation = new Path(descriptor.getLocation());
        } else {
            dataLocation = pathForDataset(name);
        }

        final Path metadataLocation = pathForMetadata(name);

        // get a DatasetDescriptor with the location set
        DatasetDescriptor newDescriptor = new DatasetDescriptor.Builder(descriptor).location(dataLocation).build();

        try {
            if (rootFileSystem.exists(metadataLocation)) {
                throw new DatasetExistsException("Descriptor directory:" + metadataLocation + " already exists");
            }
            // Create the directory so that writeDescriptor can do the rest of
            // the work. mkdirs reports failure through its return value, so
            // check it here; otherwise the failure would only surface later as
            // a misleading "location is missing" error from writeDescriptor.
            if (!rootFileSystem.mkdirs(metadataLocation)) {
                throw new IOException("Failed to create metadata directory:" + metadataLocation);
            }
        } catch (IOException e) {
            throw new MetadataProviderException(
                    "Unable to create metadata directory:" + metadataLocation + " for dataset:" + name, e);
        }

        writeDescriptor(rootFileSystem, metadataLocation, name, newDescriptor);

        return newDescriptor;
    }

    /**
     * Overwrites the stored metadata of the named dataset with
     * {@code descriptor}.
     *
     * @param name        the dataset name, must not be {@code null}
     * @param descriptor  the new descriptor, must not be {@code null}
     * @return the same {@code descriptor} instance that was passed in
     * @throws IllegalArgumentException   if either argument is {@code null}
     * @throws com.cloudera.cdk.data.NoSuchDatasetException if the metadata
     *                                    directory does not exist
     * @throws MetadataProviderException  if the metadata files cannot be
     *                                    written
     */
    @Override
    public DatasetDescriptor update(String name, DatasetDescriptor descriptor) {
        Preconditions.checkArgument(name != null, "Name cannot be null");
        Preconditions.checkArgument(descriptor != null, "Descriptor cannot be null");

        logger.debug("Saving dataset metadata name:{} descriptor:{}", name, descriptor);

        final Path metadataLocation = pathForMetadata(name);
        writeDescriptor(rootFileSystem, metadataLocation, name, descriptor);

        return descriptor;
    }

    /**
     * Recursively removes the metadata directory of the named dataset.
     *
     * @param name the dataset name, must not be {@code null}
     * @return {@code true} if the metadata directory existed and was deleted,
     *         {@code false} if there was no metadata directory
     * @throws IllegalArgumentException   if {@code name} is {@code null}
     * @throws MetadataProviderException  if the directory cannot be checked or
     *                                    deleted
     */
    @Override
    public boolean delete(String name) {
        Preconditions.checkArgument(name != null, "Name cannot be null");

        logger.debug("Deleting dataset metadata name:{}", name);

        final Path target = pathForMetadata(name);

        try {
            // Nothing stored for this dataset: report that nothing was deleted.
            if (!rootFileSystem.exists(target)) {
                return false;
            }
            // A false return from delete is routed through the catch below so
            // it is reported like any other I/O failure.
            if (!rootFileSystem.delete(target, true)) {
                throw new IOException("Failed to delete metadata directory:" + target);
            }
            return true;
        } catch (IOException e) {
            throw new MetadataProviderException(
                    "Unable to find or delete metadata directory:" + target + " for dataset:" + name, e);
        }
    }

    /**
     * Reports whether a metadata directory is present for the named dataset.
     *
     * @param name the dataset name, must not be {@code null}
     * @return {@code true} if the dataset's metadata directory exists
     * @throws IllegalArgumentException   if {@code name} is {@code null}
     * @throws MetadataProviderException  if the file system cannot be queried
     */
    @Override
    public boolean exists(String name) {
        Preconditions.checkArgument(name != null, "Name cannot be null");

        final Path candidate = pathForMetadata(name);
        final boolean present;
        try {
            present = rootFileSystem.exists(candidate);
        } catch (IOException ex) {
            throw new MetadataProviderException("Could not check metadata path:" + candidate, ex);
        }
        return present;
    }

    /**
     * Lists the names of the datasets found directly under the root directory.
     * <p>
     * An entry is reported when it is a directory, passes the
     * {@code PathFilters.notHidden()} filter, and contains a metadata
     * directory. If the root directory does not exist yet, the result is empty.
     * </p>
     *
     * @return the dataset names; never {@code null}
     * @throws MetadataProviderException if the root directory cannot be listed
     */
    @Override
    public List<String> list() {
        List<String> datasets = Lists.newArrayList();
        try {
            FileStatus[] entries = rootFileSystem.listStatus(rootDirectory, PathFilters.notHidden());
            for (FileStatus entry : entries) {
                // Only directories that actually contain a metadata directory
                // are data sets. Use the shared constant so this check cannot
                // drift from the paths built by pathForMetadata.
                // NOTE(review): only direct children of the root are inspected,
                // while pathForDataset maps '.' in a name to nested directories,
                // so such data sets look like they would not be listed - verify.
                if (entry.isDirectory()
                        && rootFileSystem.exists(new Path(entry.getPath(), METADATA_DIRECTORY))) {
                    // may want to add a check: !RESERVED_NAMES.contains(name)
                    datasets.add(entry.getPath().getName());
                }
            }
        } catch (FileNotFoundException ex) {
            // the repo hasn't created any files yet
            return datasets;
        } catch (IOException ex) {
            throw new MetadataProviderException("Could not list data sets", ex);
        }
        return datasets;
    }

    /**
     * Gives access to the qualified root directory beneath which this provider
     * keeps its metadata.
     *
     * @return the root {@link Path} under which {@link DatasetDescriptor}s are
     *         stored
     *
     * @since 0.8.0
     */
    Path getRootDirectory() {
        return this.rootDirectory;
    }

    /**
     * Gives access to the {@link FileSystem} that was resolved for the root
     * directory and on which metadata is stored.
     *
     * @return the cached root {@link FileSystem}
     *
     * @since 0.8.0
     */
    FileSystem getFileSytem() {
        return this.rootFileSystem;
    }

    /**
     * Describes this provider by its root directory and configuration.
     *
     * @return a string containing the {@code rootDirectory} and {@code conf}
     *         values
     */
    @Override
    public String toString() {
        final Objects.ToStringHelper description = Objects.toStringHelper(this);
        description.add("rootDirectory", rootDirectory);
        description.add("conf", conf);
        return description.toString();
    }

    /**
     * Resolves the default, fully-qualified data path of the named dataset
     * under this provider's root directory.
     *
     * @param name the dataset name
     * @return the qualified dataset {@link Path}
     */
    private Path pathForDataset(String name) {
        Preconditions.checkState(rootDirectory != null, "Dataset repository root directory can not be null");

        final Path unqualified = pathForDataset(rootDirectory, name);
        return rootFileSystem.makeQualified(unqualified);
    }

    /**
     * Resolves the metadata directory of the named dataset under this
     * provider's root directory.
     *
     * @param name the dataset name
     * @return the metadata directory {@link Path}
     */
    private Path pathForMetadata(String name) {
        Preconditions.checkState(rootDirectory != null, "Dataset repository root directory can not be null");

        final Path metadataDir = pathForMetadata(rootDirectory, name);
        return metadataDir;
    }

    /**
     * Writes the contents of a {@code Descriptor} to files.
     * <p>
     * Two files are (over)written in {@code metadataLocation}: the schema file,
     * containing the descriptor's schema as UTF-8 text, and the descriptor
     * properties file, containing the metadata version, format name, location
     * (when set), partition expression (when partitioned) and all custom
     * properties.
     * </p>
     *
     * @param fs                The {@link FileSystem} where data will be stored
     * @param metadataLocation  The directory {@link Path} where metadata files
     *                          will be located
     * @param name              The {@link Dataset} name
     * @param descriptor        The {@code Descriptor} contents to write
     *
     * @throws com.cloudera.cdk.data.NoSuchDatasetException
     *                                    If the {@code metadataLocation} does not
     *                                    exist.
     * @throws MetadataProviderException  If any IOExceptions need to be
     *                                    propagated.
     */
    private static void writeDescriptor(FileSystem fs, Path metadataLocation, String name,
            DatasetDescriptor descriptor) {

        checkExists(fs, metadataLocation);

        // Each file gets its own stream variable, so that a failure while
        // opening the second file can never re-close the first file's
        // already-closed stream.
        final Path schemaPath = new Path(metadataLocation, SCHEMA_FILE_NAME);
        FSDataOutputStream schemaStream = null;
        boolean threw = true;
        try {
            schemaStream = fs.create(schemaPath, true /* overwrite */ );
            schemaStream.write(descriptor.getSchema().toString(true).getBytes(Charsets.UTF_8));
            schemaStream.flush();
            threw = false;
        } catch (IOException e) {
            throw new MetadataProviderException("Unable to save schema file:" + schemaPath + " for dataset:" + name,
                    e);
        } finally {
            try {
                // swallow a close() failure only when the write already failed
                Closeables.close(schemaStream, threw);
            } catch (IOException e) {
                throw new MetadataProviderException(e);
            }
        }

        Properties properties = new Properties();
        properties.setProperty(VERSION_FIELD_NAME, METADATA_VERSION);
        properties.setProperty(FORMAT_FIELD_NAME, descriptor.getFormat().getName());

        final URI dataLocation = descriptor.getLocation();
        if (dataLocation != null) {
            properties.setProperty(LOCATION_FIELD_NAME, dataLocation.toString());
        }

        if (descriptor.isPartitioned()) {
            properties.setProperty(PARTITION_EXPRESSION_FIELD_NAME,
                    Accessor.getDefault().toExpression(descriptor.getPartitionStrategy()));
        }

        // copy custom properties to the table
        for (String property : descriptor.listProperties()) {
            // no need to check the reserved list, those are not set on descriptors
            properties.setProperty(property, descriptor.getProperty(property));
        }

        final Path descriptorPath = new Path(metadataLocation, DESCRIPTOR_FILE_NAME);
        FSDataOutputStream descriptorStream = null;
        threw = true;
        try {
            descriptorStream = fs.create(descriptorPath, true /* overwrite */ );
            properties.store(descriptorStream, "Dataset descriptor for " + name);
            descriptorStream.flush();
            threw = false;
        } catch (IOException e) {
            throw new MetadataProviderException(
                    "Unable to save descriptor file:" + descriptorPath + " for dataset:" + name, e);
        } finally {
            try {
                Closeables.close(descriptorStream, threw);
            } catch (IOException e) {
                throw new MetadataProviderException(e);
            }
        }
    }

    /**
     * Computes the metadata directory for a dataset: the metadata directory
     * name appended to the dataset's own directory.
     *
     * @param root the root directory {@link Path}
     * @param name the dataset name
     * @return the metadata {@link Path} for the dataset
     */
    private static Path pathForMetadata(Path root, String name) {
        final Path datasetDir = pathForDataset(root, name);
        return new Path(datasetDir, METADATA_DIRECTORY);
    }

    /**
     * Computes the dataset directory for the given name beneath the given
     * root. Every {@code '.'} in the name becomes a path separator, so a
     * dotted name maps to nested directories.
     *
     * @param root the root directory {@link Path}
     * @param name the dataset name, must not be {@code null}
     * @return the dataset {@link Path}
     * @throws IllegalArgumentException if {@code name} is {@code null}
     */
    private static Path pathForDataset(Path root, String name) {
        Preconditions.checkArgument(name != null, "Dataset name cannot be null");

        // NOTE(review): open question carried over from the original author -
        // why is '.' replaced here? Possibly a namespacing hack; confirm.
        final String relative = name.replace('.', Path.SEPARATOR_CHAR);
        return new Path(root, relative);
    }

    /**
     * Precondition-style check that the metadata location of a dataset is
     * present on the file system.
     *
     * @param fs        the {@link FileSystem} expected to hold the metadata
     * @param location  the {@link Path} expected to hold the metadata
     * @throws com.cloudera.cdk.data.NoSuchDatasetException if {@code location} does not exist
     * @throws MetadataProviderException  if the existence check itself fails
     *                                    with an IOException
     */
    @SuppressWarnings("deprecation")
    private static void checkExists(FileSystem fs, Path location) {
        final boolean present;
        try {
            present = fs.exists(location);
        } catch (IOException ex) {
            throw new MetadataProviderException("Cannot access descriptor location", ex);
        }

        if (present) {
            return;
        }
        throw new com.cloudera.cdk.data.NoSuchDatasetException(
                "Descriptor location is missing: " + location);
    }

    /**
     * Fluent builder for {@link FileSystemMetadataProvider} instances: set the
     * root directory and the configuration, then call {@link #build()}.
     *
     * @since 0.8.0
     */
    public static class Builder implements Supplier<FileSystemMetadataProvider> {

        private Path root;
        private Configuration conf;

        /**
         * Sets the root directory for metadata files.
         *
         * @param path a Path to a FileSystem location
         * @return this Builder for method chaining.
         */
        public Builder rootDirectory(Path path) {
            root = path;
            return this;
        }

        /**
         * Sets the {@link Configuration} used to find the {@link FileSystem}.
         *
         * @param configuration the Hadoop configuration to use
         * @return this Builder for method chaining.
         */
        public Builder configuration(Configuration configuration) {
            conf = configuration;
            return this;
        }

        /**
         * Delegates to {@link #build()}.
         *
         * @deprecated will be removed in 0.11.0
         */
        @Override
        @Deprecated
        public FileSystemMetadataProvider get() {
            return build();
        }

        /**
         * Creates a {@link FileSystemMetadataProvider} from the configured
         * values. The provider's constructor rejects a missing configuration
         * or root directory.
         *
         * @return a new provider
         * @since 0.9.0
         */
        @SuppressWarnings("deprecation")
        public FileSystemMetadataProvider build() {
            final FileSystemMetadataProvider provider = new FileSystemMetadataProvider(conf, root);
            return provider;
        }
    }

}