com.cloudera.cdk.data.filesystem.FileSystemDataset.java Source code

Java tutorial

Introduction

Here is the source code for the class com.cloudera.cdk.data.filesystem.FileSystemDataset (file FileSystemDataset.java).

Source

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.Dataset;
import com.cloudera.cdk.data.DatasetDescriptor;
import com.cloudera.cdk.data.DatasetException;
import com.cloudera.cdk.data.DatasetReader;
import com.cloudera.cdk.data.DatasetWriter;
import com.cloudera.cdk.data.FieldPartitioner;
import com.cloudera.cdk.data.spi.Marker;
import com.cloudera.cdk.data.PartitionKey;
import com.cloudera.cdk.data.PartitionStrategy;
import com.cloudera.cdk.data.View;
import com.cloudera.cdk.data.impl.Accessor;
import com.cloudera.cdk.data.spi.AbstractDataset;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.io.IOException;
import java.util.List;

class FileSystemDataset<E> extends AbstractDataset<E> {

    private static final Logger logger = LoggerFactory.getLogger(FileSystemDataset.class);

    // File system on which the dataset directory is listed, created and deleted.
    private final FileSystem fileSystem;
    // Root directory of this dataset; partition directories are resolved beneath it.
    private final Path directory;
    // Dataset name, reported by getName() and used in log and error messages.
    private final String name;
    // Descriptor handed to the constructor; also the source of the partition strategy.
    private final DatasetDescriptor descriptor;
    // Only set to a non-null value by the deprecated constructor; slated for removal in 0.11.0.
    private PartitionKey partitionKey;

    // Strategy taken from the descriptor, or null when the descriptor is not partitioned.
    private final PartitionStrategy partitionStrategy;

    // View constructed over this dataset; the reader/writer, deleteAll and
    // Marker-range methods of this class all delegate to it.
    private final FileSystemView<E> unbounded;

    // reusable path converter, has no relevant state
    private final PathConversion convert;

    /**
     * Creates a dataset stored under {@code directory} on {@code fileSystem}.
     *
     * @param fileSystem file system that stores the dataset
     * @param directory  root directory of the dataset
     * @param name       dataset name
     * @param descriptor dataset descriptor; its partition strategy is kept
     *                   only when the descriptor reports it is partitioned
     */
    FileSystemDataset(FileSystem fileSystem, Path directory, String name, DatasetDescriptor descriptor) {
        this.fileSystem = fileSystem;
        this.directory = directory;
        this.name = name;
        this.descriptor = descriptor;

        if (descriptor.isPartitioned()) {
            this.partitionStrategy = descriptor.getPartitionStrategy();
        } else {
            this.partitionStrategy = null;
        }

        this.convert = new PathConversion();

        // The view receives `this`, so it is built only after the fields above are assigned.
        this.unbounded = new FileSystemView<E>(this);

        // remove this.partitionKey for 0.11.0
        this.partitionKey = null;
    }

    /**
     * Creates a dataset exactly like the four-argument constructor and then
     * records the given partition key.
     *
     * @param partitionKey key to record for this dataset, may be {@code null}
     * @deprecated will be removed in 0.11.0
     */
    @Deprecated
    FileSystemDataset(
            FileSystem fileSystem,
            Path directory,
            String name,
            DatasetDescriptor descriptor,
            @Nullable PartitionKey partitionKey) {
        this(fileSystem, directory, name, descriptor);

        // The delegated constructor resets the key to null; overwrite it here.
        this.partitionKey = partitionKey;
    }

    /**
     * Returns the dataset name given at construction time.
     */
    @Override
    public String getName() {
        return this.name;
    }

    /**
     * Returns the descriptor given at construction time.
     */
    @Override
    public DatasetDescriptor getDescriptor() {
        return this.descriptor;
    }

    /**
     * Returns the partition key recorded by the deprecated constructor, or
     * {@code null} when none was supplied.
     *
     * @deprecated will be removed in 0.11.0
     */
    @Deprecated
    PartitionKey getPartitionKey() {
        return this.partitionKey;
    }

    /**
     * Returns the file system this dataset was constructed with.
     */
    FileSystem getFileSystem() {
        return this.fileSystem;
    }

    /**
     * Returns the root directory this dataset was constructed with.
     */
    Path getDirectory() {
        return this.directory;
    }

    /**
     * Logs the request at debug level and returns a writer obtained from the
     * unbounded view of this dataset.
     */
    @Override
    public DatasetWriter<E> newWriter() {
        logger.debug("Getting writer to dataset:{}", this);
        final DatasetWriter<E> writer = unbounded.newWriter();
        return writer;
    }

    /**
     * Logs the request at debug level and returns a reader obtained from the
     * unbounded view of this dataset.
     */
    @Override
    public DatasetReader<E> newReader() {
        logger.debug("Getting reader for dataset:{}", this);
        final DatasetReader<E> reader = unbounded.newReader();
        return reader;
    }

    /**
     * Delegates to the unbounded view's {@code deleteAll()} and returns its result.
     */
    @Override
    public boolean deleteAll() {
        final boolean result = unbounded.deleteAll();
        return result;
    }

    /**
     * Returns the covering partitions reported by the unbounded view.
     *
     * @throws IllegalStateException if the descriptor is not partitioned
     */
    @Override
    public Iterable<View<E>> getCoveringPartitions() {
        final boolean partitioned = descriptor.isPartitioned();
        Preconditions.checkState(partitioned,
                "Attempt to get partitions on a non-partitioned dataset (name:%s)", name);

        final Iterable<View<E>> covering = unbounded.getCoveringPartitions();
        return covering;
    }

    /**
     * Returns the path iterator supplied by the unbounded view.
     */
    PathIterator pathIterator() {
        return this.unbounded.pathIterator();
    }

    /**
     * Delegates to the unbounded view's {@code from(start)} and returns the
     * result as a {@link FileSystemView}.
     */
    @Override
    public FileSystemView<E> from(Marker start) {
        final FileSystemView<E> narrowed = (FileSystemView<E>) unbounded.from(start);
        return narrowed;
    }

    /**
     * Delegates to the unbounded view's {@code fromAfter(start)} and returns
     * the result as a {@link FileSystemView}.
     */
    @Override
    public FileSystemView<E> fromAfter(Marker start) {
        final FileSystemView<E> narrowed = (FileSystemView<E>) unbounded.fromAfter(start);
        return narrowed;
    }

    /**
     * Delegates to the unbounded view's {@code to(end)} and returns the
     * result as a {@link FileSystemView}.
     */
    @Override
    public FileSystemView<E> to(Marker end) {
        final FileSystemView<E> narrowed = (FileSystemView<E>) unbounded.to(end);
        return narrowed;
    }

    /**
     * Delegates to the unbounded view's {@code toBefore(end)} and returns the
     * result as a {@link FileSystemView}.
     */
    @Override
    public FileSystemView<E> toBefore(Marker end) {
        final FileSystemView<E> narrowed = (FileSystemView<E>) unbounded.toBefore(end);
        return narrowed;
    }

    /**
     * Delegates to the unbounded view's {@code of(partial)} and returns the
     * result as a {@link FileSystemView}.
     */
    @Override
    public FileSystemView<E> of(Marker partial) {
        final FileSystemView<E> narrowed = (FileSystemView<E>) unbounded.of(partial);
        return narrowed;
    }

    /**
     * Loads the partition dataset identified by {@code key}, optionally
     * creating its directory.
     *
     * @param key         partition key whose values are mapped to a directory
     *                    beneath this dataset's directory; must not be null
     * @param allowCreate when true, a missing partition directory is created;
     *                    when false, a missing directory yields {@code null}
     * @return a dataset rooted at the partition directory, or {@code null} if
     *         the directory does not exist and {@code allowCreate} is false
     * @throws IllegalStateException    if the descriptor is not partitioned
     * @throws IllegalArgumentException if {@code key} is null
     * @throws DatasetException         if the directory cannot be checked or
     *                                  created
     */
    @Override
    @Nullable
    @Deprecated
    public Dataset<E> getPartition(PartitionKey key, boolean allowCreate) {
        Preconditions.checkState(descriptor.isPartitioned(),
                "Attempt to get a partition on a non-partitioned dataset (name:%s)", name);
        // Same argument check as dropPartition; otherwise a null key fails
        // later with a bare NullPointerException while building the path.
        Preconditions.checkArgument(key != null, "Partition key may not be null");

        logger.debug("Loading partition for key {}, allowCreate:{}", key, allowCreate);

        Path partitionDirectory = fileSystem.makeQualified(toDirectoryName(directory, key));

        try {
            if (!fileSystem.exists(partitionDirectory)) {
                if (!allowCreate) {
                    return null;
                }
                // mkdirs signals failure through its return value; do not hand
                // back a dataset whose directory was never created.
                if (!fileSystem.mkdirs(partitionDirectory)) {
                    throw new DatasetException(
                            "Unable to create dataset partition directory " + partitionDirectory);
                }
            }
        } catch (IOException e) {
            throw new DatasetException(
                    "Unable to locate or create dataset partition directory " + partitionDirectory, e);
        }

        // The sub-dataset keeps only the partitioners below the depth of the key.
        int partitionDepth = key.getLength();
        PartitionStrategy subpartitionStrategy = Accessor.getDefault().getSubpartitionStrategy(partitionStrategy,
                partitionDepth);

        return new FileSystemDataset.Builder()
                .name(name).fileSystem(fileSystem).descriptor(new DatasetDescriptor.Builder(descriptor)
                        .location(partitionDirectory).partitionStrategy(subpartitionStrategy).build())
                .partitionKey(key).build();
    }

    /**
     * Recursively deletes the directory that corresponds to {@code key}.
     *
     * @param key partition key to drop; must not be null
     * @throws IllegalStateException    if the descriptor is not partitioned
     * @throws IllegalArgumentException if {@code key} is null
     * @throws DatasetException         if the delete reports that nothing was
     *                                  removed, or fails with an I/O error
     */
    @Override
    @Deprecated
    public void dropPartition(PartitionKey key) {
        Preconditions.checkState(descriptor.isPartitioned(),
                "Attempt to drop a partition on a non-partitioned dataset (name:%s)", name);
        Preconditions.checkArgument(key != null, "Partition key may not be null");

        logger.debug("Dropping partition with key:{} dataset:{}", key, name);

        final Path target = toDirectoryName(directory, key);

        final boolean removed;
        try {
            removed = fileSystem.delete(target, true);
        } catch (IOException e) {
            throw new DatasetException("Unable to locate or drop dataset partition directory " + target,
                    e);
        }

        if (!removed) {
            throw new DatasetException(
                    "Partition directory " + target + " for key " + key + " does not exist");
        }
    }

    /**
     * Lists the non-hidden entries directly beneath this dataset's directory
     * and wraps each one in a partition dataset.
     *
     * @return one dataset per listed entry, in listing order
     * @throws IllegalStateException if the descriptor is not partitioned
     * @throws DatasetException      if the directory cannot be listed
     */
    @Override
    @Deprecated
    public Iterable<Dataset<E>> getPartitions() {
        Preconditions.checkState(descriptor.isPartitioned(),
                "Attempt to get partitions on a non-partitioned dataset (name:%s)", name);

        final FileStatus[] entries;
        try {
            entries = fileSystem.listStatus(directory, PathFilters.notHidden());
        } catch (IOException e) {
            throw new DatasetException("Unable to list partition directory for directory " + directory, e);
        }

        final List<Dataset<E>> result = Lists.newArrayList();

        for (FileStatus entry : entries) {
            final Path location = fileSystem.makeQualified(entry.getPath());
            final PartitionKey entryKey = fromDirectoryName(location);

            // Each partition sits one level down, so it keeps the strategy from index 1 onwards.
            final PartitionStrategy remaining = Accessor.getDefault()
                    .getSubpartitionStrategy(partitionStrategy, 1);

            final DatasetDescriptor entryDescriptor = new DatasetDescriptor.Builder(descriptor)
                    .location(location)
                    .partitionStrategy(remaining)
                    .build();

            final Builder partitionBuilder = new FileSystemDataset.Builder()
                    .name(name)
                    .fileSystem(fileSystem)
                    .descriptor(entryDescriptor)
                    .partitionKey(entryKey);

            result.add(partitionBuilder.<E>build());
        }

        return result;
    }

    /**
     * Renders the name, descriptor, directory and partition key of this
     * dataset. The directory appears twice, under "directory" and
     * "dataDirectory".
     */
    @Override
    public String toString() {
        final Objects.ToStringHelper helper = Objects.toStringHelper(this);
        helper.add("name", name);
        helper.add("descriptor", descriptor);
        helper.add("directory", directory);
        helper.add("dataDirectory", directory);
        helper.add("partitionKey", partitionKey);
        return helper.toString();
    }

    /**
     * Walks {@code directory} recursively and appends the path of every
     * non-hidden, non-directory entry to {@code paths}.
     *
     * @param directory directory to walk
     * @param paths     list that receives the file paths
     * @throws IOException if a directory cannot be listed
     */
    @Deprecated
    void accumulateDatafilePaths(Path directory, List<Path> paths) throws IOException {
        final FileStatus[] entries = fileSystem.listStatus(directory, PathFilters.notHidden());

        for (FileStatus entry : entries) {
            if (!entry.isDirectory()) {
                paths.add(entry.getPath());
                continue;
            }
            // Directories are descended into rather than recorded.
            accumulateDatafilePaths(entry.getPath(), paths);
        }
    }

    /**
     * Builds the directory for {@code key} by appending one path segment per
     * key value to {@code dir}, using the field partitioner at the same index.
     *
     * @param dir base directory
     * @param key partition key whose values are converted to directory names
     * @return {@code dir} extended by one segment per key value
     */
    @SuppressWarnings("unchecked")
    private Path toDirectoryName(Path dir, PartitionKey key) {
        Path current = dir;
        int index = 0;
        while (index < key.getLength()) {
            final FieldPartitioner partitioner = partitionStrategy.getFieldPartitioners().get(index);
            current = new Path(current, convert.dirnameForValue(partitioner, key.get(index)));
            index++;
        }
        return current;
    }

    /**
     * Builds the partition key for a directory directly beneath this dataset.
     * The key holds the values of this dataset's own partition key, if any,
     * followed by the value decoded from the last segment of {@code dir}
     * with the first field partitioner.
     *
     * @param dir partition directory
     * @return the partition key for {@code dir}
     */
    @SuppressWarnings("unchecked")
    private PartitionKey fromDirectoryName(Path dir) {
        final List<Object> components = Lists.newArrayList();
        if (partitionKey != null) {
            components.addAll(partitionKey.getValues());
        }

        final FieldPartitioner leading = partitionStrategy.getFieldPartitioners().get(0);
        components.add(convert.valueForDirname(leading, dir.getName()));

        final Object[] asArray = components.toArray();
        return Accessor.getDefault().newPartitionKey(asArray);
    }

    /**
     * Fluent builder for {@link FileSystemDataset} instances.
     *
     * <p>A name and a descriptor with a location are required, together with
     * either a {@link FileSystem} or a {@link Configuration} from which the
     * file system can be resolved. {@link #build()} does not modify the
     * builder, so one builder can be reused with different settings.
     */
    public static class Builder {

        private Configuration conf;
        private FileSystem fileSystem;
        private String name;
        private DatasetDescriptor descriptor;
        private PartitionKey partitionKey;

        /**
         * Sets the dataset name.
         */
        public Builder name(String name) {
            this.name = name;
            return this;
        }

        /**
         * Sets the file system to use; when set, it takes precedence over
         * resolving one from the configuration.
         */
        protected Builder fileSystem(FileSystem fs) {
            this.fileSystem = fs;
            return this;
        }

        /**
         * Sets the configuration used to resolve the file system when none
         * was set explicitly.
         */
        public Builder configuration(Configuration conf) {
            this.conf = conf;
            return this;
        }

        /**
         * Sets the dataset descriptor.
         *
         * @throws IllegalArgumentException if the descriptor or its location
         *                                  is null
         */
        public Builder descriptor(DatasetDescriptor descriptor) {
            // Reject null up front rather than failing with a bare NullPointerException below.
            Preconditions.checkArgument(descriptor != null, "Dataset descriptor cannot be null");
            Preconditions.checkArgument(descriptor.getLocation() != null, "Dataset location cannot be null");

            this.descriptor = descriptor;

            return this;
        }

        /**
         * Sets the partition key passed to the dataset; may be null.
         */
        Builder partitionKey(@Nullable PartitionKey partitionKey) {
            this.partitionKey = partitionKey;
            return this;
        }

        /**
         * Builds the dataset from the current settings.
         *
         * @return a dataset rooted at the descriptor's location, qualified
         *         against the file system in use
         * @throws IllegalStateException if the name or descriptor is missing,
         *                               or neither a configuration nor a file
         *                               system was set
         * @throws DatasetException      if the file system cannot be resolved
         *                               from the configuration
         */
        public <E> FileSystemDataset<E> build() {
            Preconditions.checkState(this.name != null, "No dataset name defined");
            Preconditions.checkState(this.descriptor != null, "No dataset descriptor defined");
            Preconditions.checkState((conf != null) || (fileSystem != null),
                    "Configuration or FileSystem must be set");

            final Path location = new Path(descriptor.getLocation());

            // Resolve into a local: writing the result back to the field would
            // make a reused builder keep a file system that belongs to an
            // earlier descriptor's location.
            FileSystem resolved = fileSystem;
            if (resolved == null) {
                try {
                    resolved = location.getFileSystem(conf);
                } catch (IOException ex) {
                    throw new DatasetException("Cannot access FileSystem", ex);
                }
            }

            Path absoluteDirectory = resolved.makeQualified(location);
            return new FileSystemDataset<E>(resolved, absoluteDirectory, name, descriptor, partitionKey);
        }
    }

}