com.cloudera.cdk.data.filesystem.FileSystemDatasetWriter.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.cdk.data.filesystem.FileSystemDatasetWriter.java

Source

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.data.filesystem;

import com.cloudera.cdk.data.spi.ReaderWriterState;
import com.cloudera.cdk.data.DatasetWriter;
import com.cloudera.cdk.data.DatasetWriterException;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.io.Closeables;
import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.reflect.ReflectDatumWriter;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * A {@link DatasetWriter} that appends entities to a single Avro data file on
 * a Hadoop {@link FileSystem}.
 *
 * <p>
 * Data is first written to a temporary, dot-prefixed sibling of the target
 * path (<code>.&lt;name&gt;.tmp</code> in the same parent directory). When the
 * writer is closed, the temporary file is renamed to the final path.
 * </p>
 *
 * <p>
 * Lifecycle: a writer starts in {@link ReaderWriterState#NEW}, moves to
 * {@link ReaderWriterState#OPEN} after a successful {@link #open()}, and to
 * {@link ReaderWriterState#CLOSED} after a successful {@link #close()}.
 * </p>
 *
 * @param <E> the type of entity written by this writer
 */
class FileSystemDatasetWriter<E> implements DatasetWriter<E> {

    private static final Logger logger = LoggerFactory.getLogger(FileSystemDatasetWriter.class);

    /* Configuration, fixed at construction time. */
    private final Path path;
    private final Schema schema;
    private final FileSystem fileSystem;
    private final boolean enableCompression;

    /* Temporary location that receives the data until close() commits it. */
    private final Path pathTmp;

    /* Resources created by open(). */
    private FSDataOutputStream out;
    private DataFileWriter<E> dataFileWriter;
    private DatumWriter<E> writer;
    private ReaderWriterState state;

    /**
     * Creates a writer in the {@link ReaderWriterState#NEW} state. No file
     * system access happens until {@link #open()} is called.
     *
     * @param fileSystem        the file system on which the file is created
     * @param path              the final path of the data file
     * @param schema            the Avro schema passed to the data file writer
     * @param enableCompression whether to use the snappy codec
     */
    public FileSystemDatasetWriter(FileSystem fileSystem, Path path, Schema schema, boolean enableCompression) {

        this.fileSystem = fileSystem;
        this.path = path;
        this.pathTmp = new Path(path.getParent(), "." + path.getName() + ".tmp");
        this.schema = schema;
        this.enableCompression = enableCompression;
        this.state = ReaderWriterState.NEW;
    }

    /**
     * Creates the temporary file (overwriting any existing one) and prepares
     * the Avro data file writer on top of it.
     *
     * @throws IllegalStateException  if the writer is not in the NEW state
     * @throws DatasetWriterException if the temporary file or the Avro writer
     *                                cannot be created
     */
    @Override
    public void open() {
        Preconditions.checkState(state.equals(ReaderWriterState.NEW), "Unable to open a writer from state:%s",
                state);

        logger.debug("Opening data file with pathTmp:{} (final path will be path:{})", pathTmp, path);

        writer = new ReflectDatumWriter<E>();
        dataFileWriter = new DataFileWriter<E>(writer);

        /*
         * We may want to expose the codec in the writer and simply rely on the
         * builder and proper instantiation from dataset-level configuration.
         * Hard-coding snappy seems a little too draconian.
         */
        if (enableCompression) {
            dataFileWriter.setCodec(CodecFactory.snappyCodec());
        }

        FSDataOutputStream stream = null;
        try {
            stream = fileSystem.create(pathTmp, true);
            dataFileWriter.create(schema, stream);
        } catch (IOException e) {
            // Do not leak the stream when the Avro writer fails to initialize
            // after the file has already been created.
            closeAfterFailure(stream);
            throw new DatasetWriterException("Unable to create writer to path:" + pathTmp, e);
        }

        // Only publish the stream once everything has been set up successfully.
        out = stream;
        state = ReaderWriterState.OPEN;
    }

    /**
     * Appends a single entity to the data file.
     *
     * @param entity the entity to append
     * @throws IllegalStateException  if the writer is not in the OPEN state
     * @throws DatasetWriterException if the underlying append fails
     */
    @Override
    public void write(E entity) {
        Preconditions.checkState(state.equals(ReaderWriterState.OPEN), "Attempt to write to a writer in state:%s",
                state);

        try {
            dataFileWriter.append(entity);
        } catch (IOException e) {
            throw new DatasetWriterException("Unable to write entity:" + entity + " with writer:" + dataFileWriter,
                    e);
        }
    }

    /**
     * Flushes the Avro writer and then calls {@code hflush()} on the
     * underlying output stream.
     *
     * @throws IllegalStateException  if the writer is not in the OPEN state
     * @throws DatasetWriterException if either flush fails
     */
    @Override
    public void flush() {
        Preconditions.checkState(state.equals(ReaderWriterState.OPEN), "Attempt to flush a writer in state:%s",
                state);

        try {
            dataFileWriter.flush();
            out.hflush();
        } catch (IOException e) {
            // Keep the cause so the underlying I/O failure is not lost.
            throw new DatasetWriterException("Unable to flush file writer:" + dataFileWriter, e);
        }
    }

    /**
     * Closes the Avro writer and commits the temporary file by renaming it to
     * the final path. Does nothing unless the writer is in the OPEN state.
     *
     * <p>
     * The state only becomes CLOSED when both the close and the rename
     * succeed; if either fails the writer stays in the OPEN state.
     * </p>
     *
     * @throws DatasetWriterException if the writer cannot be closed or the
     *                                temporary file cannot be renamed
     */
    @Override
    public void close() {
        if (state.equals(ReaderWriterState.OPEN)) {
            logger.debug("Closing pathTmp:{}", pathTmp);

            try {
                Closeables.close(dataFileWriter, false);
            } catch (IOException e) {
                // Keep the cause so the underlying I/O failure is not lost.
                throw new DatasetWriterException(
                        "Unable to close writer:" + dataFileWriter + " to path:" + pathTmp, e);
            }

            logger.debug("Committing pathTmp:{} to path:{}", pathTmp, path);

            try {
                if (!fileSystem.rename(pathTmp, path)) {
                    throw new DatasetWriterException("Failed to move " + pathTmp + " to " + path);
                }
            } catch (IOException e) {
                throw new DatasetWriterException("Internal error while trying to commit path:" + pathTmp, e);
            }

            state = ReaderWriterState.CLOSED;
        }
    }

    /**
     * @return {@code true} if this writer is in the OPEN state
     */
    @Override
    public boolean isOpen() {
        return state.equals(ReaderWriterState.OPEN);
    }

    @Override
    public String toString() {
        return Objects.toStringHelper(this).add("path", path).add("schema", schema).add("fileSystem", fileSystem)
                .add("enableCompression", enableCompression).add("pathTmp", pathTmp)
                .add("dataFileWriter", dataFileWriter).add("writer", writer).add("state", state).toString();
    }

    /**
     * Best-effort close of a stream after a failed {@link #open()}. A failure
     * to close is logged rather than thrown so that it does not mask the
     * exception that caused the open to fail.
     */
    private void closeAfterFailure(FSDataOutputStream stream) {
        if (stream == null) {
            return;
        }

        try {
            stream.close();
        } catch (IOException closeFailure) {
            logger.warn("Unable to close output stream for pathTmp:" + pathTmp, closeFailure);
        }
    }

    /**
     * Fluent builder for {@link FileSystemDatasetWriter}. Compression is
     * enabled by default; file system, path and schema are required.
     */
    public static class Builder {

        private FileSystem fileSystem;
        private Path path;
        private Schema schema;
        private boolean enableCompression;

        public Builder() {
            enableCompression = true;
        }

        public Builder fileSystem(FileSystem fileSystem) {
            this.fileSystem = fileSystem;
            return this;
        }

        public Builder path(Path path) {
            this.path = path;
            return this;
        }

        public Builder schema(Schema schema) {
            this.schema = schema;
            return this;
        }

        public Builder enableCompression(boolean enableCompression) {
            this.enableCompression = enableCompression;
            return this;
        }

        /**
         * Builds a new, unopened writer from the configured values.
         *
         * @param <E> the type of entity the writer will accept
         * @return a writer in the NEW state
         * @throws IllegalStateException if the file system, path or schema
         *                               has not been set
         */
        public <E> FileSystemDatasetWriter<E> build() {
            Preconditions.checkState(fileSystem != null, "File system is not defined");
            Preconditions.checkState(path != null, "Path is not defined");
            Preconditions.checkState(schema != null, "Schema is not defined");

            return new FileSystemDatasetWriter<E>(fileSystem, path, schema, enableCompression);
        }

    }

}