gobblin.writer.FsDataWriter.java Source code

Introduction

Here is the source code for gobblin.writer.FsDataWriter.java, Apache Gobblin's abstract base writer that manages staging and output files on a Hadoop FileSystem.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.writer;

import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closer;

import gobblin.codec.StreamCodec;
import gobblin.commit.SpeculativeAttemptAwareConstruct;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.metadata.types.GlobalMetadata;
import gobblin.util.FinalState;
import gobblin.util.ForkOperatorUtils;
import gobblin.util.HadoopUtils;
import gobblin.util.JobConfigurationUtils;
import gobblin.util.WriterUtils;
import gobblin.util.recordcount.IngestionRecordCountProvider;

/**
 * An implementation of {@link DataWriter} that does the work of setting up the
 * staging/output directories and creating the {@link FileSystem} instance.
 *
 * @author akshay@nerdwallet.com
 */
public abstract class FsDataWriter<D>
        implements DataWriter<D>, FinalState, MetadataAwareWriter, SpeculativeAttemptAwareConstruct {

    private static final Logger LOG = LoggerFactory.getLogger(FsDataWriter.class);

    public static final String WRITER_INCLUDE_RECORD_COUNT_IN_FILE_NAMES = ConfigurationKeys.WRITER_PREFIX
            + ".include.record.count.in.file.names";

    protected final State properties;
    protected final String id;
    protected final int numBranches;
    protected final int branchId;
    protected final String fileName;
    protected final FileSystem fs;
    protected final Path stagingFile;
    private final GlobalMetadata defaultMetadata;
    protected Path outputFile;
    protected final String allOutputFilesPropName;
    protected final boolean shouldIncludeRecordCountInFileName;
    protected final int bufferSize;
    protected final short replicationFactor;
    protected final long blockSize;
    protected final FsPermission filePermission;
    protected final FsPermission dirPermission;
    protected final Optional<String> group;
    protected final Closer closer = Closer.create();
    protected final Optional<String> writerAttemptIdOptional;
    protected Optional<Long> bytesWritten;
    private final List<StreamCodec> encoders;

    public FsDataWriter(FsDataWriterBuilder<?, ?> builder, State properties) throws IOException {
        this.properties = properties;
        this.id = builder.getWriterId();
        this.numBranches = builder.getBranches();
        this.branchId = builder.getBranch();
        this.fileName = builder.getFileName(properties);
        this.writerAttemptIdOptional = Optional.fromNullable(builder.getWriterAttemptId());
        this.encoders = builder.getEncoders();

        Configuration conf = new Configuration();
        // Add all job configuration properties so they are picked up by Hadoop
        JobConfigurationUtils.putStateIntoConfiguration(properties, conf);
        this.fs = WriterUtils.getWriterFS(properties, this.numBranches, this.branchId);

        // Initialize staging/output directory
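        // When a writer attempt id is present (e.g. with speculative execution),
        // each attempt gets its own staging directory so concurrent attempts do
        // not overwrite each other's staging files.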
        Path writerStagingDir = this.writerAttemptIdOptional.isPresent()
                ? WriterUtils.getWriterStagingDir(properties, this.numBranches, this.branchId,
                        this.writerAttemptIdOptional.get())
                : WriterUtils.getWriterStagingDir(properties, this.numBranches, this.branchId);
        this.stagingFile = new Path(writerStagingDir, this.fileName);

        this.outputFile = new Path(WriterUtils.getWriterOutputDir(properties, this.numBranches, this.branchId),
                this.fileName);
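        // ForkOperatorUtils scopes a property name to the current fork branch
        // (appending the branch index when there is more than one branch), so
        // each branch can be configured independently.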
        this.allOutputFilesPropName = ForkOperatorUtils.getPropertyNameForBranch(
                ConfigurationKeys.WRITER_FINAL_OUTPUT_FILE_PATHS, this.numBranches, this.branchId);

        // Delete the staging file if it already exists, which can happen if a
        // previous task attempt failed and its staging file was not cleaned up.
        // Deleting it up front prevents the task retry from being blocked.
        if (this.fs.exists(this.stagingFile)) {
            LOG.warn(String.format("Task staging file %s already exists, deleting it", this.stagingFile));
            HadoopUtils.deletePath(this.fs, this.stagingFile, false);
        }

        this.shouldIncludeRecordCountInFileName = properties.getPropAsBoolean(
                ForkOperatorUtils.getPropertyNameForBranch(WRITER_INCLUDE_RECORD_COUNT_IN_FILE_NAMES,
                        this.numBranches, this.branchId),
                false);

        this.bufferSize = properties.getPropAsInt(ForkOperatorUtils
                .getPropertyNameForBranch(ConfigurationKeys.WRITER_BUFFER_SIZE, this.numBranches, this.branchId),
                ConfigurationKeys.DEFAULT_BUFFER_SIZE);

        this.replicationFactor = properties.getPropAsShort(
                ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_REPLICATION_FACTOR,
                        this.numBranches, this.branchId),
                this.fs.getDefaultReplication(this.outputFile));

        this.blockSize = properties
                .getPropAsLong(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_BLOCK_SIZE,
                        this.numBranches, this.branchId), this.fs.getDefaultBlockSize(this.outputFile));

        this.filePermission = HadoopUtils.deserializeWriterFilePermissions(properties, this.numBranches,
                this.branchId);

        this.dirPermission = HadoopUtils.deserializeWriterDirPermissions(properties, this.numBranches,
                this.branchId);

        this.group = Optional.fromNullable(properties.getProp(ForkOperatorUtils
                .getPropertyNameForBranch(ConfigurationKeys.WRITER_GROUP_NAME, this.numBranches, this.branchId)));

        // Create the parent directory of the output file if it does not exist
        WriterUtils.mkdirsWithRecursivePermission(this.fs, this.outputFile.getParent(), this.dirPermission);
        this.bytesWritten = Optional.absent();

        this.defaultMetadata = new GlobalMetadata();
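        // Record each encoder's tag so metadata consumers know which transfer
        // encodings were applied to the output stream.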
        for (StreamCodec c : getEncoders()) {
            this.defaultMetadata.addTransferEncoding(c.getTag());
        }

        String partitionPath = builder.getPartitionPath(properties);
        if (partitionPath != null) {
            properties.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + builder.getWriterId(), partitionPath);
        }
    }

    /**
     * Create the staging output file and an {@link OutputStream} to write to the file.
     *
     * @return an {@link OutputStream} to write to the staging file
     * @throws IOException if it fails to create the file and the {@link OutputStream}
     */
    protected OutputStream createStagingFileOutputStream() throws IOException {
        OutputStream out = this.fs.create(this.stagingFile, this.filePermission, true, this.bufferSize,
                this.replicationFactor, this.blockSize, null);

        // Attach the encoders in reverse order so that data written to the
        // returned stream passes through them in their original list order
        // (the first encoder in the list is applied first)
        for (StreamCodec encoder : Lists.reverse(getEncoders())) {
            out = encoder.encodeOutputStream(out);
        }

        return this.closer.register(out);
    }

    /**
     * Set the group name of the staging output file.
     *
     * @throws IOException if it fails to set the group name
     */
    protected void setStagingFileGroup() throws IOException {
        Preconditions.checkArgument(this.fs.exists(this.stagingFile),
                String.format("Staging output file %s does not exist", this.stagingFile));
        if (this.group.isPresent()) {
            HadoopUtils.setGroup(this.fs, this.stagingFile, this.group.get());
        }
    }

    protected List<StreamCodec> getEncoders() {
        return encoders;
    }

    public GlobalMetadata getDefaultMetadata() {
        return defaultMetadata;
    }

    @Override
    public long bytesWritten() throws IOException {
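        // Only populated during commit(), from the staging file's length;
        // before commit this reports 0.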
        if (this.bytesWritten.isPresent()) {
            return this.bytesWritten.get().longValue();
        }
        return 0L;
    }

    /**
     * {@inheritDoc}.
     *
     * <p>
     *   This default implementation simply renames the staging file to the output file. If the output file
     *   already exists, it will delete it first before doing the renaming.
     * </p>
     *
     * @throws IOException if any file operation fails
     */
    @Override
    public void commit() throws IOException {
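        // Close all streams registered with the closer so buffered data is
        // flushed to the staging file before it is renamed.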
        this.closer.close();

        setStagingFileGroup();

        if (!this.fs.exists(this.stagingFile)) {
            throw new IOException(String.format("File %s does not exist", this.stagingFile));
        }

        FileStatus stagingFileStatus = this.fs.getFileStatus(this.stagingFile);

        // Double check permission of staging file
        if (!stagingFileStatus.getPermission().equals(this.filePermission)) {
            this.fs.setPermission(this.stagingFile, this.filePermission);
        }

        this.bytesWritten = Optional.of(Long.valueOf(stagingFileStatus.getLen()));

        LOG.info(String.format("Moving data from %s to %s", this.stagingFile, this.outputFile));
        // For the same reason as deleting the staging file if it already exists, deleting
        // the output file if it already exists prevents task retry from being blocked.
        if (this.fs.exists(this.outputFile)) {
            LOG.warn(String.format("Task output file %s already exists", this.outputFile));
            HadoopUtils.deletePath(this.fs, this.outputFile, false);
        }

        HadoopUtils.renamePath(this.fs, this.stagingFile, this.outputFile);
    }

    /**
     * {@inheritDoc}.
     *
     * <p>
     *   This default implementation simply deletes the staging file if it exists.
     * </p>
     *
     * @throws IOException if deletion of the staging file fails
     */
    @Override
    public void cleanup() throws IOException {
        // Delete the staging file
        if (this.fs.exists(this.stagingFile)) {
            HadoopUtils.deletePath(this.fs, this.stagingFile, false);
        }
    }

    @Override
    public void close() throws IOException {
        this.closer.close();

        if (this.shouldIncludeRecordCountInFileName) {
            String filePathWithRecordCount = addRecordCountToFileName();
            this.properties.appendToSetProp(this.allOutputFilesPropName, filePathWithRecordCount);
        } else {
            this.properties.appendToSetProp(this.allOutputFilesPropName, getOutputFilePath());
        }
    }

    private synchronized String addRecordCountToFileName() throws IOException {
        String filePath = getOutputFilePath();
        String filePathWithRecordCount = IngestionRecordCountProvider.constructFilePath(filePath, recordsWritten());
        LOG.info("Renaming " + filePath + " to " + filePathWithRecordCount);
        HadoopUtils.renamePath(this.fs, new Path(filePath), new Path(filePathWithRecordCount));
        this.outputFile = new Path(filePathWithRecordCount);
        return filePathWithRecordCount;
    }

    @Override
    public State getFinalState() {
        State state = new State();

        state.setProp("RecordsWritten", recordsWritten());
        try {
            state.setProp("BytesWritten", bytesWritten());
        } catch (Exception exception) {
            // If the writer fails to report bytesWritten, the method may be
            // unimplemented or implemented incorrectly; omit the property
            // instead of failing.
        }

        return state;
    }

    /**
     * Get the output file path.
     *
     * @return the output file path
     */
    public String getOutputFilePath() {
        return this.outputFile.toString();
    }

    /**
     * Get the fully-qualified output file path.
     *
     * @return the fully-qualified output file path
     */
    public String getFullyQualifiedOutputFilePath() {
        return this.fs.makeQualified(this.outputFile).toString();
    }

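    // Safe only when each attempt writes to its own attempt-specific staging
    // directory; the getClass() check deliberately forces subclasses to opt in
    // by overriding this method.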
    @Override
    public boolean isSpeculativeAttemptSafe() {
        return this.writerAttemptIdOptional.isPresent() && this.getClass() == FsDataWriter.class;
    }
}
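
Example

The sketch below (not part of Gobblin) illustrates how a concrete subclass typically uses this base class: FsDataWriter handles the staging/output paths, permissions, commit, cleanup, and close, so the subclass only opens the staging stream and implements write() and recordsWritten(). The SimpleLineWriter class is hypothetical, and the exact set of abstract methods left to implement may vary across Gobblin versions.

package gobblin.writer;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import gobblin.configuration.State;

/**
 * Hypothetical example subclass that writes each record as a UTF-8 line to
 * the staging file managed by {@link FsDataWriter}.
 */
public class SimpleLineWriter extends FsDataWriter<String> {

    private final OutputStream stream;
    private long recordCount = 0L;

    public SimpleLineWriter(FsDataWriterBuilder<?, String> builder, State properties) throws IOException {
        super(builder, properties);
        // Opens the staging file with the configured permissions, buffer size,
        // replication factor, and block size, wrapped in any configured encoders.
        this.stream = createStagingFileOutputStream();
    }

    @Override
    public void write(String record) throws IOException {
        this.stream.write((record + "\n").getBytes(StandardCharsets.UTF_8));
        this.recordCount++;
    }

    @Override
    public long recordsWritten() {
        return this.recordCount;
    }
}

Because the stream is registered with the inherited Closer, close() flushes and closes it automatically, and commit() then renames the staging file to the final output path; the subclass never touches the output file directly.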