org.apache.flink.streaming.connectors.fs.RollingSink.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.flink.streaming.connectors.fs.RollingSink.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.streaming.connectors.fs;

import org.apache.commons.lang3.time.StopWatch;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.typeutils.InputTypeConfigurable;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.CheckpointListener;
import org.apache.flink.streaming.api.checkpoint.Checkpointed;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;

/**
 * Sink that emits its input elements to rolling {@link org.apache.hadoop.fs.FileSystem} files. This
 * is integrated with the checkpointing mechanism to provide exactly once semantics.
 *
 * <p>
 * When creating the sink a {@code basePath} must be specified. The base directory contains
 * one directory for every bucket. The bucket directories themselves contain several part files.
 * These contain the actual written data.
 *
 * <p>
 * The sink uses a {@link Bucketer} to determine the name of bucket directories inside the
 * base directory. Whenever the {@code Bucketer} returns a different directory name than
 * it returned before the sink will close the current part files inside that bucket
 * and start the new bucket directory. The default bucketer is a {@link DateTimeBucketer} with
 * date format string {@code "yyyy-MM-dd--HH"}. You can specify a custom {@code Bucketer}
 * using {@link #setBucketer(Bucketer)}. For example, use
 * {@link org.apache.flink.streaming.connectors.fs.NonRollingBucketer} if you don't want to have
 * buckets but still write part files in a fault-tolerant way.
 *
 * <p>
 * The filenames of the part files contain the part prefix, the parallel subtask index of the sink
 * and a rolling counter, for example {@code "part-1-17"}. Per default the part prefix is
 * {@code "part"} but this can be
 * configured using {@link #setPartPrefix(String)}. When a part file becomes bigger
 * than the batch size the current part file is closed, the part counter is increased and
 * a new part file is created. The batch size defaults to {@code 384MB}, this can be configured
 * using {@link #setBatchSize(long)}.
 *
 * <p>
 * Part files can be in one of three states: in-progress, pending or finished. The reason for this
 * is how the sink works together with the checkpointing mechanism to provide exactly-once semantics
 * and fault-tolerance. The part file that is currently being written to is in-progress. Once
 * a part file is closed for writing it becomes pending. When a checkpoint is successful the
 * currently pending files will be moved to finished. If a failure occurs the pending files
 * will be deleted to reset state to the last checkpoint. The data in in-progress files will
 * also have to be rolled back. If the {@code FileSystem} supports the {@code truncate} call
 * this will be used to reset the file back to a previous state. If not, a special file
 * with the same name as the part file and the suffix {@code ".valid-length"} will be written
 * that contains the length up to which the file contains valid data. When reading the file
 * it must be ensured that it is only read up to that point. The prefixes and suffixes for
 * the different file states and valid-length files can be configured, for example with
 * {@link #setPendingSuffix(String)}.
 *
 * <p>
 * Note: If checkpointing is not enabled the pending files will never be moved to the finished state.
 * In that case, the pending suffix/prefix can be set to {@code ""} to make the sink work
 * in a non-fault-tolerant way but still provide output without prefixes and suffixes.
 *
 * <p>
 * The part files are written using an instance of {@link Writer}. By default
 * {@link org.apache.flink.streaming.connectors.fs.StringWriter} is used, which writes the result
 * of {@code toString()} for every element, separated by newlines. You can configure the writer
 * using {@link #setWriter(Writer)}. For example,
 * {@link org.apache.flink.streaming.connectors.fs.SequenceFileWriter} can be used to write
 * Hadoop {@code SequenceFiles}.
 *
 * <p>
 * Example:
 *
 * <pre>{@code
 *     new RollingSink<Tuple2<IntWritable, Text>>(outPath)
 *         .setWriter(new SequenceFileWriter<IntWritable, Text>())
 *         .setBucketer(new DateTimeBucketer("yyyy-MM-dd--HHmm")
 * }</pre>
 *
 * This will create a sink that writes to {@code SequenceFiles} and rolls every minute.
 *
 * @see org.apache.flink.streaming.connectors.fs.DateTimeBucketer
 * @see StringWriter
 * @see SequenceFileWriter
 *
 * @param <T> Type of the elements emitted by this sink
 */
public class RollingSink<T> extends RichSinkFunction<T>
        implements InputTypeConfigurable, Checkpointed<RollingSink.BucketState>, CheckpointListener {
    private static final long serialVersionUID = 1L;

    // final: the logger reference is never reassigned (standard SLF4J convention)
    private static final Logger LOG = LoggerFactory.getLogger(RollingSink.class);

    // --------------------------------------------------------------------------------------------
    //  User configuration values
    // --------------------------------------------------------------------------------------------
    // These are initialized with some defaults but are meant to be changeable by the user
    // NOTE(review): the DEFAULT_* constants are instance fields; do not convert them to
    // static without checking serialization compatibility of this Checkpointed class.

    /**
     * The default maximum size of part files.
     *
     * 6 times the default block size
     */
    private final long DEFAULT_BATCH_SIZE = 1024L * 1024L * 384L;

    /**
     * This is used for part files that we are writing to but which were not yet confirmed
     * by a checkpoint.
     */
    private final String DEFAULT_IN_PROGRESS_SUFFIX = ".in-progress";

    /**
     * See above, but for prefix
     */
    private final String DEFAULT_IN_PROGRESS_PREFIX = "_";

    /**
     * This is used for part files that we are not writing to but which are not yet confirmed by
     * checkpoint.
     */
    private final String DEFAULT_PENDING_SUFFIX = ".pending";

    /**
     * See above, but for prefix.
     */
    private final String DEFAULT_PENDING_PREFIX = "_";

    /**
     * When truncate() is not supported on the used FileSystem we instead write a
     * file along the part file with this ending that contains the length up to which
     * the part file is valid.
     */
    private final String DEFAULT_VALID_SUFFIX = ".valid-length";

    /**
     * See above, but for prefix.
     */
    private final String DEFAULT_VALID_PREFIX = "_";

    /**
     * The default prefix for part files.
     */
    private final String DEFAULT_PART_REFIX = "part";

    /**
     * The default timeout for asynchronous operations such as recoverLease and truncate. In
     * milliseconds.
     */
    private final long DEFAULT_ASYNC_TIMEOUT_MS = 60 * 1000;

    /**
     * The base {@code Path} that stores all rolling bucket directories.
     */
    private final String basePath;

    /**
     * The {@code Bucketer} that is used to determine the path of bucket directories.
     */
    private Bucketer bucketer;

    /**
     * We have a template and call duplicate() for each parallel writer in open() to get the actual
     * writer that is used for the part files.
     */
    private Writer<T> writerTemplate;

    /**
     * The actual writer that we use for writing the part files.
     */
    private Writer<T> writer;

    /**
     * Maximum size of part files. If files exceed this we close and create a new one in the same
     * bucket directory.
     */
    private long batchSize;

    /**
     * If this is true we remove any leftover in-progress/pending files when the sink is opened.
     *
     * <p>
     * This should only be set to false if using the sink without checkpoints, to not remove
     * the files already in the directory.
     */
    private boolean cleanupOnOpen = true;

    // These are the actually configured prefixes/suffixes
    private String inProgressSuffix = DEFAULT_IN_PROGRESS_SUFFIX;
    private String inProgressPrefix = DEFAULT_IN_PROGRESS_PREFIX;

    private String pendingSuffix = DEFAULT_PENDING_SUFFIX;
    private String pendingPrefix = DEFAULT_PENDING_PREFIX;

    private String validLengthSuffix = DEFAULT_VALID_SUFFIX;
    private String validLengthPrefix = DEFAULT_VALID_PREFIX;

    private String partPrefix = DEFAULT_PART_REFIX;

    /**
     * The timeout for asynchronous operations such as recoverLease and truncate. In
     * milliseconds.
     */
    private long asyncTimeout = DEFAULT_ASYNC_TIMEOUT_MS;

    // --------------------------------------------------------------------------------------------
    //  Internal fields (not configurable by user)
    // --------------------------------------------------------------------------------------------

    /**
     * The part file that we are currently writing to.
     */
    private transient Path currentPartPath;

    /**
     * The bucket directory that we are currently filling.
     */
    private transient Path currentBucketDirectory;

    /**
     * Our subtask index, retrieved from the {@code RuntimeContext} in {@link #open}.
     */
    private transient int subtaskIndex;

    /**
     * For counting the part files inside a bucket directory. Part files follow the pattern
     * {@code "{part-prefix}-{subtask}-{count}"}. When creating new part files we increase the counter.
     */
    private transient int partCounter;

    /**
     * Tracks if the writer is currently opened or closed.
     */
    private transient boolean isWriterOpen = false;

    /**
     * We use reflection to get the .truncate() method, this is only available starting with
     * Hadoop 2.7
     */
    private transient Method refTruncate;

    /**
     * The state object that is handled by flink from snapshot/restore. In there we store the
     * current part file path, the valid length of the in-progress files and pending part files.
     */
    private transient BucketState bucketState;
    /**
     * Creates a new {@code RollingSink} that writes files to the given base directory.
     *
     * <p>
     * This uses a {@link DateTimeBucketer} as bucketer and a {@link StringWriter} as writer.
     * The maximum bucket size is set to 384 MB.
     *
     * @param basePath The directory to which to write the bucket files.
     */
    public RollingSink(String basePath) {
        this.basePath = basePath;
        this.bucketer = new DateTimeBucketer();
        this.batchSize = DEFAULT_BATCH_SIZE;
        this.writerTemplate = new StringWriter<>();
    }

    /**
     * Forwards the input type information to the writer template, if the
     * configured writer is itself {@link InputTypeConfigurable}.
     */
    @Override
    @SuppressWarnings("unchecked")
    public void setInputType(TypeInformation<?> type, ExecutionConfig executionConfig) {
        if (writerTemplate instanceof InputTypeConfigurable) {
            InputTypeConfigurable configurableWriter = (InputTypeConfigurable) writerTemplate;
            configurableWriter.setInputType(type, executionConfig);
        }
    }

    /**
     * Initializes the sink: resolves the subtask index, duplicates the writer template,
     * determines truncate() support and, unless {@code cleanupOnOpen} is disabled, deletes
     * leftover pending/in-progress files of this subtask from a previous (failed) execution.
     *
     * @throws Exception if the FileSystem cannot be obtained or the writer cannot be duplicated
     */
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);

        subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
        partCounter = 0;

        this.writer = writerTemplate.duplicate();

        if (bucketState == null) {
            bucketState = new BucketState();
        }

        FileSystem fs = new Path(basePath).getFileSystem(new org.apache.hadoop.conf.Configuration());
        refTruncate = reflectTruncate(fs);

        // delete pending/in-progress files that might be left if we fail while
        // no checkpoint has yet been done
        try {
            if (fs.exists(new Path(basePath)) && cleanupOnOpen) {
                RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);

                // part files of this subtask are recognized by this marker in their name
                String subtaskMarker = partPrefix + "-" + subtaskIndex + "-";

                while (bucketFiles.hasNext()) {
                    LocatedFileStatus file = bucketFiles.next();
                    String filePath = file.getPath().toString();
                    // only delete files that belong to our own subtask index
                    if (filePath.endsWith(pendingSuffix) && filePath.contains(subtaskMarker)) {
                        LOG.debug("(OPEN) Deleting leftover pending file {}", filePath);
                        fs.delete(file.getPath(), true);
                    }
                    if (filePath.endsWith(inProgressSuffix) && filePath.contains(subtaskMarker)) {
                        LOG.debug("(OPEN) Deleting leftover in-progress file {}", filePath);
                        fs.delete(file.getPath(), true);
                    }
                }
            }
        } catch (IOException e) {
            // pass the exception as the throwable argument (no placeholder) so the
            // full stack trace is logged, not just the exception's toString()
            LOG.error("Error while deleting leftover pending/in-progress files.", e);
            throw new RuntimeException("Error while deleting leftover pending/in-progress files.", e);
        }
    }

    /**
     * Closes the sink by moving the current in-progress part file (if any) to pending state.
     *
     * @throws Exception if closing or renaming the current part file fails
     */
    @Override
    public void close() throws Exception {
        closeCurrentPartFile();
    }

    /**
     * Emits one input element to the current part file.
     *
     * <p>
     * Before writing, rolls over to a new part file when {@link #shouldRoll()} says so
     * (no file open yet, bucketer requested a new bucket, or batch size exceeded).
     */
    @Override
    public void invoke(T value) throws Exception {

        if (shouldRoll()) {
            openNewPartFile();
        }

        writer.write(value);
    }

    /**
     * Determines whether we should change the bucket file we are writing to.
     *
     * <p>
     * This will roll if no file was created yet, if the file size is larger than the specified size
     * or if the {@code Bucketer} determines that we should roll.
     *
     * @return {@code true} if a new part file must be opened before the next write
     * @throws IOException if querying the writer position fails
     */
    private boolean shouldRoll() throws IOException {
        boolean shouldRoll = false;
        if (!isWriterOpen) {
            shouldRoll = true;
            LOG.debug("RollingSink {} starting new initial bucket. ", subtaskIndex);
        }
        if (bucketer.shouldStartNewBucket(new Path(basePath), currentBucketDirectory)) {
            shouldRoll = true;
            LOG.debug("RollingSink {} starting new bucket because {} said we should. ", subtaskIndex, bucketer);
            // we will retrieve a new bucket base path in openNewPartFile so reset the part counter
            partCounter = 0;
        }
        if (isWriterOpen) {
            // the enclosing check already guarantees the writer is open,
            // so the redundant re-test of isWriterOpen was removed
            long writePosition = writer.getPos();
            if (writePosition > batchSize) {
                shouldRoll = true;
                LOG.debug("RollingSink {} starting new bucket because file position {} is above batch size {}.",
                        subtaskIndex, writePosition, batchSize);
            }
        }
        return shouldRoll;
    }

    /**
     * Opens a new part file.
     *
     * <p>
     * This closes the old bucket file and retrieves a new bucket path from the {@code Bucketer}.
     * The new part file name is probed until one is found that neither exists nor has a
     * pending counterpart, then the writer is opened on its in-progress path.
     *
     * @throws Exception if closing the previous file, creating directories, or opening
     *         the writer fails
     */
    private void openNewPartFile() throws Exception {
        closeCurrentPartFile();

        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();

        FileSystem fs = new Path(basePath).getFileSystem(conf);

        Path newBucketDirectory = bucketer.getNextBucketPath(new Path(basePath));

        if (!newBucketDirectory.equals(currentBucketDirectory)) {
            currentBucketDirectory = newBucketDirectory;
            try {
                // mkdirs() returning false here simply means the directory already existed
                if (fs.mkdirs(currentBucketDirectory)) {
                    LOG.debug("Created new bucket directory: {}", currentBucketDirectory);
                }
            } catch (IOException e) {
                throw new RuntimeException("Could not create base path for new rolling file.", e);
            }
        }

        currentPartPath = new Path(currentBucketDirectory, partPrefix + "-" + subtaskIndex + "-" + partCounter);

        // This should work since there is only one parallel subtask that tries names with
        // our subtask id. Otherwise we would run into concurrency issues here.
        // Probes both the final name and its pending-renamed twin so we never clobber either.
        while (fs.exists(currentPartPath)
                || fs.exists(new Path(currentPartPath.getParent(), pendingPrefix + currentPartPath.getName())
                        .suffix(pendingSuffix))) {
            partCounter++;
            currentPartPath = new Path(currentBucketDirectory, partPrefix + "-" + subtaskIndex + "-" + partCounter);
        }

        // increase, so we don't have to check for this name next time
        partCounter++;

        LOG.debug("Next part path is {}", currentPartPath.toString());

        // data is written under the in-progress prefix/suffix until the file is closed
        Path inProgressPath = new Path(currentPartPath.getParent(), inProgressPrefix + currentPartPath.getName())
                .suffix(inProgressSuffix);

        writer.open(fs, inProgressPath);
        isWriterOpen = true;
    }

    /**
     * Closes the current part file.
     *
     * <p>
     * This moves the current in-progress part file to a pending file and adds it to the list
     * of pending files in our bucket state, to be finalized when the next checkpoint completes.
     *
     * @throws Exception if closing the writer fails
     */
    private void closeCurrentPartFile() throws Exception {
        if (isWriterOpen) {
            writer.close();
            isWriterOpen = false;
        }

        if (currentPartPath != null) {
            Path inProgressPath = new Path(currentPartPath.getParent(),
                    inProgressPrefix + currentPartPath.getName()).suffix(inProgressSuffix);
            Path pendingPath = new Path(currentPartPath.getParent(), pendingPrefix + currentPartPath.getName())
                    .suffix(pendingSuffix);
            FileSystem fs = inProgressPath.getFileSystem(new org.apache.hadoop.conf.Configuration());
            LOG.debug("Moving in-progress bucket {} to pending file {}", inProgressPath, pendingPath);
            // FileSystem.rename() reports failure via its return value, not an exception;
            // previously this was silently ignored
            if (!fs.rename(inProgressPath, pendingPath)) {
                LOG.warn("Could not rename in-progress file {} to pending file {}.",
                        inProgressPath, pendingPath);
            }
            this.bucketState.pendingFiles.add(currentPartPath.toString());
        }
    }

    /**
     * Gets the truncate() call using reflection.
     *
     * <p>
     * Note: This code comes from Flume
     *
     * <p>
     * truncate(Path, long) only exists in Hadoop 2.7+, so it is looked up reflectively
     * and then verified by actually truncating a small test file. Returns {@code null}
     * when the method is absent or does not work on this FileSystem.
     */
    private Method reflectTruncate(FileSystem fs) {
        Method m = null;
        if (fs != null) {
            Class<?> fsClass = fs.getClass();
            try {
                m = fsClass.getMethod("truncate", Path.class, long.class);
            } catch (NoSuchMethodException ex) {
                LOG.debug(
                        "Truncate not found. Will write a file with suffix '{}' "
                                + " and prefix '{}' to specify how many bytes in a bucket are valid.",
                        validLengthSuffix, validLengthPrefix);
                return null;
            }

            // verify that truncate actually works
            // NOTE(review): the test path is relative, so it is created in the
            // FileSystem's working directory — confirm this is intended
            FSDataOutputStream outputStream;
            Path testPath = new Path(UUID.randomUUID().toString());
            try {
                outputStream = fs.create(testPath);
                outputStream.writeUTF("hello");
                outputStream.close();
            } catch (IOException e) {
                LOG.error("Could not create file for checking if truncate works.", e);
                throw new RuntimeException("Could not create file for checking if truncate works.", e);
            }

            try {
                // the int argument 2 is widened to long by Method.invoke
                m.invoke(fs, testPath, 2);
            } catch (IllegalAccessException | InvocationTargetException e) {
                // the method exists but this FileSystem implementation rejects it
                LOG.debug("Truncate is not supported.", e);
                m = null;
            }

            try {
                fs.delete(testPath, false);
            } catch (IOException e) {
                LOG.error("Could not delete truncate test file.", e);
                throw new RuntimeException("Could not delete truncate test file.", e);
            }
        }
        return m;
    }

    /**
     * Called when a checkpoint has been completed by all tasks.
     *
     * <p>
     * Moves every pending file that belongs to this or an earlier checkpoint to its
     * final name and drops those checkpoint entries from the bucket state.
     */
    @Override
    public void notifyCheckpointComplete(long checkpointId) throws Exception {
        synchronized (bucketState.pendingFilesPerCheckpoint) {
            // walk the recorded checkpoints and finalize everything up to checkpointId,
            // removing finalized checkpoints in place through the key-set iterator
            java.util.Iterator<Long> pastCheckpointIds =
                    bucketState.pendingFilesPerCheckpoint.keySet().iterator();
            while (pastCheckpointIds.hasNext()) {
                Long pastCheckpointId = pastCheckpointIds.next();
                if (pastCheckpointId <= checkpointId) {
                    LOG.debug("Moving pending files to final location for checkpoint {}", pastCheckpointId);
                    // All the pending files are buckets that have been completed but are waiting to be renamed
                    // to their final name
                    for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                        Path finalPath = new Path(filename);
                        Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                                .suffix(pendingSuffix);

                        FileSystem fs = pendingPath.getFileSystem(new org.apache.hadoop.conf.Configuration());
                        fs.rename(pendingPath, finalPath);
                        LOG.debug("Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                    }
                    // removing from the key-set view removes the mapping from the map
                    pastCheckpointIds.remove();
                }
            }
        }
    }

    /**
     * Draws a state snapshot for checkpoint {@code checkpointId}.
     *
     * <p>
     * Flushes the open writer so {@code currentFileValidLength} records how many bytes of
     * the in-progress file this checkpoint covers, then files the currently pending part
     * files under this checkpoint id and starts a fresh pending list.
     */
    @Override
    public BucketState snapshotState(long checkpointId, long checkpointTimestamp) throws Exception {
        if (isWriterOpen) {
            // flush so that the valid length reflects everything written so far
            long pos = writer.flush();
            bucketState.currentFile = currentPartPath.toString();
            bucketState.currentFileValidLength = pos;
        }
        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.put(checkpointId, bucketState.pendingFiles);
        }
        // collect pending files for the next checkpoint interval in a new list
        bucketState.pendingFiles = new ArrayList<>();
        return bucketState;
    }

    /**
     * Restores the sink state after a failure.
     *
     * <p>
     * Three things happen here: (1) the file that was in-progress at checkpoint time is
     * truncated back to its valid length (or a ".valid-length" companion file is written
     * when the FileSystem does not support truncate) and moved to its final name; (2)
     * pending files of already-completed checkpoints, whose completion notification was
     * lost in the failure, are moved to their final names; (3) all remaining
     * pending/in-progress files belonging to this subtask are deleted.
     *
     * @param state the {@code BucketState} captured by the last successful checkpoint
     */
    @Override
    public void restoreState(BucketState state) {
        bucketState = state;
        // we can clean all the pending files since they were renamed to final files
        // after this checkpoint was successful
        bucketState.pendingFiles.clear();
        FileSystem fs = null;
        try {
            fs = new Path(basePath).getFileSystem(new org.apache.hadoop.conf.Configuration());
        } catch (IOException e) {
            LOG.error("Error while creating FileSystem in checkpoint restore.", e);
            throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e);
        }
        if (bucketState.currentFile != null) {
            // We were writing to a file when the last checkpoint occurred. This file can either
            // be still in-progress or became a pending file at some point after the checkpoint.
            // Either way, we have to truncate it back to a valid state (or write a .valid-length)
            // file that specifies up to which length it is valid and rename it to the final name
            // before starting a new bucket file.
            Path partPath = new Path(bucketState.currentFile);
            try {
                Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName())
                        .suffix(pendingSuffix);
                Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
                        .suffix(inProgressSuffix);

                if (fs.exists(partPendingPath)) {
                    LOG.debug(
                            "In-progress file {} has been moved to pending after checkpoint, moving to final location.",
                            partPath);
                    // has been moved to pending in the mean time, rename to final location
                    fs.rename(partPendingPath, partPath);
                } else if (fs.exists(partInProgressPath)) {
                    LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                    // it was still in progress, rename to final path
                    fs.rename(partInProgressPath, partPath);
                } else if (fs.exists(partPath)) {
                    LOG.debug("In-Progress file {} was already moved to final location {}.",
                            bucketState.currentFile, partPath);
                } else {
                    LOG.debug(
                            "In-Progress file {} was neither moved to pending nor is still in progress. Possibly, "
                                    + "it was moved to final location by a previous snapshot restore",
                            bucketState.currentFile);
                }

                // open() has not run yet, so determine truncate() support here
                refTruncate = reflectTruncate(fs);
                // truncate it or write a ".valid-length" file to specify up to which point it is valid
                if (refTruncate != null) {
                    LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength);
                    // some-one else might still hold the lease from a previous try, we are
                    // recovering, after all ...
                    if (fs instanceof DistributedFileSystem) {
                        DistributedFileSystem dfs = (DistributedFileSystem) fs;
                        LOG.debug("Trying to recover file lease {}", partPath);
                        dfs.recoverLease(partPath);
                        boolean isclosed = dfs.isFileClosed(partPath);
                        StopWatch sw = new StopWatch();
                        sw.start();
                        // lease recovery is asynchronous; poll until the file is closed or we time out
                        while (!isclosed) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            isclosed = dfs.isFileClosed(partPath);
                        }
                    }
                    Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath,
                            bucketState.currentFileValidLength);
                    if (!truncated) {
                        LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);

                        // we must wait for the asynchronous truncate operation to complete
                        StopWatch sw = new StopWatch();
                        sw.start();
                        long newLen = fs.getFileStatus(partPath).getLen();
                        while (newLen != bucketState.currentFileValidLength) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            newLen = fs.getFileStatus(partPath).getLen();
                        }
                        if (newLen != bucketState.currentFileValidLength) {
                            throw new RuntimeException("Truncate did not truncate to right length. Should be "
                                    + bucketState.currentFileValidLength + " is " + newLen + ".");
                        }
                    }

                } else {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath,
                            bucketState.currentFileValidLength);
                    Path validLengthFilePath = new Path(partPath.getParent(),
                            validLengthPrefix + partPath.getName()).suffix(validLengthSuffix);
                    if (!fs.exists(validLengthFilePath)) {
                        FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath);
                        lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength));
                        lengthFileOut.close();
                    }
                }

                // invalidate in the state object
                bucketState.currentFile = null;
                bucketState.currentFileValidLength = -1;
            } catch (IOException e) {
                LOG.error("Error while restoring RollingSink state.", e);
                throw new RuntimeException("Error while restoring RollingSink state.", e);
            } catch (InvocationTargetException | IllegalAccessException e) {
                LOG.error("Could not invoke truncate.", e);
                throw new RuntimeException("Could not invoke truncate.", e);
            }
        }

        LOG.debug("Clearing pending/in-progress files.");

        // Move files that are confirmed by a checkpoint but did not get moved to final location
        // because the checkpoint notification did not happen before a failure

        Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
        LOG.debug("Moving pending files to final location on restore.");
        for (Long pastCheckpointId : pastCheckpointIds) {
            // All the pending files are buckets that have been completed but are waiting to be renamed
            // to their final name
            for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                Path finalPath = new Path(filename);
                Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                        .suffix(pendingSuffix);

                try {
                    if (fs.exists(pendingPath)) {
                        LOG.debug(
                                "(RESTORE) Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                        fs.rename(pendingPath, finalPath);
                    }
                } catch (IOException e) {
                    // leave the exception out of the placeholders so SLF4J treats it as the
                    // throwable and logs the full stack trace
                    LOG.error("(RESTORE) Error while renaming pending file {} to final path {}.", pendingPath,
                            finalPath, e);
                    throw new RuntimeException(
                            "Error while renaming pending file " + pendingPath + " to final path " + finalPath, e);
                }
            }
        }
        bucketState.pendingFiles.clear();
        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.clear();
        }

        // we need to get this here since open() has not yet been called
        int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
        // delete pending files
        try {

            RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);

            while (bucketFiles.hasNext()) {
                LocatedFileStatus file = bucketFiles.next();
                if (file.getPath().toString().endsWith(pendingSuffix)) {
                    // only delete files that contain our subtask index
                    if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                        LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString());
                        fs.delete(file.getPath(), true);
                    }
                }
                if (file.getPath().toString().endsWith(inProgressSuffix)) {
                    // only delete files that contain our subtask index
                    if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                        LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString());
                        fs.delete(file.getPath(), true);
                    }
                }
            }
        } catch (IOException e) {
            // pass the exception as the throwable argument so the stack trace is logged
            LOG.error("Error while deleting old pending files.", e);
            throw new RuntimeException("Error while deleting old pending files.", e);
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Setters for User configuration values
    // --------------------------------------------------------------------------------------------

    /**
     * Sets the maximum bucket size in bytes.
     *
     * <p>
     * When a bucket part file becomes larger than this size a new bucket part file is started and
     * the old one is closed. The name of the bucket files depends on the {@link Bucketer}.
     *
     * @param batchSize The bucket part file size in bytes.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setBatchSize(long batchSize) {
        this.batchSize = batchSize;
        return this;
    }

    /**
     * Sets the {@link Bucketer} to use for determining the bucket files to write to.
     *
     * @param bucketer The bucketer to use.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setBucketer(Bucketer bucketer) {
        this.bucketer = bucketer;
        return this;
    }

    /**
     * Sets the {@link Writer} to be used for writing the incoming elements to bucket files.
     *
     * <p>
     * The given writer serves as a template; it is duplicated for each part file that is opened.
     *
     * @param writer The {@code Writer} to use.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setWriter(Writer<T> writer) {
        this.writerTemplate = writer;
        return this;
    }

    /**
     * Sets the suffix of in-progress part files. The default is {@code "in-progress"}.
     *
     * @param inProgressSuffix The suffix to append to in-progress part file names.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setInProgressSuffix(String inProgressSuffix) {
        this.inProgressSuffix = inProgressSuffix;
        return this;
    }

    /**
     * Sets the prefix of in-progress part files. The default is {@code "_"}.
     *
     * @param inProgressPrefix The prefix to prepend to in-progress part file names.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setInProgressPrefix(String inProgressPrefix) {
        this.inProgressPrefix = inProgressPrefix;
        return this;
    }

    /**
     * Sets the suffix of pending part files. The default is {@code ".pending"}.
     *
     * @param pendingSuffix The suffix to append to pending part file names.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setPendingSuffix(String pendingSuffix) {
        this.pendingSuffix = pendingSuffix;
        return this;
    }

    /**
     * Sets the prefix of pending part files. The default is {@code "_"}.
     *
     * @param pendingPrefix The prefix to prepend to pending part file names.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setPendingPrefix(String pendingPrefix) {
        this.pendingPrefix = pendingPrefix;
        return this;
    }

    /**
     * Sets the suffix of valid-length files. The default is {@code ".valid-length"}.
     *
     * @param validLengthSuffix The suffix to append to valid-length file names.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setValidLengthSuffix(String validLengthSuffix) {
        this.validLengthSuffix = validLengthSuffix;
        return this;
    }

    /**
     * Sets the prefix of valid-length files. The default is {@code "_"}.
     *
     * @param validLengthPrefix The prefix to prepend to valid-length file names.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setValidLengthPrefix(String validLengthPrefix) {
        this.validLengthPrefix = validLengthPrefix;
        return this;
    }

    /**
     * Sets the prefix of part files. The default is {@code "part"}.
     *
     * @param partPrefix The prefix for part file names.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setPartPrefix(String partPrefix) {
        this.partPrefix = partPrefix;
        return this;
    }

    /**
     * Disable cleanup of leftover in-progress/pending files when the sink is opened.
     *
     * <p>
     * This should only be disabled if using the sink without checkpoints, to not remove
     * the files already in the directory.
     *
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> disableCleanupOnOpen() {
        this.cleanupOnOpen = false;
        return this;
    }

    /**
     * Sets the default timeout for asynchronous operations such as recoverLease and truncate.
     *
     * @param timeout The timeout, in milliseconds.
     * @return This sink, to allow chaining of configuration calls.
     */
    public RollingSink<T> setAsyncTimeout(long timeout) {
        this.asyncTimeout = timeout;
        return this;
    }

    // --------------------------------------------------------------------------------------------
    //  Internal Classes
    // --------------------------------------------------------------------------------------------

    /**
     * This is used for keeping track of the current in-progress files and files that we mark
     * for moving from pending to final location after we get a checkpoint-complete notification.
     *
     * <p>
     * Instances are Java-serialized as part of the sink's checkpointed state; any change to the
     * fields must stay compatible with the declared {@code serialVersionUID}.
     */
    static final class BucketState implements Serializable {
        private static final long serialVersionUID = 1L;

        /**
         * The file that was in-progress when the last checkpoint occurred.
         * {@code null} when no in-progress file has been recorded.
         */
        String currentFile = null;

        /**
         * The valid length of the in-progress file at the time of the last checkpoint.
         * Initialized to -1, meaning no valid length has been recorded yet.
         */
        long currentFileValidLength = -1;

        /**
         * Pending files that accumulated since the last checkpoint.
         */
        List<String> pendingFiles = new ArrayList<>();

        /**
         * When doing a checkpoint we move the pending files since the last checkpoint to this map
         * with the id of the checkpoint. When we get the checkpoint-complete notification we move
         * pending files of completed checkpoints to their final location.
         */
        final Map<Long, List<String>> pendingFilesPerCheckpoint = new HashMap<>();
    }
}