org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkTest.java Source code

Introduction

Here is the source code for org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkTest.java, a JUnit test suite for Flink's BucketingSink. The tests cover part-file state transitions (in-progress, pending, committed), checkpointing and rescaling behavior, and several Writer implementations (StringWriter, SequenceFileWriter, AvroKeyValueSinkWriter) against an embedded MiniDFSCluster.
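
For orientation before the full listing, here is a minimal sketch of how a BucketingSink is typically wired into a streaming job. The source, output path, bucketing format, and checkpoint interval are illustrative placeholders, not values taken from the test below:

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.StringWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;

public class BucketingSinkExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Pending part files are committed when a checkpoint completes.
        env.enableCheckpointing(5000L);

        // Illustrative source; any DataStream<String> would do.
        DataStream<String> lines = env.socketTextStream("localhost", 9999);

        // One bucket directory per hour; elements are written as newline-separated strings.
        BucketingSink<String> sink = new BucketingSink<String>("hdfs:///tmp/bucketing-out")
                .setBucketer(new DateTimeBucketer<String>("yyyy-MM-dd--HH"))
                .setWriter(new StringWriter<String>());

        lines.addSink(sink);
        env.execute("BucketingSink example");
    }
}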

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.streaming.connectors.fs.bucketing;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.operators.StreamSink;
import org.apache.flink.streaming.connectors.fs.AvroKeyValueSinkWriter;
import org.apache.flink.streaming.connectors.fs.Clock;
import org.apache.flink.streaming.connectors.fs.SequenceFileWriter;
import org.apache.flink.streaming.connectors.fs.StringWriter;
import org.apache.flink.streaming.connectors.fs.Writer;
import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
import org.apache.flink.streaming.runtime.tasks.OperatorStateHandles;
import org.apache.flink.streaming.util.AbstractStreamOperatorTestHarness;
import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
import org.apache.flink.util.NetUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class BucketingSinkTest {
    @ClassRule
    public static TemporaryFolder tempFolder = new TemporaryFolder();

    private static MiniDFSCluster hdfsCluster;
    private static FileSystem dfs;
    private static String hdfsURI;

    private static final String PART_PREFIX = "part";
    private static final String PENDING_SUFFIX = ".pending";
    private static final String IN_PROGRESS_SUFFIX = ".in-progress";
    private static final String VALID_LENGTH_SUFFIX = ".valid";

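    /**
     * Creates a test harness around a {@link BucketingSink} that buckets by element value and
     * closes inactive buckets after the given inactivity interval. Used by the rescaling tests.
     */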
    private OneInputStreamOperatorTestHarness<String, Object> createRescalingTestSink(File outDir,
            int totalParallelism, int taskIdx, long inactivityInterval) throws Exception {

        BucketingSink<String> sink = new BucketingSink<String>(outDir.getAbsolutePath())
                .setBucketer(new Bucketer<String>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Path getBucketPath(Clock clock, Path basePath, String element) {
                        return new Path(basePath, element);
                    }
                }).setWriter(new StringWriter<String>()).setInactiveBucketCheckInterval(inactivityInterval)
                .setInactiveBucketThreshold(inactivityInterval).setPartPrefix(PART_PREFIX).setInProgressPrefix("")
                .setPendingPrefix("").setValidLengthPrefix("").setInProgressSuffix(IN_PROGRESS_SUFFIX)
                .setPendingSuffix(PENDING_SUFFIX).setValidLengthSuffix(VALID_LENGTH_SUFFIX);

        return createTestSink(sink, totalParallelism, taskIdx);
    }

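    /**
     * Creates a test harness around a {@link BucketingSink} that buckets by element value and
     * uses 5 minute inactivity settings, so buckets do not close mid-test.
     */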
    private OneInputStreamOperatorTestHarness<String, Object> createTestSink(File dataDir, int totalParallelism,
            int taskIdx) throws Exception {
        BucketingSink<String> sink = new BucketingSink<String>(dataDir.getAbsolutePath())
                .setBucketer(new Bucketer<String>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Path getBucketPath(Clock clock, Path basePath, String element) {
                        return new Path(basePath, element);
                    }
                }).setWriter(new StringWriter<String>()).setPartPrefix(PART_PREFIX).setPendingPrefix("")
                .setInactiveBucketCheckInterval(5 * 60 * 1000L).setInactiveBucketThreshold(5 * 60 * 1000L)
                .setPendingSuffix(PENDING_SUFFIX).setInProgressSuffix(IN_PROGRESS_SUFFIX);

        return createTestSink(sink, totalParallelism, taskIdx);
    }

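    /**
     * Wraps the given sink in a {@link StreamSink} and returns a one-input operator test
     * harness with a maximum parallelism of 10.
     */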
    private <T> OneInputStreamOperatorTestHarness<T, Object> createTestSink(BucketingSink<T> sink,
            int totalParallelism, int taskIdx) throws Exception {
        return new OneInputStreamOperatorTestHarness<>(new StreamSink<>(sink), 10, totalParallelism, taskIdx);
    }

    @BeforeClass
    public static void createHDFS() throws IOException {
        Configuration conf = new Configuration();

        File dataDir = tempFolder.newFolder();

        conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, dataDir.getAbsolutePath());
        MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
        hdfsCluster = builder.build();

        dfs = hdfsCluster.getFileSystem();

        hdfsURI = "hdfs://"
                + NetUtils.hostAndPortToUrlString(hdfsCluster.getURI().getHost(), hdfsCluster.getNameNodePort())
                + "/";
    }

    @AfterClass
    public static void destroyHDFS() {
        if (hdfsCluster != null) {
            hdfsCluster.shutdown();
        }
    }

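    /**
     * Verifies that part files moved to pending by the inactivity check are only committed
     * once the checkpoint they were registered with completes, even when that notification
     * arrives after later snapshots have been taken.
     */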
    @Test
    public void testInactivityPeriodWithLateNotify() throws Exception {
        final File outDir = tempFolder.newFolder();

        OneInputStreamOperatorTestHarness<String, Object> testHarness = createRescalingTestSink(outDir, 1, 0, 100);
        testHarness.setup();
        testHarness.open();

        testHarness.setProcessingTime(0L);

        testHarness.processElement(new StreamRecord<>("test1", 1L));
        testHarness.processElement(new StreamRecord<>("test2", 1L));
        checkFs(outDir, 2, 0, 0, 0);

        testHarness.setProcessingTime(101L); // inactivity check moves the in-progress files to pending
        checkFs(outDir, 0, 2, 0, 0);

        testHarness.snapshot(0, 0); // register the pending files with checkpoint 0
        checkFs(outDir, 0, 2, 0, 0);

        testHarness.processElement(new StreamRecord<>("test3", 1L));
        testHarness.processElement(new StreamRecord<>("test4", 1L));

        testHarness.setProcessingTime(202L); // inactivity check moves the new files to pending

        testHarness.snapshot(1, 0); // register them with checkpoint 1
        checkFs(outDir, 0, 4, 0, 0);

        testHarness.notifyOfCompletedCheckpoint(0); // commit the pending files of checkpoint 0
        checkFs(outDir, 0, 2, 2, 0);

        testHarness.notifyOfCompletedCheckpoint(1); // commit the pending files of checkpoint 1
        checkFs(outDir, 0, 0, 4, 0);
    }

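    /**
     * Walks part files through the in-progress, pending, and committed states, including a
     * restore from a snapshot and the handling of leftover files from a previous run.
     */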
    @Test
    public void testBucketStateTransitions() throws Exception {
        final File outDir = tempFolder.newFolder();

        OneInputStreamOperatorTestHarness<String, Object> testHarness = createRescalingTestSink(outDir, 1, 0, 100);
        testHarness.setup();
        testHarness.open();

        testHarness.setProcessingTime(0L);

        testHarness.processElement(new StreamRecord<>("test1", 1L));
        testHarness.processElement(new StreamRecord<>("test2", 1L));
        checkFs(outDir, 2, 0, 0, 0);

        // this is to check the inactivity threshold
        testHarness.setProcessingTime(101L);
        checkFs(outDir, 0, 2, 0, 0);

        testHarness.processElement(new StreamRecord<>("test3", 1L));
        checkFs(outDir, 1, 2, 0, 0);

        testHarness.snapshot(0, 0);
        checkFs(outDir, 1, 2, 0, 0);

        testHarness.notifyOfCompletedCheckpoint(0);
        checkFs(outDir, 1, 0, 2, 0);

        OperatorStateHandles snapshot = testHarness.snapshot(1, 0);

        testHarness.close();
        checkFs(outDir, 0, 1, 2, 0);

        testHarness = createRescalingTestSink(outDir, 1, 0, 100);
        testHarness.setup();
        testHarness.initializeState(snapshot);
        testHarness.open();
        checkFs(outDir, 0, 0, 3, 1);

        snapshot = testHarness.snapshot(2, 0);

        testHarness.processElement(new StreamRecord<>("test4", 10));
        checkFs(outDir, 1, 0, 3, 1);

        testHarness = createRescalingTestSink(outDir, 1, 0, 100);
        testHarness.setup();
        testHarness.initializeState(snapshot);
        testHarness.open();

        // the in-progress file remains as we do not clean up now
        checkFs(outDir, 1, 0, 3, 1);

        testHarness.close();

        // at close it is not moved to final because it is not part of the
        // current task's state; it is just a leftover that was never cleaned up.
        checkFs(outDir, 1, 0, 3, 1);
    }

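    /**
     * Restores a repackaged (shuffled) snapshot at the same parallelism and verifies that the
     * restored in-progress files are finalized with valid-length markers.
     */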
    @Test
    public void testSameParallelismWithShufflingStates() throws Exception {
        final File outDir = tempFolder.newFolder();

        OneInputStreamOperatorTestHarness<String, Object> testHarness1 = createRescalingTestSink(outDir, 2, 0, 100);
        testHarness1.setup();
        testHarness1.open();

        OneInputStreamOperatorTestHarness<String, Object> testHarness2 = createRescalingTestSink(outDir, 2, 1, 100);
        testHarness2.setup();
        testHarness2.open();

        testHarness1.processElement(new StreamRecord<>("test1", 0L));
        checkFs(outDir, 1, 0, 0, 0);

        testHarness2.processElement(new StreamRecord<>("test2", 0L));
        checkFs(outDir, 2, 0, 0, 0);

        // intentionally we snapshot them in the reverse order so that the states are shuffled
        OperatorStateHandles mergedSnapshot = AbstractStreamOperatorTestHarness
                .repackageState(testHarness2.snapshot(0, 0), testHarness1.snapshot(0, 0));

        checkFs(outDir, 2, 0, 0, 0);

        // this element is not part of any checkpoint, so its file could be cleaned up (although we do not)
        testHarness2.processElement(new StreamRecord<>("test3", 0L));
        checkFs(outDir, 3, 0, 0, 0);

        testHarness1 = createRescalingTestSink(outDir, 2, 0, 100);
        testHarness1.setup();
        testHarness1.initializeState(mergedSnapshot);
        testHarness1.open();

        // one in-progress file belongs to the state assigned to the next instance,
        // the other is the test3 file, which was simply never cleaned up
        checkFs(outDir, 2, 0, 1, 1);

        testHarness2 = createRescalingTestSink(outDir, 2, 1, 100);
        testHarness2.setup();
        testHarness2.initializeState(mergedSnapshot);
        testHarness2.open();

        checkFs(outDir, 1, 0, 2, 2);

        testHarness1.close();
        testHarness2.close();

        // the one remaining in-progress file is the test3 leftover and could safely be discarded.
        checkFs(outDir, 1, 0, 2, 2);
    }

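    /**
     * Snapshots three sink instances and restores their shuffled state into two instances,
     * verifying that all restored in-progress files are finalized.
     */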
    @Test
    public void testScalingDown() throws Exception {
        final File outDir = tempFolder.newFolder();

        OneInputStreamOperatorTestHarness<String, Object> testHarness1 = createRescalingTestSink(outDir, 3, 0, 100);
        testHarness1.setup();
        testHarness1.open();

        OneInputStreamOperatorTestHarness<String, Object> testHarness2 = createRescalingTestSink(outDir, 3, 1, 100);
        testHarness2.setup();
        testHarness2.open();

        OneInputStreamOperatorTestHarness<String, Object> testHarness3 = createRescalingTestSink(outDir, 3, 2, 100);
        testHarness3.setup();
        testHarness3.open();

        testHarness1.processElement(new StreamRecord<>("test1", 0L));
        checkFs(outDir, 1, 0, 0, 0);

        testHarness2.processElement(new StreamRecord<>("test2", 0L));
        checkFs(outDir, 2, 0, 0, 0);

        testHarness3.processElement(new StreamRecord<>("test3", 0L));
        testHarness3.processElement(new StreamRecord<>("test4", 0L));
        checkFs(outDir, 4, 0, 0, 0);

        // intentionally we snapshot them in the reverse order so that the states are shuffled
        OperatorStateHandles mergedSnapshot = AbstractStreamOperatorTestHarness.repackageState(
                testHarness3.snapshot(0, 0), testHarness1.snapshot(0, 0), testHarness2.snapshot(0, 0));

        testHarness1 = createRescalingTestSink(outDir, 2, 0, 100);
        testHarness1.setup();
        testHarness1.initializeState(mergedSnapshot);
        testHarness1.open();

        checkFs(outDir, 1, 0, 3, 3);

        testHarness2 = createRescalingTestSink(outDir, 2, 1, 100);
        testHarness2.setup();
        testHarness2.initializeState(mergedSnapshot);
        testHarness2.open();

        checkFs(outDir, 0, 0, 4, 4);
    }

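    /**
     * Snapshots two sink instances and restores their shuffled state into three instances,
     * verifying that restored files are finalized and new files can still be written.
     */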
    @Test
    public void testScalingUp() throws Exception {
        final File outDir = tempFolder.newFolder();

        OneInputStreamOperatorTestHarness<String, Object> testHarness1 = createRescalingTestSink(outDir, 2, 0, 100);
        testHarness1.setup();
        testHarness1.open();

        OneInputStreamOperatorTestHarness<String, Object> testHarness2 = createRescalingTestSink(outDir, 2, 0, 100);
        testHarness2.setup();
        testHarness2.open();

        testHarness1.processElement(new StreamRecord<>("test1", 1L));
        testHarness1.processElement(new StreamRecord<>("test2", 1L));

        checkFs(outDir, 2, 0, 0, 0);

        testHarness2.processElement(new StreamRecord<>("test3", 1L));
        testHarness2.processElement(new StreamRecord<>("test4", 1L));
        testHarness2.processElement(new StreamRecord<>("test5", 1L));

        checkFs(outDir, 5, 0, 0, 0);

        // intentionally we snapshot them in the reverse order so that the states are shuffled
        OperatorStateHandles mergedSnapshot = AbstractStreamOperatorTestHarness
                .repackageState(testHarness2.snapshot(0, 0), testHarness1.snapshot(0, 0));

        testHarness1 = createRescalingTestSink(outDir, 3, 0, 100);
        testHarness1.setup();
        testHarness1.initializeState(mergedSnapshot);
        testHarness1.open();

        checkFs(outDir, 2, 0, 3, 3);

        testHarness2 = createRescalingTestSink(outDir, 3, 1, 100);
        testHarness2.setup();
        testHarness2.initializeState(mergedSnapshot);
        testHarness2.open();

        checkFs(outDir, 0, 0, 5, 5);

        OneInputStreamOperatorTestHarness<String, Object> testHarness3 = createRescalingTestSink(outDir, 3, 2, 100);
        testHarness3.setup();
        testHarness3.initializeState(mergedSnapshot);
        testHarness3.open();

        checkFs(outDir, 0, 0, 5, 5);

        testHarness1.processElement(new StreamRecord<>("test6", 0));
        testHarness2.processElement(new StreamRecord<>("test6", 0));
        testHarness3.processElement(new StreamRecord<>("test6", 0));

        checkFs(outDir, 3, 0, 5, 5);

        testHarness1.snapshot(1, 0);
        testHarness2.snapshot(1, 0);
        testHarness3.snapshot(1, 0);

        testHarness1.close();
        testHarness2.close();
        testHarness3.close();

        checkFs(outDir, 0, 3, 5, 5);
    }

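    /**
     * Asserts that {@code outDir} contains exactly the given numbers of in-progress, pending,
     * completed, and valid-length files, ignoring checksum (.crc) files.
     */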
    private void checkFs(File outDir, int inprogress, int pending, int completed, int valid) throws IOException {
        int inProg = 0;
        int pend = 0;
        int compl = 0;
        int val = 0;

        for (File file : FileUtils.listFiles(outDir, null, true)) {
            if (file.getAbsolutePath().endsWith("crc")) {
                continue;
            }
            String path = file.getPath();
            if (path.endsWith(IN_PROGRESS_SUFFIX)) {
                inProg++;
            } else if (path.endsWith(PENDING_SUFFIX)) {
                pend++;
            } else if (path.endsWith(VALID_LENGTH_SUFFIX)) {
                val++;
            } else if (path.contains(PART_PREFIX)) {
                compl++;
            }
        }

        Assert.assertEquals(inprogress, inProg);
        Assert.assertEquals(pending, pend);
        Assert.assertEquals(completed, compl);
        Assert.assertEquals(valid, val);
    }

    /**
     * This tests {@link StringWriter} with
     * non-rolling output.
     */
    @Test
    public void testNonRollingStringWriter() throws Exception {
        final String outPath = hdfsURI + "/string-non-rolling-out";

        final int numElements = 20;

        BucketingSink<String> sink = new BucketingSink<String>(outPath).setBucketer(new BasePathBucketer<String>())
                .setPartPrefix(PART_PREFIX).setPendingPrefix("").setPendingSuffix("");

        OneInputStreamOperatorTestHarness<String, Object> testHarness = createTestSink(sink, 1, 0);

        testHarness.setProcessingTime(0L);

        testHarness.setup();
        testHarness.open();

        for (int i = 0; i < numElements; i++) {
            testHarness.processElement(new StreamRecord<>("message #" + Integer.toString(i)));
        }

        testHarness.close();

        FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

        BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

        for (int i = 0; i < numElements; i++) {
            String line = br.readLine();
            Assert.assertEquals("message #" + i, line);
        }

        inStream.close();
    }

    /**
     * This tests {@link SequenceFileWriter}
     * with non-rolling output and without compression.
     */
    @Test
    public void testNonRollingSequenceFileWithoutCompressionWriter() throws Exception {
        final String outPath = hdfsURI + "/seq-no-comp-non-rolling-out";

        final int numElements = 20;

        BucketingSink<Tuple2<IntWritable, Text>> sink = new BucketingSink<Tuple2<IntWritable, Text>>(outPath)
                .setWriter(new SequenceFileWriter<IntWritable, Text>())
                .setBucketer(new BasePathBucketer<Tuple2<IntWritable, Text>>()).setPartPrefix(PART_PREFIX)
                .setPendingPrefix("").setPendingSuffix("");

        sink.setInputType(TypeInformation.of(new TypeHint<Tuple2<IntWritable, Text>>() {
        }), new ExecutionConfig());

        OneInputStreamOperatorTestHarness<Tuple2<IntWritable, Text>, Object> testHarness = createTestSink(sink, 1,
                0);

        testHarness.setProcessingTime(0L);

        testHarness.setup();
        testHarness.open();

        for (int i = 0; i < numElements; i++) {
            testHarness.processElement(
                    new StreamRecord<>(Tuple2.of(new IntWritable(i), new Text("message #" + i))));
        }

        testHarness.close();

        FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

        SequenceFile.Reader reader = new SequenceFile.Reader(inStream, 1000, 0, 100000, new Configuration());

        IntWritable intWritable = new IntWritable();
        Text txt = new Text();

        for (int i = 0; i < numElements; i++) {
            reader.next(intWritable, txt);
            Assert.assertEquals(i, intWritable.get());
            Assert.assertEquals("message #" + i, txt.toString());
        }

        reader.close();
        inStream.close();
    }

    /**
     * This tests {@link AvroKeyValueSinkWriter}
     * with non-rolling output and with compression.
     */
    @Test
    public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
        final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";

        final int numElements = 20;

        Map<String, String> properties = new HashMap<>();
        Schema keySchema = Schema.create(Schema.Type.INT);
        Schema valueSchema = Schema.create(Schema.Type.STRING);
        properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
        properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
        properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
        properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

        BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
                .setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
                .setBucketer(new BasePathBucketer<Tuple2<Integer, String>>()).setPartPrefix(PART_PREFIX)
                .setPendingPrefix("").setPendingSuffix("");

        OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness = createTestSink(sink, 1, 0);

        testHarness.setProcessingTime(0L);

        testHarness.setup();
        testHarness.open();

        for (int i = 0; i < numElements; i++) {
            testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + i)));
        }

        testHarness.close();

        GenericData.setStringType(valueSchema, GenericData.StringType.String);
        Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

        FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

        SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
        for (int i = 0; i < numElements; i++) {
            AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValueSinkWriter.AvroKeyValue<>(
                    dataFileStream.next());
            int key = wrappedEntry.getKey();
            Assert.assertEquals(i, key);
            String value = wrappedEntry.getValue();
            Assert.assertEquals("message #" + i, value);
        }

        dataFileStream.close();
        inStream.close();
    }

    /**
     * This uses {@link DateTimeBucketer} to
     * produce rolling files. We use {@link OneInputStreamOperatorTestHarness} to manually
     * advance processing time.
     */
    @Test
    public void testDateTimeRollingStringWriter() throws Exception {
        final int numElements = 20;

        final String outPath = hdfsURI + "/rolling-out";

        BucketingSink<String> sink = new BucketingSink<String>(outPath)
                .setBucketer(new DateTimeBucketer<String>("ss")).setPartPrefix(PART_PREFIX).setPendingPrefix("")
                .setPendingSuffix("");

        OneInputStreamOperatorTestHarness<String, Object> testHarness = createTestSink(sink, 1, 0);

        testHarness.setProcessingTime(0L);

        testHarness.setup();
        testHarness.open();

        for (int i = 0; i < numElements; i++) {
            // Every 5 elements, increase the clock time. We should end up with 5 elements per bucket.
            if (i % 5 == 0) {
                testHarness.setProcessingTime(i * 1000L);
            }
            testHarness.processElement(new StreamRecord<>("message #" + i));
        }

        testHarness.close();

        RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

        // We should have 4 rolling files across 4 time intervals
        int numFiles = 0;
        while (files.hasNext()) {
            LocatedFileStatus file = files.next();
            numFiles++;
            if (file.getPath().toString().contains("rolling-out/00")) {
                FSDataInputStream inStream = dfs.open(file.getPath());

                BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

                for (int i = 0; i < 5; i++) {
                    String line = br.readLine();
                    Assert.assertEquals("message #" + i, line);
                }

                inStream.close();
            } else if (file.getPath().toString().contains("rolling-out/05")) {
                FSDataInputStream inStream = dfs.open(file.getPath());

                BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

                for (int i = 5; i < 10; i++) {
                    String line = br.readLine();
                    Assert.assertEquals("message #" + i, line);
                }

                inStream.close();
            } else if (file.getPath().toString().contains("rolling-out/10")) {
                FSDataInputStream inStream = dfs.open(file.getPath());

                BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

                for (int i = 10; i < 15; i++) {
                    String line = br.readLine();
                    Assert.assertEquals("message #" + i, line);
                }

                inStream.close();
            } else if (file.getPath().toString().contains("rolling-out/15")) {
                FSDataInputStream inStream = dfs.open(file.getPath());

                BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

                for (int i = 15; i < 20; i++) {
                    String line = br.readLine();
                    Assert.assertEquals("message #" + i, line);
                }

                inStream.close();
            } else {
                Assert.fail("File " + file + " does not match any expected roll pattern.");
            }
        }

        Assert.assertEquals(4, numFiles);
    }

    /**
     * This uses a custom bucketing function which determines the bucket from the input.
     */
    @Test
    public void testCustomBucketing() throws Exception {
        File dataDir = tempFolder.newFolder();

        final int numIds = 4;
        final int numElements = 20;

        OneInputStreamOperatorTestHarness<String, Object> testHarness = createTestSink(dataDir, 1, 0);

        testHarness.setProcessingTime(0L);

        testHarness.setup();
        testHarness.open();

        for (int i = 0; i < numElements; i++) {
            testHarness.processElement(new StreamRecord<>(Integer.toString(i % numIds)));
        }

        testHarness.close();

        // we should have 4 buckets, with 1 file each
        int numFiles = 0;
        for (File file : FileUtils.listFiles(dataDir, null, true)) {
            if (file.getName().startsWith(PART_PREFIX)) {
                numFiles++;
            }
        }

        Assert.assertEquals(4, numFiles);
    }

    /**
     * This uses a custom bucketing function which determines the bucket from the input.
     * We use a simulated clock to reduce the number of buckets being written to over time.
     * This causes buckets to become inactive, at which point the sink closes their part files.
     */
    @Test
    public void testCustomBucketingInactiveBucketCleanup() throws Exception {
        File dataDir = tempFolder.newFolder();

        final int step1NumIds = 4;
        final int step2NumIds = 2;
        final int numElementsPerStep = 20;

        OneInputStreamOperatorTestHarness<String, Object> testHarness = createTestSink(dataDir, 1, 0);

        testHarness.setProcessingTime(0L);

        testHarness.setup();
        testHarness.open();

        for (int i = 0; i < numElementsPerStep; i++) {
            testHarness.processElement(new StreamRecord<>(Integer.toString(i % step1NumIds)));
        }

        testHarness.setProcessingTime(2 * 60 * 1000L);

        for (int i = 0; i < numElementsPerStep; i++) {
            testHarness.processElement(new StreamRecord<>(Integer.toString(i % step2NumIds)));
        }

        testHarness.setProcessingTime(6 * 60 * 1000L);

        for (int i = 0; i < numElementsPerStep; i++) {
            testHarness.processElement(new StreamRecord<>(Integer.toString(i % step2NumIds)));
        }

        // we should have 4 buckets, with 1 file each
        // 2 of these buckets should have been finalised due to becoming inactive
        int numFiles = 0;
        int numInProgress = 0;
        for (File file : FileUtils.listFiles(dataDir, null, true)) {
            if (file.getAbsolutePath().endsWith("crc")) {
                continue;
            }
            if (file.getPath().endsWith(IN_PROGRESS_SUFFIX)) {
                numInProgress++;
            }
            numFiles++;
        }

        testHarness.close();

        Assert.assertEquals(4, numFiles);
        Assert.assertEquals(2, numInProgress);
    }

    /**
     * This tests that a user-defined HDFS configuration is
     * picked up by the sink's writer.
     */
    @Test
    public void testUserDefinedConfiguration() throws Exception {
        final String outPath = hdfsURI + "/string-non-rolling-with-config";
        final int numElements = 20;

        Map<String, String> properties = new HashMap<>();
        Schema keySchema = Schema.create(Schema.Type.INT);
        Schema valueSchema = Schema.create(Schema.Type.STRING);
        properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
        properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
        properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
        properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

        Configuration conf = new Configuration();
        conf.set("io.file.buffer.size", "40960");

        BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
                .setFSConfig(conf)
                .setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size",
                        "40960"))
                .setBucketer(new BasePathBucketer<Tuple2<Integer, String>>()).setPartPrefix(PART_PREFIX)
                .setPendingPrefix("").setPendingSuffix("");

        OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness = createTestSink(sink, 1, 0);

        testHarness.setProcessingTime(0L);

        testHarness.setup();
        testHarness.open();

        for (int i = 0; i < numElements; i++) {
            testHarness.processElement(new StreamRecord<>(Tuple2.of(i, "message #" + i)));
        }

        testHarness.close();

        GenericData.setStringType(valueSchema, GenericData.StringType.String);
        Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

        FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

        SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
        DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
        for (int i = 0; i < numElements; i++) {
            AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry = new AvroKeyValueSinkWriter.AvroKeyValue<>(
                    dataFileStream.next());
            int key = wrappedEntry.getKey();
            Assert.assertEquals(i, key);
            String value = wrappedEntry.getValue();
            Assert.assertEquals("message #" + i, value);
        }

        dataFileStream.close();
        inStream.close();
    }

    private static class StreamWriterWithConfigCheck<K, V> extends AvroKeyValueSinkWriter<K, V> {
        private Map<String, String> properties;
        private String key;
        private String expect;

        public StreamWriterWithConfigCheck(Map<String, String> properties, String key, String expect) {
            super(properties);
            this.properties = properties;
            this.key = key;
            this.expect = expect;
        }

        @Override
        public void open(FileSystem fs, Path path) throws IOException {
            super.open(fs, path);
            Assert.assertEquals(expect, fs.getConf().get(key));
        }

        @Override
        public Writer<Tuple2<K, V>> duplicate() {
            return new StreamWriterWithConfigCheck<>(properties, key, expect);
        }
    }

}