org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultTolerance2ITCase.java Source code

Introduction

Here is the source code for org.apache.flink.streaming.connectors.fs.bucketing.BucketingSinkFaultTolerance2ITCase.java. It is an integration test for Flink's BucketingSink that verifies exactly-once behaviour when a job restarts before any checkpoint has completed.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.flink.streaming.connectors.fs.bucketing;

import com.google.common.collect.Sets;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.streaming.api.checkpoint.CheckpointedAsynchronously;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.test.checkpointing.StreamFaultToleranceTestBase;
import org.apache.flink.util.NetUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.rules.TemporaryFolder;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Random;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.junit.Assert.assertTrue;

/**
 * Tests for {@link BucketingSink}.
 *
 * <p>
 * This test only verifies the exactly-once behaviour of the sink; the rolling behaviour
 * is covered by a separate test.
 *
 * <p>
 * This differs from BucketingSinkFaultToleranceITCase in that the checkpoint interval is
 * extremely high. This provokes the case that the sink restarts without any checkpoint
 * having been performed, and thereby tests the initial cleanup of pending/in-progress files.
 */
public class BucketingSinkFaultTolerance2ITCase extends StreamFaultToleranceTestBase {

    final long NUM_STRINGS = 16_000;

    @ClassRule
    public static TemporaryFolder tempFolder = new TemporaryFolder();

    private static MiniDFSCluster hdfsCluster;
    private static org.apache.hadoop.fs.FileSystem dfs;

    private static String outPath;

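    // Spins up an in-process HDFS mini-cluster so the sink writes to a genuine
    // (if local) HDFS namespace rather than the local file system.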
    @BeforeClass
    public static void createHDFS() throws IOException {
        Configuration conf = new Configuration();

        File dataDir = tempFolder.newFolder();

        conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, dataDir.getAbsolutePath());
        MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(conf);
        hdfsCluster = builder.build();

        dfs = hdfsCluster.getFileSystem();

        outPath = "hdfs://"
                + NetUtils.hostAndPortToUrlString(hdfsCluster.getURI().getHost(), hdfsCluster.getNameNodePort())
                + "/string-non-rolling-out-no-checkpoint";
    }

    @AfterClass
    public static void destroyHDFS() {
        if (hdfsCluster != null) {
            hdfsCluster.shutdown();
        }
    }

    @Override
    public void testProgram(StreamExecutionEnvironment env) {
        assertTrue("Broken test setup", NUM_STRINGS % 40 == 0);

        final int PARALLELISM = 12;

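        // A checkpoint interval of Long.MAX_VALUE means no checkpoint completes before the
        // induced failure, so recovery must rely on the sink's initial cleanup of
        // pending/in-progress files (the point of this test, see the class Javadoc).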
        env.enableCheckpointing(Long.MAX_VALUE);
        env.setParallelism(PARALLELISM);
        env.disableOperatorChaining();

        DataStream<String> stream = env.addSource(new StringGeneratingSourceFunction(NUM_STRINGS)).startNewChain();

        DataStream<String> mapped = stream.map(new OnceFailingIdentityMapper(NUM_STRINGS));

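        // The small batch size makes the sink roll over to new part files; empty pending
        // and valid-length prefixes mean companion files differ from their part files only
        // by suffix, which postSubmit() relies on.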
        BucketingSink<String> sink = new BucketingSink<String>(outPath)
                .setBucketer(new BasePathBucketer<String>())
                .setBatchSize(5000)
                .setValidLengthPrefix("")
                .setPendingPrefix("");

        mapped.addSink(sink);
    }

    @Override
    public void postSubmit() throws Exception {
        // We read the files and verify that we have read all the strings. If a valid-length
        // file exists we only read the file to that point. (This test should work with
        // FileSystems that support truncate() and with others as well.)

        Pattern messageRegex = Pattern.compile("message (\\d*)");

        // Keep a set of the message IDs that we read. Its size must equal both the read
        // count and NUM_STRINGS; if numRead is bigger than the size of the set we have
        // seen some elements twice.
        Set<Integer> readNumbers = Sets.newHashSet();
        int numRead = 0;

        RemoteIterator<LocatedFileStatus> files = dfs.listFiles(new Path(outPath), true);

        while (files.hasNext()) {
            LocatedFileStatus file = files.next();

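            // Skip the .valid-length metadata files here; they are consulted below as
            // companions of their part files.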
            if (!file.getPath().toString().endsWith(".valid-length")) {
                int validLength = (int) file.getLen();
                if (dfs.exists(file.getPath().suffix(".valid-length"))) {
                    // The companion file stores the number of valid bytes as a UTF string.
                    try (FSDataInputStream lengthStream = dfs.open(file.getPath().suffix(".valid-length"))) {
                        validLength = Integer.parseInt(lengthStream.readUTF());
                        System.out.println("VALID LENGTH: " + validLength);
                    }
                }
                FSDataInputStream inStream = dfs.open(file.getPath());
                byte[] buffer = new byte[validLength];
                inStream.readFully(0, buffer, 0, validLength);
                inStream.close();

                ByteArrayInputStream bais = new ByteArrayInputStream(buffer);

                InputStreamReader inStreamReader = new InputStreamReader(bais);
                BufferedReader br = new BufferedReader(inStreamReader);

                String line = br.readLine();
                while (line != null) {
                    Matcher matcher = messageRegex.matcher(line);
                    if (matcher.matches()) {
                        numRead++;
                        int messageId = Integer.parseInt(matcher.group(1));
                        readNumbers.add(messageId);
                    } else {
                        Assert.fail("Read line does not match expected pattern.");
                    }
                    line = br.readLine();
                }
                br.close();
                inStreamReader.close();
                bais.close();
            }
        }

        // Verify that we read all strings (at-least-once)
        Assert.assertEquals(NUM_STRINGS, readNumbers.size());

        // Verify that we don't have duplicates (boom!, exactly-once)
        Assert.assertEquals(NUM_STRINGS, numRead);
    }

    private static class OnceFailingIdentityMapper extends RichMapFunction<String, String> {
        private static final long serialVersionUID = 1L;

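        // Static so that the flag is shared by all parallel subtask instances in this
        // JVM: exactly one failure is injected across the whole job.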
        private static volatile boolean hasFailed = false;

        private final long numElements;

        private long failurePos;
        private long count;

        OnceFailingIdentityMapper(long numElements) {
            this.numElements = numElements;
        }

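        // Picks a failure point between roughly 40% and 70% of this subtask's share of
        // the input, so the failure happens mid-stream.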
        @Override
        public void open(org.apache.flink.configuration.Configuration parameters) throws IOException {
            long failurePosMin = (long) (0.4 * numElements / getRuntimeContext().getNumberOfParallelSubtasks());
            long failurePosMax = (long) (0.7 * numElements / getRuntimeContext().getNumberOfParallelSubtasks());

            // Math.floorMod keeps the random offset non-negative, so failurePos always
            // lies in [failurePosMin, failurePosMax).
            failurePos = Math.floorMod(new Random().nextLong(), failurePosMax - failurePosMin) + failurePosMin;
            count = 0;
        }

        @Override
        public String map(String value) throws Exception {
            count++;
            if (!hasFailed && count >= failurePos) {
                hasFailed = true;
                throw new Exception("Test Failure");
            }

            return value;
        }
    }

    private static class StringGeneratingSourceFunction extends RichParallelSourceFunction<String>
            implements CheckpointedAsynchronously<Integer> {

        private static final long serialVersionUID = 1L;

        private final long numElements;

        private int index;

        private volatile boolean isRunning = true;

        StringGeneratingSourceFunction(long numElements) {
            this.numElements = numElements;
        }

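        // Each subtask emits "message i" for all i congruent to its subtask index modulo
        // the parallelism. Emission and index update happen under the checkpoint lock so
        // that snapshotted state is always consistent with the emitted records.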
        @Override
        public void run(SourceContext<String> ctx) throws Exception {
            final Object lockingObject = ctx.getCheckpointLock();

            final int step = getRuntimeContext().getNumberOfParallelSubtasks();

            if (index == 0) {
                index = getRuntimeContext().getIndexOfThisSubtask();
            }

            while (isRunning && index < numElements) {

                Thread.sleep(1);
                synchronized (lockingObject) {
                    ctx.collect("message " + index);
                    index += step;
                }
            }
        }

        @Override
        public void cancel() {
            isRunning = false;
        }

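        // Note: unused in this test; the source emits deterministic "message <index>"
        // strings so that postSubmit() can account for every element.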
        private static String randomString(StringBuilder bld, Random rnd) {
            final int len = rnd.nextInt(10) + 5;

            for (int i = 0; i < len; i++) {
                char next = (char) (rnd.nextInt(20000) + 33);
                bld.append(next);
            }

            return bld.toString();
        }

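        // The checkpointed state is simply the next index to emit; on restore the source
        // continues from there.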
        @Override
        public Integer snapshotState(long checkpointId, long checkpointTimestamp) {
            return index;
        }

        @Override
        public void restoreState(Integer state) {
            index = state;
        }
    }
}
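
The heart of the verification in postSubmit() is reading each part file only up to the offset recorded in its companion .valid-length file. The following standalone sketch extracts that logic so it can be reused against any Hadoop FileSystem. It is a minimal illustration, not part of the Flink API: the class and method names (ValidLengthReader, readValidPortion) are hypothetical.

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public final class ValidLengthReader {

    /**
     * Reads a part file, truncated to the length stored in its "<file>.valid-length"
     * companion if one exists, mirroring the logic of postSubmit() above.
     */
    public static byte[] readValidPortion(FileSystem fs, Path partFile) throws IOException {
        // Default to the full file length when no valid-length companion exists.
        int validLength = (int) fs.getFileStatus(partFile).getLen();

        Path validLengthFile = partFile.suffix(".valid-length");
        if (fs.exists(validLengthFile)) {
            try (FSDataInputStream in = fs.open(validLengthFile)) {
                // The test reads the recorded length back with readUTF(), so we do the same.
                validLength = Integer.parseInt(in.readUTF());
            }
        }

        byte[] buffer = new byte[validLength];
        try (FSDataInputStream in = fs.open(partFile)) {
            in.readFully(0, buffer, 0, validLength);
        }
        return buffer;
    }

    private ValidLengthReader() {
    }
}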