com.streamsets.datacollector.hdfs.cluster.KafkaToHDFSIT.java Source code

Introduction

Here is the source code for com.streamsets.datacollector.hdfs.cluster.KafkaToHDFSIT.java
Source

/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed under the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.datacollector.hdfs.cluster;

import com.google.common.io.Resources;
import com.streamsets.datacollector.MiniSDC;
import com.streamsets.datacollector.util.ClusterUtil;
import com.streamsets.datacollector.util.TestUtil;
import com.streamsets.datacollector.util.VerifyUtils;
import com.streamsets.pipeline.kafka.common.KafkaTestUtil;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;

/**
 * Integration test that runs the {@code cluster_kafka_hdfs} pipeline in cluster mode, reading from a Kafka topic
 * and writing to HDFS (backed by a {@link MiniDFSCluster}).
 *
 * The pipeline and data provider for this test case should make sure that the pipeline runs continuously and the
 * origin in the pipeline keeps producing records until stopped.
 *
 * Origin has to be Kafka as of now. Make sure that there is continuous supply of data for the pipeline to keep
 * running. For example to test kafka Origin, a background thread could keep writing to the kafka topic from which
 * the kafka origin reads.
 */
public class KafkaToHDFSIT {

    private static final Logger LOG = LoggerFactory.getLogger(KafkaToHDFSIT.class);

    //Kafka messages contain text "Hello Kafka<i>" i in the range [0-29]
    private static final String MESSAGE_PREFIX = "Hello Kafka";
    private static final int RECORDS_PRODUCED = 30;
    //Based on the expression parser and stream selector target ends up with messages which have even first digit of i.
    //i.e., Hello Kafka<0,2,4,6,8,20,21,22,23,24,25,26,27,28,29>
    private static final int RECORDS_REACHING_TARGET = 15;

    //Name and revision of the pipeline whose counters are polled by the test.
    private static final String PIPELINE_NAME = "cluster_kafka_hdfs";
    private static final String PIPELINE_REV = "0";
    //Pause between two consecutive counter polls.
    private static final long COUNTER_POLL_INTERVAL_MS = 500;

    protected static URI serverURI;
    protected static MiniSDC miniSDC;
    private static final String TOPIC = "KafkaToHDFSOnCluster";
    private static Producer<String, String> producer;

    private static MiniDFSCluster miniDFS;

    private static final String TEST_NAME = "KafkaToHDFSOnCluster";

    /**
     * Starts Zookeeper and a single Kafka broker, publishes the test records, starts the mini DFS cluster and
     * finally sets up the SDC cluster with the pipeline under test.
     *
     * @throws Exception if any of the test services fails to start
     */
    @BeforeClass
    public static void beforeClass() throws Exception {
        //setup kafka to read from
        KafkaTestUtil.startZookeeper();
        KafkaTestUtil.startKafkaBrokers(1);
        KafkaTestUtil.createTopic(TOPIC, 1, 1);
        producer = KafkaTestUtil.createProducer(KafkaTestUtil.getMetadataBrokerURI(), true);
        produceRecords(RECORDS_PRODUCED);

        // setting some dummy kerberos settings to be able to test a mis-setting
        System.setProperty("java.security.krb5.realm", "foo");
        System.setProperty("java.security.krb5.kdc", "localhost:0");

        File minidfsDir = new File("target/minidfs").getAbsoluteFile();
        if (!minidfsDir.exists()) {
            Assert.assertTrue(minidfsDir.mkdirs());
        }
        System.setProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA, minidfsDir.getPath());

        // The same proxy-user entries are applied to both the HDFS and the YARN configuration below.
        String proxyUserPrefix = "hadoop.proxyuser." + System.getProperty("user.name");
        Configuration conf = new HdfsConfiguration();
        conf.set(proxyUserPrefix + ".hosts", "*");
        conf.set(proxyUserPrefix + ".groups", "*");
        UserGroupInformation.createUserForTesting("foo", new String[] { "all", "supergroup" });
        EditLogFileOutputStream.setShouldSkipFsyncForTesting(true);
        miniDFS = new MiniDFSCluster.Builder(conf).build();

        //setup Cluster and start pipeline
        YarnConfiguration entries = new YarnConfiguration();
        //TODO: Investigate why this is required for test to pass. Is yarn messing with the miniDFS cluster configuration?
        entries.set(proxyUserPrefix + ".hosts", "*");
        entries.set(proxyUserPrefix + ".groups", "*");
        // getPipelineJson() reads miniDFS.getURI(), so it must be called after the mini DFS cluster is built.
        ClusterUtil.setupCluster(TEST_NAME, getPipelineJson(), entries);
        serverURI = ClusterUtil.getServerURI();
        miniSDC = ClusterUtil.getMiniSDC();
    }

    /**
     * Shuts down Kafka, the mini DFS cluster and the SDC cluster. Each step runs even if a previous one throws,
     * so a failure while stopping one service does not leave the remaining ones running.
     *
     * @throws Exception if any shutdown step fails
     */
    @AfterClass
    public static void afterClass() throws Exception {
        try {
            KafkaTestUtil.shutdown();
        } finally {
            try {
                if (miniDFS != null) {
                    miniDFS.shutdown();
                    miniDFS = null;
                }
            } finally {
                ClusterUtil.tearDownCluster(TEST_NAME);
            }
        }
    }

    /**
     * Loads {@code cluster_kafka_hdfs.json} from the classpath and substitutes the placeholder topic name, Kafka
     * broker URI, Zookeeper connect string and HDFS URI with the values of the locally started test services.
     *
     * @return the pipeline definition as a JSON string
     * @throws Exception if the resource cannot be located or read
     */
    private static String getPipelineJson() throws Exception {
        URI uri = Resources.getResource("cluster_kafka_hdfs.json").toURI();
        String pipelineJson = new String(Files.readAllBytes(Paths.get(uri)), StandardCharsets.UTF_8);
        // Literal replace (not replaceAll): neither the placeholders nor the substituted values are regular expressions.
        return pipelineJson
            .replace("topicName", TOPIC)
            .replace("localhost:9092", KafkaTestUtil.getMetadataBrokerURI())
            .replace("localhost:2181", KafkaTestUtil.getZkConnect())
            .replace("/uri", miniDFS.getURI().toString());
    }

    /**
     * Sends {@code records} messages "Hello Kafka0" .. "Hello Kafka<records-1>" to the test topic, all with key "0".
     *
     * @param records number of messages to send
     */
    private static void produceRecords(int records) {
        for (int i = 0; i < records; i++) {
            producer.send(new KeyedMessage<>(TOPIC, "0", MESSAGE_PREFIX + i));
        }
    }

    /**
     * Fetches the current pipeline counters from the given slave SDCs and asserts that a result was returned.
     *
     * @param slaveUris URIs of the slave SDC instances
     * @return the non-null counters map
     * @throws Exception if the counters cannot be fetched
     */
    private static Map<String, Map<String, Object>> fetchCounters(List<URI> slaveUris) throws Exception {
        Map<String, Map<String, Object>> counters = VerifyUtils.getCounters(slaveUris, PIPELINE_NAME, PIPELINE_REV);
        Assert.assertNotNull(counters);
        return counters;
    }

    /**
     * Waits until the source has emitted all produced records and the target has received the expected subset,
     * then reads the files written to HDFS and verifies their content and total record count.
     *
     * The polling loops have no exit condition of their own; they are bounded by the test timeout.
     *
     * @throws Exception on any failure while polling counters or reading HDFS
     */
    @Test(timeout = 120000)
    public void testKafkaToHDFSOnCluster() throws Exception {
        List<URI> list = miniSDC.getListOfSlaveSDCURI();
        Assert.assertTrue(list != null && !list.isEmpty());

        Map<String, Map<String, Object>> countersMap = fetchCounters(list);
        while (VerifyUtils.getSourceOutputRecords(countersMap) != RECORDS_PRODUCED) {
            LOG.debug("Source output records are not equal to {} retrying again", RECORDS_PRODUCED);
            Thread.sleep(COUNTER_POLL_INTERVAL_MS);
            countersMap = fetchCounters(list);
        }
        while (VerifyUtils.getTargetInputRecords(countersMap) != RECORDS_REACHING_TARGET) {
            LOG.debug("Target Input records are not equal to {} retrying again", RECORDS_REACHING_TARGET);
            Thread.sleep(COUNTER_POLL_INTERVAL_MS);
            countersMap = fetchCounters(list);
        }

        //HDFS configuration is set to roll file after 15 records.
        int recordsRead = 0;
        // The file system belongs to the mini DFS cluster and is intentionally not closed here.
        DistributedFileSystem fileSystem = miniDFS.getFileSystem();
        FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/tmp/out/" + TestUtil.getCurrentYear()));
        // Index of the first digit of <i>, assuming the line starts with the message prefix.
        int firstDigitIndex = MESSAGE_PREFIX.length();
        for (FileStatus f : fileStatuses) {
            // try-with-resources closes the reader and the underlying HDFS input stream for every file.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(fileSystem.open(f.getPath()), StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    Assert.assertTrue(line.contains(MESSAGE_PREFIX));
                    // Only records whose first digit of <i> is even are expected to reach the target.
                    int j = Integer.parseInt(line.substring(firstDigitIndex, firstDigitIndex + 1));
                    Assert.assertTrue(j % 2 == 0);
                    recordsRead++;
                }
            }
        }

        Assert.assertEquals(RECORDS_REACHING_TARGET, recordsRead);
    }

}