Java tutorial: integration-testing the StreamSets cluster HDFS origin
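
The code below is ClusterHDFSSourceIT, an integration test from StreamSets Data Collector for ClusterHdfsSource, the cluster-mode HDFS origin. It spins up an in-process three-datanode MiniDFSCluster, then walks through configuration validation (missing *-site.xml files, bad URIs, bad directory locations) and record production for text, delimited (CSV), and Avro data. A distilled sketch of the common test pattern follows the full listing.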
/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.stage.origin.hdfs.cluster;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ImmutableList;
import com.streamsets.pipeline.api.ExecutionMode;
import com.streamsets.pipeline.api.Field;
import com.streamsets.pipeline.api.OnRecordError;
import com.streamsets.pipeline.api.Record;
import com.streamsets.pipeline.api.Stage.ConfigIssue;
import com.streamsets.pipeline.api.StageException;
import com.streamsets.pipeline.config.CsvHeader;
import com.streamsets.pipeline.config.CsvMode;
import com.streamsets.pipeline.config.CsvRecordType;
import com.streamsets.pipeline.config.DataFormat;
import com.streamsets.pipeline.impl.Pair;
import com.streamsets.pipeline.sdk.ContextInfoCreator;
import com.streamsets.pipeline.sdk.SourceRunner;
import com.streamsets.pipeline.sdk.StageRunner;

/**
 * Integration tests for {@link ClusterHdfsSource}, run against an in-process MiniDFSCluster.
 */
public class ClusterHDFSSourceIT {
  private static final Logger LOG = LoggerFactory.getLogger(ClusterHdfsSource.class);

  private static MiniDFSCluster miniDFS;
  private static Path dir;
  private static File dummyEtc;
  private static String resourcesDir;
  private static String hadoopConfDir;
  private static File minidfsDir;

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    minidfsDir = new File("target/minidfs-" + UUID.randomUUID()).getAbsoluteFile();
    minidfsDir.mkdirs();
    Assert.assertTrue(minidfsDir.exists());
    System.setProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA, minidfsDir.getPath());
    Configuration conf = new HdfsConfiguration();
conf.set("dfs.namenode.fs-limits.min-block-size", String.valueOf(32)); EditLogFileOutputStream.setShouldSkipFsyncForTesting(true); miniDFS = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); dir = new Path(miniDFS.getURI() + "/dir"); FileSystem fs = miniDFS.getFileSystem(); fs.mkdirs(dir); writeFile(fs, new Path(dir + "/forAllTests/" + "path"), 1000); dummyEtc = new File(minidfsDir, "dummy-etc"); dummyEtc.mkdirs(); Assert.assertTrue(dummyEtc.exists()); Configuration dummyConf = new Configuration(false); for (String file : new String[] { "core", "hdfs", "mapred", "yarn" }) { File siteXml = new File(dummyEtc, file + "-site.xml"); FileOutputStream out = new FileOutputStream(siteXml); dummyConf.writeXml(out); out.close(); } resourcesDir = minidfsDir.getAbsolutePath(); hadoopConfDir = dummyEtc.getName(); System.setProperty("sdc.resources.dir", resourcesDir); ; } @AfterClass public static void cleanUpClass() throws IOException { System.clearProperty("sdc.resources.dir"); if (miniDFS != null) { miniDFS.shutdown(); miniDFS = null; } } private ClusterHdfsSource createSource(ClusterHdfsConfigBean conf) { return new ClusterHdfsSource(conf); } @Test public void testConfigsAbsent() throws Exception { File dummyEtcConfigsAbsent = new File(minidfsDir, "dummyEtcConfigsAbsent"); dummyEtcConfigsAbsent.mkdirs(); try { // Write only config file writeConfig(dummyEtcConfigsAbsent, "core"); ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean(); conf.hdfsUri = miniDFS.getURI().toString(); conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath()); conf.hdfsConfigs = new HashMap<>(); conf.hdfsConfDir = dummyEtcConfigsAbsent.getName(); conf.dataFormat = DataFormat.TEXT; conf.dataFormatConfig.textMaxLineLen = 1024; SourceRunner sourceRunner = new SourceRunner.Builder(ClusterHdfsDSource.class, createSource(conf)) .addOutputLane("lane").setExecutionMode(ExecutionMode.CLUSTER_BATCH) .setResourcesDir(resourcesDir).build(); verifyForTestConfigsAbsent(sourceRunner, 3); // Write second config file writeConfig(dummyEtcConfigsAbsent, "mapred"); verifyForTestConfigsAbsent(sourceRunner, 2); // Write third config file writeConfig(dummyEtcConfigsAbsent, "hdfs"); verifyForTestConfigsAbsent(sourceRunner, 1); // Write the 4th; now all config files are present so init shouldn't throw exception writeConfig(dummyEtcConfigsAbsent, "yarn"); sourceRunner.runInit(); sourceRunner.runDestroy(); } finally { FileUtils.deleteQuietly(dummyEtcConfigsAbsent); } } private void verifyForTestConfigsAbsent(SourceRunner sourceRunner, int issueCount) throws StageException { List<ConfigIssue> issues = sourceRunner.runValidateConfigs(); assertEquals(String.valueOf(issues), issueCount, issues.size()); assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_30")); } private void writeConfig(File configDir, String configFileNamePrefix) throws IOException { Configuration dummyConf = new Configuration(false); File siteXml = new File(configDir, configFileNamePrefix + "-site.xml"); FileOutputStream out = new FileOutputStream(siteXml); dummyConf.writeXml(out); out.close(); } @Test public void testWrongHDFSDirLocation() throws Exception { ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean(); conf.hdfsUri = miniDFS.getURI().toString(); conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath()); conf.hdfsConfigs = new HashMap<>(); conf.hdfsConfigs.put("x", "X"); conf.dataFormat = DataFormat.TEXT; conf.dataFormatConfig.textMaxLineLen = 1024; conf.hdfsUri = "/pathwithnoschemeorauthority"; ClusterHdfsSource clusterHdfsSource = 
        createSource(conf);
    try {
      List<ConfigIssue> issues = clusterHdfsSource.init(null, ContextInfoCreator
          .createSourceContext("myInstance", false, OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 1, issues.size());
      assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_02"));

      // Non-HDFS scheme
      conf.hdfsUri = "file://localhost:8020/";
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 1, issues.size());
      assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_12"));

      // Missing authority
      conf.hdfsUri = "hdfs:///noauthority";
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 1, issues.size());
      assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_13"));

      // Unreachable namenode
      conf.hdfsUri = "hdfs://localhost:50000";
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 1, issues.size());
      assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_11"));

      // Input directory that does not exist
      conf.hdfsUri = miniDFS.getURI().toString();
      conf.hdfsDirLocations = Arrays.asList("/pathdoesnotexist");
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 1, issues.size());
      assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_10"));

      // Valid directory containing a file: no issues
      conf.hdfsUri = miniDFS.getURI().toString();
      conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
      FileSystem fs = miniDFS.getFileSystem();
      Path someFile = new Path(new Path(dir.toUri()), "/someFile");
      fs.create(someFile).close();
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 0, issues.size());

      // URI taken from fs.defaultFS when hdfsUri is unset
      conf.hdfsUri = null;
      conf.hdfsConfigs.put(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, miniDFS.getURI().toString());
      someFile = new Path(new Path(dir.toUri()), "/someFile2");
      fs.create(someFile).close();
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 0, issues.size());

      // A file where a directory is expected
      Path dummyFile = new Path(new Path(dir.toUri()), "/dummyFile");
      fs.create(dummyFile).close();
      conf.hdfsUri = miniDFS.getURI().toString();
      conf.hdfsDirLocations = Arrays.asList(dummyFile.toUri().getPath());
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 1, issues.size());
      assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_15"));

      // An empty directory
      Path emptyDir = new Path(dir.toUri().getPath(), "emptyDir");
      fs.mkdirs(emptyDir);
      conf.hdfsUri = miniDFS.getURI().toString();
      conf.hdfsDirLocations = Arrays.asList(emptyDir.toUri().getPath());
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 1, issues.size());
      assertTrue(String.valueOf(issues), issues.get(0).toString().contains("HADOOPFS_16"));

      Path path1 = new Path(emptyDir, "path1");
      fs.create(path1).close();
      conf.hdfsUri = miniDFS.getURI().toString();
      conf.hdfsDirLocations = Arrays.asList(emptyDir.toUri().getPath());
      clusterHdfsSource = createSource(conf);
      issues = clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      assertEquals(String.valueOf(issues), 0, issues.size());
    } finally {
      clusterHdfsSource.destroy();
    }
  }

  @Test
  public void testGetHdfsConfiguration() throws Exception {
    ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean();
    conf.hdfsUri = miniDFS.getURI().toString();
    conf.hdfsDirLocations = Arrays.asList(dir.toString());
    conf.hdfsConfigs = new HashMap<>();
    conf.hdfsConfigs.put("x", "X");
    conf.dataFormat = DataFormat.TEXT;
    conf.dataFormatConfig.textMaxLineLen = 1024;
    ClusterHdfsSource clusterHdfsSource = createSource(conf);
    try {
      clusterHdfsSource.init(null, ContextInfoCreator.createSourceContext("myInstance", false,
          OnRecordError.TO_ERROR, ImmutableList.of("lane")));
      Assert.assertNotNull(clusterHdfsSource.getConfiguration());
      assertEquals("X", clusterHdfsSource.getConfiguration().get("x"));
    } finally {
      clusterHdfsSource.destroy();
    }
  }

  @Test(timeout = 30000)
  public void testProduce() throws Exception {
    ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean();
    conf.hdfsUri = miniDFS.getURI().toString();
    conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
    conf.hdfsConfigs = new HashMap<>();
    conf.hdfsKerberos = false;
    conf.hdfsConfDir = hadoopConfDir;
    conf.recursive = false;
    conf.produceSingleRecordPerMessage = false;
    conf.dataFormat = DataFormat.TEXT;
    conf.dataFormatConfig.textMaxLineLen = 1024;

    SourceRunner sourceRunner =
        new SourceRunner.Builder(ClusterHdfsDSource.class, createSource(conf))
            .addOutputLane("lane")
            .setExecutionMode(ExecutionMode.CLUSTER_BATCH)
            .setResourcesDir(resourcesDir)
            .build();
    sourceRunner.runInit();

    List<Map.Entry> list = new ArrayList<>();
    list.add(new Pair(new LongWritable(1), new Text("aaa")));
    list.add(new Pair(new LongWritable(2), new Text("bbb")));
    list.add(new Pair(new LongWritable(3), new Text("ccc")));
    Thread th = createThreadForAddingBatch(sourceRunner, list);
    try {
      StageRunner.Output output = sourceRunner.runProduce(null, 5);
      String newOffset = output.getNewOffset();
      Assert.assertEquals("3", newOffset);
      List<Record> records = output.getRecords().get("lane");
      Assert.assertEquals(3, records.size());
      for (int i = 0; i < records.size(); i++) {
        Assert.assertNotNull(records.get(i).get("/text"));
        LOG.info("Header " + records.get(i).getHeader().getSourceId());
        Assert.assertFalse(records.get(i).get("/text").getValueAsString().isEmpty());
        Assert.assertEquals(list.get(i).getValue().toString(),
            records.get(i).get("/text").getValueAsString());
      }
      if (sourceRunner != null) {
        sourceRunner.runDestroy();
      }
    } finally {
      th.interrupt();
    }
  }

  @Test(timeout = 30000)
  public void testProduceDelimitedNoHeader() throws Exception {
    ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean();
    conf.hdfsUri = miniDFS.getURI().toString();
    conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
    conf.hdfsConfigs = new HashMap<>();
    conf.hdfsKerberos = false;
    conf.hdfsConfDir = hadoopConfDir;
    conf.recursive = false;
    conf.produceSingleRecordPerMessage = false;
    conf.dataFormat = DataFormat.DELIMITED;
    conf.dataFormatConfig.csvFileFormat = CsvMode.CSV;
    conf.dataFormatConfig.csvHeader = CsvHeader.NO_HEADER;
    conf.dataFormatConfig.csvMaxObjectLen = 4096;
    conf.dataFormatConfig.csvRecordType = CsvRecordType.LIST;
    conf.dataFormatConfig.csvSkipStartLines = 0;

    SourceRunner sourceRunner =
        new SourceRunner.Builder(ClusterHdfsDSource.class, createSource(conf))
            .addOutputLane("lane")
            .setExecutionMode(ExecutionMode.CLUSTER_BATCH)
            .setResourcesDir(resourcesDir)
            .build();
    sourceRunner.runInit();

    // Two 2-line CSV chunks; with NO_HEADER every line becomes a record
    List<Map.Entry> list = new ArrayList<>();
    list.add(new Pair("1", "A,B\na,b"));
    list.add(new Pair("2", "C,D\nc,d"));
    Thread th = createThreadForAddingBatch(sourceRunner, list);
    try {
      StageRunner.Output output = sourceRunner.runProduce(null, 5);
      String newOffset = output.getNewOffset();
      Assert.assertEquals("2", newOffset);
      List<Record> records = output.getRecords().get("lane");
      Assert.assertEquals(4, records.size());

      Record record = records.get(0);
      Assert.assertEquals("A", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[0]/header"));
      Assert.assertEquals("B", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[1]/header"));

      record = records.get(1);
      Assert.assertEquals("a", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[0]/header"));
      Assert.assertEquals("b", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[1]/header"));

      record = records.get(2);
      Assert.assertEquals("C", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[0]/header"));
      Assert.assertEquals("D", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[1]/header"));

      record = records.get(3);
      Assert.assertEquals("c", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[0]/header"));
      Assert.assertEquals("d", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString());
      Assert.assertFalse(record.has("[1]/header"));

      if (sourceRunner != null) {
        sourceRunner.runDestroy();
      }
    } finally {
      th.interrupt();
    }
  }

  @Test(timeout = 30000)
  public void testProduceDelimitedIgnoreHeader() throws Exception {
    ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean();
    conf.hdfsUri = miniDFS.getURI().toString();
    conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
    conf.hdfsConfigs = new HashMap<>();
    conf.hdfsKerberos = false;
    conf.hdfsConfDir = hadoopConfDir;
    conf.recursive = false;
    conf.produceSingleRecordPerMessage = false;
    conf.dataFormat = DataFormat.DELIMITED;
    conf.dataFormatConfig.csvFileFormat = CsvMode.CSV;
    conf.dataFormatConfig.csvHeader = CsvHeader.IGNORE_HEADER;
    conf.dataFormatConfig.csvMaxObjectLen = 4096;
    conf.dataFormatConfig.csvRecordType = CsvRecordType.LIST;
    conf.dataFormatConfig.csvSkipStartLines = 0;

    SourceRunner sourceRunner =
        new SourceRunner.Builder(ClusterHdfsDSource.class, createSource(conf))
            .addOutputLane("lane")
            .setExecutionMode(ExecutionMode.CLUSTER_BATCH)
            .setResourcesDir(resourcesDir)
            .build();
    sourceRunner.runInit();

    List<Map.Entry> list = new ArrayList<>();
    list.add(new Pair("path::0::0", "A,B\na,b"));
list.add(new Pair("path::1::1", new String("C,D\nc,d"))); Thread th = createThreadForAddingBatch(sourceRunner, list); try { StageRunner.Output output = sourceRunner.runProduce(null, 5); String newOffset = output.getNewOffset(); Assert.assertEquals("path::1::1", newOffset); List<Record> records = output.getRecords().get("lane"); Assert.assertEquals(2, records.size()); Record record = records.get(0); Assert.assertEquals("C", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString()); Assert.assertFalse(record.has("[0]/header")); Assert.assertEquals("D", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString()); Assert.assertFalse(record.has("[1]/header")); record = records.get(1); Assert.assertEquals("c", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString()); Assert.assertFalse(record.has("[0]/header")); Assert.assertEquals("d", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString()); Assert.assertFalse(record.has("[1]/header")); if (sourceRunner != null) { sourceRunner.runDestroy(); } } finally { th.interrupt(); } } @Test(timeout = 30000) public void testProduceDelimitedWithHeader() throws Exception { ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean(); conf.hdfsUri = miniDFS.getURI().toString(); conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath()); conf.hdfsConfigs = new HashMap<>(); conf.hdfsKerberos = false; conf.hdfsConfDir = hadoopConfDir; conf.recursive = false; conf.produceSingleRecordPerMessage = false; conf.dataFormat = DataFormat.DELIMITED; conf.dataFormatConfig.csvFileFormat = CsvMode.CSV; conf.dataFormatConfig.csvHeader = CsvHeader.WITH_HEADER; conf.dataFormatConfig.csvMaxObjectLen = 4096; conf.dataFormatConfig.csvRecordType = CsvRecordType.LIST; conf.dataFormatConfig.csvSkipStartLines = 0; SourceRunner sourceRunner = new SourceRunner.Builder(ClusterHdfsDSource.class, createSource(conf)) .addOutputLane("lane").setExecutionMode(ExecutionMode.CLUSTER_BATCH).setResourcesDir(resourcesDir) .build(); sourceRunner.runInit(); List<Map.Entry> list = new ArrayList<>(); list.add(new Pair("HEADER_COL_1,HEADER_COL_2", null)); list.add(new Pair("path::" + "1", new String("a,b\nC,D\nc,d"))); Thread th = createThreadForAddingBatch(sourceRunner, list); try { StageRunner.Output output = sourceRunner.runProduce(null, 5); String newOffset = output.getNewOffset(); Assert.assertEquals("path::" + "1", newOffset); List<Record> records = output.getRecords().get("lane"); Assert.assertEquals(3, records.size()); Record record = records.get(0); Assert.assertEquals("a", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString()); Assert.assertEquals("HEADER_COL_1", record.get().getValueAsList().get(0).getValueAsMap().get("header").getValueAsString()); Assert.assertEquals("b", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString()); Assert.assertEquals("HEADER_COL_2", record.get().getValueAsList().get(1).getValueAsMap().get("header").getValueAsString()); record = records.get(1); Assert.assertEquals("C", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString()); Assert.assertEquals("HEADER_COL_1", record.get().getValueAsList().get(0).getValueAsMap().get("header").getValueAsString()); Assert.assertEquals("D", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString()); Assert.assertEquals("HEADER_COL_2", 
          record.get().getValueAsList().get(1).getValueAsMap().get("header").getValueAsString());

      record = records.get(2);
      Assert.assertEquals("c", record.get().getValueAsList().get(0).getValueAsMap().get("value").getValueAsString());
      Assert.assertEquals("HEADER_COL_1",
          record.get().getValueAsList().get(0).getValueAsMap().get("header").getValueAsString());
      Assert.assertEquals("d", record.get().getValueAsList().get(1).getValueAsMap().get("value").getValueAsString());
      Assert.assertEquals("HEADER_COL_2",
          record.get().getValueAsList().get(1).getValueAsMap().get("header").getValueAsString());

      if (sourceRunner != null) {
        sourceRunner.runDestroy();
      }
    } finally {
      th.interrupt();
    }
  }

  @Test(timeout = 30000)
  public void testProduceAvroData() throws Exception {
    ClusterHdfsConfigBean conf = new ClusterHdfsConfigBean();
    conf.hdfsUri = miniDFS.getURI().toString();
    conf.hdfsDirLocations = Arrays.asList(dir.toUri().getPath());
    conf.hdfsConfigs = new HashMap<>();
    conf.hdfsKerberos = false;
    conf.hdfsConfDir = hadoopConfDir;
    conf.recursive = false;
    conf.produceSingleRecordPerMessage = false;
    conf.dataFormat = DataFormat.AVRO;

    SourceRunner sourceRunner =
        new SourceRunner.Builder(ClusterHdfsDSource.class, createSource(conf))
            .addOutputLane("lane")
            .setExecutionMode(ExecutionMode.CLUSTER_BATCH)
            .setResourcesDir(resourcesDir)
            .build();
    sourceRunner.runInit();

    List<Map.Entry> list = new ArrayList<>();
    list.add(new Pair("path::" + "1" + "::1",
        createAvroData("a", 30, ImmutableList.of("a@company.com", "a2@company.com"))));
    list.add(new Pair("path::" + "1" + "::2",
        createAvroData("b", 40, ImmutableList.of("b@company.com", "b2@company.com"))));
    Thread th = createThreadForAddingBatch(sourceRunner, list);
    try {
      StageRunner.Output output = sourceRunner.runProduce(null, 5);
      String newOffset = output.getNewOffset();
      Assert.assertEquals("path::" + "1::2", newOffset);
      List<Record> records = output.getRecords().get("lane");
      Assert.assertEquals(2, records.size());

      Record record = records.get(0);
      Assert.assertTrue(record.has("/name"));
      Assert.assertEquals("a", record.get("/name").getValueAsString());
      Assert.assertTrue(record.has("/age"));
      Assert.assertEquals(30, record.get("/age").getValueAsInteger());
      Assert.assertTrue(record.has("/emails"));
      Assert.assertTrue(record.get("/emails").getValueAsList() instanceof List);
      List<Field> emails = record.get("/emails").getValueAsList();
      Assert.assertEquals(2, emails.size());
      Assert.assertEquals("a@company.com", emails.get(0).getValueAsString());
      Assert.assertEquals("a2@company.com", emails.get(1).getValueAsString());

      record = records.get(1);
      Assert.assertTrue(record.has("/name"));
      Assert.assertEquals("b", record.get("/name").getValueAsString());
      Assert.assertTrue(record.has("/age"));
      Assert.assertEquals(40, record.get("/age").getValueAsInteger());
      Assert.assertTrue(record.has("/emails"));
      Assert.assertTrue(record.get("/emails").getValueAsList() instanceof List);
      emails = record.get("/emails").getValueAsList();
      Assert.assertEquals(2, emails.size());
      Assert.assertEquals("b@company.com", emails.get(0).getValueAsString());
      Assert.assertEquals("b2@company.com", emails.get(1).getValueAsString());
    } finally {
      th.interrupt();
    }
  }

  private byte[] createAvroData(String name, int age, List<String> emails) throws IOException {
    String AVRO_SCHEMA = "{\n"
        + "\"type\": \"record\",\n"
        + "\"name\": \"Employee\",\n"
        + "\"fields\": [\n"
        + " {\"name\": \"name\", \"type\": \"string\"},\n"
        + " {\"name\": \"age\", \"type\": \"int\"},\n"
        + " {\"name\": \"emails\", \"type\": {\"type\": \"array\", \"items\": \"string\"}},\n"
+ " {\"name\": \"boss\", \"type\": [\"Employee\",\"null\"]}\n" + "]}"; Schema schema = new Schema.Parser().parse(AVRO_SCHEMA); ByteArrayOutputStream out = new ByteArrayOutputStream(); GenericRecord e1 = new GenericData.Record(schema); e1.put("name", name); e1.put("age", age); e1.put("emails", emails); e1.put("boss", null); DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema); DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter); dataFileWriter.create(schema, out); dataFileWriter.append(e1); dataFileWriter.close(); return out.toByteArray(); } private Thread createThreadForAddingBatch(final SourceRunner sourceRunner, final List<Map.Entry> list) { Thread sourceThread = new Thread() { @Override public void run() { try { ClusterHdfsSource source = (ClusterHdfsSource) sourceRunner.getStage(); source.put(list); } catch (Exception ex) { LOG.error("Error in waiter thread: " + ex, ex); } } }; sourceThread.setName(getClass().getName() + "-sourceThread"); sourceThread.setDaemon(true); sourceThread.start(); return sourceThread; } private static void writeFile(FileSystem fs, Path ph, int size) throws IOException { FSDataOutputStream stm = fs.create(ph, true, 4096, (short) 3, 512); for (int i = 0; i < 1; i++) { stm.write(new byte[size]); } stm.hsync(); stm.hsync(); stm.close(); } }