org.apache.flume.sink.hdfs.TestHDFSEventSinkOnMiniCluster.java Source code

Introduction

Here is the source code for org.apache.flume.sink.hdfs.TestHDFSEventSinkOnMiniCluster.java, a set of unit tests that exercise Flume's HDFSEventSink against a real HDFS instance started with MiniDFSCluster.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flume.sink.hdfs;

import com.google.common.base.Charsets;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.flume.Context;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.channel.MemoryChannel;
import org.apache.flume.event.EventBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Unit tests that exercise HDFSEventSink on an actual instance of HDFS.
 * TODO: figure out how to unit-test Kerberos-secured HDFS.
 */
public class TestHDFSEventSinkOnMiniCluster {

    private static final Logger logger = LoggerFactory.getLogger(TestHDFSEventSinkOnMiniCluster.class);

    private static final boolean KEEP_DATA = false;
    private static final String DFS_DIR = "target/test/dfs";
    private static final String TEST_BUILD_DATA_KEY = "test.build.data";

    private static MiniDFSCluster cluster = null;
    private static String oldTestBuildDataProp = null;

    @BeforeClass
    public static void setupClass() throws IOException {
        // set up data dir for HDFS
        File dfsDir = new File(DFS_DIR);
        if (!dfsDir.isDirectory()) {
            dfsDir.mkdirs();
        }
        // save off system prop to restore later
        oldTestBuildDataProp = System.getProperty(TEST_BUILD_DATA_KEY);
        System.setProperty(TEST_BUILD_DATA_KEY, DFS_DIR);
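        // MiniDFSCluster consults the "test.build.data" system property when
        // deciding where to put NameNode and DataNode storage, so the override
        // above keeps all mini-cluster state under target/test/dfs.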
    }

    private static String getNameNodeURL(MiniDFSCluster cluster) {
        int nnPort = cluster.getNameNode().getNameNodeAddress().getPort();
        return "hdfs://localhost:" + nnPort;
    }

    /**
     * This is a very basic test that writes one event to HDFS and reads it back.
     */
    @Test
    public void simpleHDFSTest() throws EventDeliveryException, IOException {
        cluster = new MiniDFSCluster(new Configuration(), 1, true, null);
        cluster.waitActive();

        String outputDir = "/flume/simpleHDFSTest";
        Path outputDirPath = new Path(outputDir);

        logger.info("Running test with output dir: {}", outputDir);

        FileSystem fs = cluster.getFileSystem();
        // ensure output directory is empty
        if (fs.exists(outputDirPath)) {
            fs.delete(outputDirPath, true);
        }

        String nnURL = getNameNodeURL(cluster);
        logger.info("Namenode address: {}", nnURL);

        Context chanCtx = new Context();
        MemoryChannel channel = new MemoryChannel();
        channel.setName("simpleHDFSTest-mem-chan");
        channel.configure(chanCtx);
        channel.start();

        Context sinkCtx = new Context();
        sinkCtx.put("hdfs.path", nnURL + outputDir);
        sinkCtx.put("hdfs.fileType", HDFSWriterFactory.DataStreamType);
        sinkCtx.put("hdfs.batchSize", Integer.toString(1));

        HDFSEventSink sink = new HDFSEventSink();
        sink.setName("simpleHDFSTest-hdfs-sink");
        sink.configure(sinkCtx);
        sink.setChannel(channel);
        sink.start();

        // create an event
        String EVENT_BODY = "yarg!";
        channel.getTransaction().begin();
        try {
            channel.put(EventBuilder.withBody(EVENT_BODY, Charsets.UTF_8));
            channel.getTransaction().commit();
        } finally {
            channel.getTransaction().close();
        }

        // store event to HDFS
        sink.process();

        // shut down flume
        sink.stop();
        channel.stop();

        // verify that it's in HDFS and that its content is what we say it should be
        FileStatus[] statuses = fs.listStatus(outputDirPath);
        Assert.assertNotNull("No files found written to HDFS", statuses);
        Assert.assertEquals("Only one file expected", 1, statuses.length);

        for (FileStatus status : statuses) {
            Path filePath = status.getPath();
            logger.info("Found file on DFS: {}", filePath);
            FSDataInputStream stream = fs.open(filePath);
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
            String line = reader.readLine();
            logger.info("First line in file {}: {}", filePath, line);
            Assert.assertEquals(EVENT_BODY, line);
        }

        if (!KEEP_DATA) {
            fs.delete(outputDirPath, true);
        }

        cluster.shutdown();
        cluster = null;
    }

    /**
 * Writes two events in GZIP-compressed format and reads the first one back.
     */
    @Test
    public void simpleHDFSGZipCompressedTest() throws EventDeliveryException, IOException {
        cluster = new MiniDFSCluster(new Configuration(), 1, true, null);
        cluster.waitActive();

        String outputDir = "/flume/simpleHDFSGZipCompressedTest";
        Path outputDirPath = new Path(outputDir);

        logger.info("Running test with output dir: {}", outputDir);

        FileSystem fs = cluster.getFileSystem();
        // ensure output directory is empty
        if (fs.exists(outputDirPath)) {
            fs.delete(outputDirPath, true);
        }

        String nnURL = getNameNodeURL(cluster);
        logger.info("Namenode address: {}", nnURL);

        Context chanCtx = new Context();
        MemoryChannel channel = new MemoryChannel();
        channel.setName("simpleHDFSTest-mem-chan");
        channel.configure(chanCtx);
        channel.start();

        Context sinkCtx = new Context();
        sinkCtx.put("hdfs.path", nnURL + outputDir);
        sinkCtx.put("hdfs.fileType", HDFSWriterFactory.CompStreamType);
        sinkCtx.put("hdfs.batchSize", Integer.toString(1));
        sinkCtx.put("hdfs.codeC", "gzip");

        HDFSEventSink sink = new HDFSEventSink();
        sink.setName("simpleHDFSTest-hdfs-sink");
        sink.configure(sinkCtx);
        sink.setChannel(channel);
        sink.start();

        // create two events
        String EVENT_BODY_1 = "yarg1";
        String EVENT_BODY_2 = "yarg2";
        channel.getTransaction().begin();
        try {
            channel.put(EventBuilder.withBody(EVENT_BODY_1, Charsets.UTF_8));
            channel.put(EventBuilder.withBody(EVENT_BODY_2, Charsets.UTF_8));
            channel.getTransaction().commit();
        } finally {
            channel.getTransaction().close();
        }

        // store events to HDFS
        sink.process();

        // shut down flume
        sink.stop();
        channel.stop();

        // verify that it's in HDFS and that its content is what we say it should be
        FileStatus[] statuses = fs.listStatus(outputDirPath);
        Assert.assertNotNull("No files found written to HDFS", statuses);
        Assert.assertEquals("Only one file expected", 1, statuses.length);

        for (FileStatus status : statuses) {
            Path filePath = status.getPath();
            logger.info("Found file on DFS: {}", filePath);
            FSDataInputStream stream = fs.open(filePath);
            BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(stream)));
            String line = reader.readLine();
            logger.info("First line in file {}: {}", filePath, line);
            Assert.assertEquals(EVENT_BODY_1, line);

            // The rest of this test is commented-out (will fail) for 2 reasons:
            //
            // (1) At the time of this writing, Hadoop has a bug which causes the
            // non-native gzip implementation to create invalid gzip files when
            // finish() and resetState() are called. See HADOOP-8522.
            //
            // (2) Even if HADOOP-8522 is fixed, the JDK GZipInputStream is unable
            // to read multi-member (concatenated) gzip files. See this Sun bug:
            // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4691425
            //
            //line = reader.readLine();
            //logger.info("Second line in file {}: {}", filePath, line);
            //Assert.assertEquals(EVENT_BODY_2, line);
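
            // As a hedged alternative (not part of the original test), a
            // concatenation-aware gzip reader could recover the second member,
            // assuming commons-compress were added to the test classpath
            // (note that (1) above would still apply until HADOOP-8522 is fixed):
            //
            //   BufferedReader multiReader = new BufferedReader(new InputStreamReader(
            //       new org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream(
            //           fs.open(filePath), true /* decompressConcatenated */),
            //       Charsets.UTF_8));
            //   Assert.assertEquals(EVENT_BODY_1, multiReader.readLine());
            //   Assert.assertEquals(EVENT_BODY_2, multiReader.readLine());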
        }

        if (!KEEP_DATA) {
            fs.delete(outputDirPath, true);
        }

        cluster.shutdown();
        cluster = null;
    }

    /**
     * Writes several events to HDFS, kills a DataNode mid-stream, and verifies
     * that the sink keeps writing, rolling new files as needed.
     */
    @Test
    public void underReplicationTest() throws EventDeliveryException, IOException {
        Configuration conf = new Configuration();
        conf.set("dfs.replication", String.valueOf(3));
        cluster = new MiniDFSCluster(conf, 3, true, null);
        cluster.waitActive();

        String outputDir = "/flume/underReplicationTest";
        Path outputDirPath = new Path(outputDir);

        logger.info("Running test with output dir: {}", outputDir);

        FileSystem fs = cluster.getFileSystem();
        // ensure output directory is empty
        if (fs.exists(outputDirPath)) {
            fs.delete(outputDirPath, true);
        }

        String nnURL = getNameNodeURL(cluster);
        logger.info("Namenode address: {}", nnURL);

        Context chanCtx = new Context();
        MemoryChannel channel = new MemoryChannel();
        channel.setName("simpleHDFSTest-mem-chan");
        channel.configure(chanCtx);
        channel.start();

        Context sinkCtx = new Context();
        sinkCtx.put("hdfs.path", nnURL + outputDir);
        sinkCtx.put("hdfs.fileType", HDFSWriterFactory.DataStreamType);
        sinkCtx.put("hdfs.batchSize", Integer.toString(1));

        HDFSEventSink sink = new HDFSEventSink();
        sink.setName("simpleHDFSTest-hdfs-sink");
        sink.configure(sinkCtx);
        sink.setChannel(channel);
        sink.start();

        // create several events
        channel.getTransaction().begin();
        try {
            channel.put(EventBuilder.withBody("yarg 1", Charsets.UTF_8));
            channel.put(EventBuilder.withBody("yarg 2", Charsets.UTF_8));
            channel.put(EventBuilder.withBody("yarg 3", Charsets.UTF_8));
            channel.put(EventBuilder.withBody("yarg 4", Charsets.UTF_8));
            channel.put(EventBuilder.withBody("yarg 5", Charsets.UTF_8));
            channel.put(EventBuilder.withBody("yarg 5", Charsets.UTF_8));
            channel.getTransaction().commit();
        } finally {
            channel.getTransaction().close();
        }

        // store events to HDFS
        logger.info("Running process(). Create new file.");
        sink.process(); // create new file;
        logger.info("Running process(). Same file.");
        sink.process();

        // kill a datanode
        logger.info("Killing datanode #1...");
        cluster.stopDataNode(0);

        // There is a race here: the client may or may not notice that the
        // datanode is dead before its next sync(), so this next call may or
        // may not roll a new file.

        logger.info("Running process(). Create new file? (racy)");
        sink.process();

        logger.info("Running process(). Create new file.");
        sink.process();

        logger.info("Running process(). Create new file.");
        sink.process();

        logger.info("Running process(). Create new file.");
        sink.process();

        // shut down flume
        sink.stop();
        channel.stop();

        // verify that it's in HDFS and that its content is what we say it should be
        FileStatus[] statuses = fs.listStatus(outputDirPath);
        Assert.assertNotNull("No files found written to HDFS", statuses);

        for (FileStatus status : statuses) {
            Path filePath = status.getPath();
            logger.info("Found file on DFS: {}", filePath);
            FSDataInputStream stream = fs.open(filePath);
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
            String line = reader.readLine();
            logger.info("First line in file {}: {}", filePath, line);
            Assert.assertTrue(line.startsWith("yarg"));
        }

        Assert.assertTrue("4 or 5 files expected, found " + statuses.length,
                statuses.length == 4 || statuses.length == 5);
        System.out.println("There are " + statuses.length + " files.");

        if (!KEEP_DATA) {
            fs.delete(outputDirPath, true);
        }

        cluster.shutdown();
        cluster = null;
    }

    /**
     * Writes a larger batch of events, kills a DataNode mid-stream, and verifies
     * how many files the sink rolls while the pipeline is under-replicated.
     */
    @Ignore("This test is flakey and causes tests to fail pretty often.")
    @Test
    public void maxUnderReplicationTest() throws EventDeliveryException, IOException {
        Configuration conf = new Configuration();
        conf.set("dfs.replication", String.valueOf(3));
        cluster = new MiniDFSCluster(conf, 3, true, null);
        cluster.waitActive();

        String outputDir = "/flume/underReplicationTest";
        Path outputDirPath = new Path(outputDir);

        logger.info("Running test with output dir: {}", outputDir);

        FileSystem fs = cluster.getFileSystem();
        // ensure output directory is empty
        if (fs.exists(outputDirPath)) {
            fs.delete(outputDirPath, true);
        }

        String nnURL = getNameNodeURL(cluster);
        logger.info("Namenode address: {}", nnURL);

        Context chanCtx = new Context();
        MemoryChannel channel = new MemoryChannel();
        channel.setName("simpleHDFSTest-mem-chan");
        channel.configure(chanCtx);
        channel.start();

        Context sinkCtx = new Context();
        sinkCtx.put("hdfs.path", nnURL + outputDir);
        sinkCtx.put("hdfs.fileType", HDFSWriterFactory.DataStreamType);
        sinkCtx.put("hdfs.batchSize", Integer.toString(1));

        HDFSEventSink sink = new HDFSEventSink();
        sink.setName("simpleHDFSTest-hdfs-sink");
        sink.configure(sinkCtx);
        sink.setChannel(channel);
        sink.start();

        // create a batch of events
        channel.getTransaction().begin();
        try {
            for (int i = 0; i < 50; i++) {
                channel.put(EventBuilder.withBody("yarg " + i, Charsets.UTF_8));
            }
            channel.getTransaction().commit();
        } finally {
            channel.getTransaction().close();
        }

        // store events to HDFS
        logger.info("Running process(). Create new file.");
        sink.process(); // create new file;
        logger.info("Running process(). Same file.");
        sink.process();

        // kill a datanode
        logger.info("Killing datanode #1...");
        cluster.stopDataNode(0);

        // There is a race here: the client may or may not notice that the
        // datanode is dead before its next sync(), so this next call may or
        // may not roll a new file.

        logger.info("Running process(). Create new file? (racy)");
        sink.process();

        for (int i = 3; i < 50; i++) {
            logger.info("Running process().");
            sink.process();
        }

        // shut down flume
        sink.stop();
        channel.stop();

        // verify that it's in HDFS and that its content is what we say it should be
        FileStatus[] statuses = fs.listStatus(outputDirPath);
        Assert.assertNotNull("No files found written to HDFS", statuses);

        for (FileStatus status : statuses) {
            Path filePath = status.getPath();
            logger.info("Found file on DFS: {}", filePath);
            FSDataInputStream stream = fs.open(filePath);
            BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
            String line = reader.readLine();
            logger.info("First line in file {}: {}", filePath, line);
            Assert.assertTrue(line.startsWith("yarg"));
        }

        System.out.println("There are " + statuses.length + " files.");
        Assert.assertEquals("31 files expected, found " + statuses.length, 31, statuses.length);

        if (!KEEP_DATA) {
            fs.delete(outputDirPath, true);
        }

        cluster.shutdown();
        cluster = null;
    }

    @AfterClass
    public static void teardownClass() {
        // restore system state, if needed
        if (oldTestBuildDataProp != null) {
            System.setProperty(TEST_BUILD_DATA_KEY, oldTestBuildDataProp);
        }

        if (!KEEP_DATA) {
            FileUtils.deleteQuietly(new File(DFS_DIR));
        }
    }

}