com.cloudera.flume.handlers.hdfs.CustomDfsSink.java Source code

Introduction

Here is the source code for com.cloudera.flume.handlers.hdfs.CustomDfsSink.java
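
Before diving into the listing, here is a minimal usage sketch of the sink's lifecycle as calling code might exercise it. The format name "raw", the HDFS URI, and the event body are illustrative assumptions rather than anything taken from this file; check FormatFactory for the formats actually registered in your build.

import com.cloudera.flume.conf.FlumeBuilder;
import com.cloudera.flume.core.EventImpl;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.hdfs.CustomDfsSink;
import com.cloudera.flume.handlers.text.FormatFactory;
import com.cloudera.flume.handlers.text.output.OutputFormat;

public class CustomDfsSinkSketch {
    public static void main(String[] args) throws Exception {
        // "raw" is an assumed format name; the sink accepts any registered OutputFormat.
        OutputFormat fmt = FlumeBuilder.createFormat(FormatFactory.get(), "raw");
        EventSink sink = new CustomDfsSink("hdfs://namenode:8020/flume/events", fmt);
        sink.open();    // creates the HDFS file, compressed if a codec is configured
        sink.append(new EventImpl("hello flume".getBytes()));  // formatted by the OutputFormat
        sink.close();   // flushes and closes the underlying writer
    }
}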

Source

/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.handlers.hdfs;

import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Compressor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeBuilder;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.SinkFactory.SinkBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.text.FormatFactory;
import com.cloudera.flume.handlers.text.output.OutputFormat;
import com.cloudera.flume.reporter.ReportEvent;
import com.cloudera.util.PathManager;
import com.google.common.base.Preconditions;

/**
 * This creates a raw Hadoop DFS file whose contents are formatted by the
 * provided OutputFormat. It is assumed that the output is a file of some sort.
 */
public class CustomDfsSink extends EventSink.Base {
    static final Logger LOG = LoggerFactory.getLogger(CustomDfsSink.class);

    private static final String A_OUTPUTFORMAT = "recordformat";

    boolean compressOutput;
    OutputFormat format;
    OutputStream writer;
    AtomicLong count = new AtomicLong();
    String path;
    Path dstPath;
    PathManager pathManager;

    public CustomDfsSink(String path, OutputFormat format) {
        Preconditions.checkArgument(path != null);
        Preconditions.checkArgument(format != null);
        this.path = path;
        this.format = format;
        this.writer = null;
    }

    @Override
    public void append(Event e) throws IOException, InterruptedException {
        if (writer == null) {
            throw new IOException("Append failed, did you open the writer?");
        }

        format.format(writer, e);
        count.getAndIncrement();
        super.append(e);
    }

    @Override
    public void close() throws IOException {
        LOG.info("Closing HDFS file: " + pathManager.getOpenPath());
        writer.flush();
        LOG.info("done writing raw file to hdfs");
        writer.close();
        pathManager.close();
        writer = null;
    }

    /**
     * Hadoop compression codecs that use native libraries require an instance of
     * a Configuration object, because they check whether or not the native libs
     * have been loaded. GzipCodec, LzoCodec, and LzopCodec all require native
     * libs. GzipCodec is a partial exception: if the native libs are not
     * accessible it falls back to pure Java, producing notices rather than
     * errors. BZip2Codec is an example of a codec that does not use native libs.
     */
    @Override
    public void open() throws IOException {
        FlumeConfiguration conf = FlumeConfiguration.get();
        FileSystem hdfs;
        String codecName = conf.getCollectorDfsCompressCodec();
        CompressionCodec codec = getCodec(conf, codecName);

        if (codec == null) {
            dstPath = new Path(path);
            hdfs = dstPath.getFileSystem(conf);
            pathManager = new PathManager(hdfs, dstPath.getParent(), dstPath.getName());
            writer = pathManager.open();
            LOG.info("Creating HDFS file: " + pathManager.getOpenPath());
            return;
        }

        Compressor cmp = codec.createCompressor();
        dstPath = new Path(path + codec.getDefaultExtension());
        hdfs = dstPath.getFileSystem(conf);
        pathManager = new PathManager(hdfs, dstPath.getParent(), dstPath.getName());
        writer = pathManager.open();
        try {
            writer = codec.createOutputStream(writer, cmp);
        } catch (NullPointerException npe) {
            // createOutputStream first tries to find the "native" version of the
            // codec; if that fails, it tries the Java version. If there is no Java
            // version either, it exits via NPE. We catch that and convert it into
            // an IOException with a more useful error message.
            LOG.error("Unable to load compression codec " + codec);
            throw new IOException("Unable to load compression codec " + codec);
        }
        LOG.info("Creating " + codec + " compressed HDFS file: " + pathManager.getOpenPath());
    }

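    /**
     * Returns true if codecName refers to the given codec class. The
     * fully-qualified class name must match exactly; the simple class name
     * matches case-insensitively, with or without the trailing "Codec", so
     * "gzip", "GZIP" and "GzipCodec" all match GzipCodec.
     */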
    private static boolean codecMatches(Class<? extends CompressionCodec> cls, String codecName) {
        String simpleName = cls.getSimpleName();
        if (cls.getName().equals(codecName) || simpleName.equalsIgnoreCase(codecName)) {
            return true;
        }
        if (simpleName.endsWith("Codec")) {
            String prefix = simpleName.substring(0, simpleName.length() - "Codec".length());
            if (prefix.equalsIgnoreCase(codecName)) {
                return true;
            }
        }
        return false;
    }

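    /**
     * Resolves a codec name against the codecs Hadoop knows about. Returns null
     * when codecName is "None" (meaning no compression); throws
     * IllegalArgumentException for an unrecognized name, listing the supported
     * codecs. Codecs that implement Configurable are handed the Configuration so
     * that their native-library checks can run.
     */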
    public static CompressionCodec getCodec(Configuration conf, String codecName) {
        List<Class<? extends CompressionCodec>> codecs = CompressionCodecFactory
                .getCodecClasses(FlumeConfiguration.get());
        // We would like to base this on DefaultCodec, but it appears that not all
        // codecs extend DefaultCodec (e.g. Lzo).
        CompressionCodec codec = null;
        ArrayList<String> codecStrs = new ArrayList<String>();
        codecStrs.add("None");
        for (Class<? extends CompressionCodec> cls : codecs) {
            codecStrs.add(cls.getSimpleName());

            if (codecMatches(cls, codecName)) {
                try {
                    codec = cls.newInstance();
                } catch (InstantiationException e) {
                    LOG.error("Unable to instantiate " + cls + " class");
                } catch (IllegalAccessException e) {
                    LOG.error("Unable to access " + cls + " class");
                }
            }
        }

        if (codec == null) {
            if (!codecName.equalsIgnoreCase("None")) {
                throw new IllegalArgumentException(
                        "Unsupported compression codec " + codecName + ".  Please choose from: " + codecStrs);
            }
        } else if (codec instanceof Configurable) {
            // Must check instanceof because BZip2Codec doesn't implement Configurable.
            // The configuration must be set on Configurable codecs that may use
            // native libs.
            ((Configurable) codec).setConf(conf);
        }
        return codec;
    }

    public static SinkBuilder builder() {
        return new SinkBuilder() {
            @Override
            public EventSink create(Context context, Object... args) {
                if (args.length != 2 && args.length != 1) {
                    // TODO (jon) make this message easier.
                    throw new IllegalArgumentException(
                            "usage: customdfs(\"[(hdfs|file|s3n|...)://namenode[:port]]/path\", format)");
                }

                Object format = (args.length == 1) ? null : args[1];
                OutputFormat fmt;
                try {
                    fmt = FlumeBuilder.createFormat(FormatFactory.get(), format);
                } catch (FlumeSpecException e) {
                    LOG.error("failed to load format " + format, e);
                    throw new IllegalArgumentException("failed to load format " + format);
                }
                return new CustomDfsSink(args[0].toString(), fmt);
            }

            @Deprecated
            @Override
            public EventSink build(Context context, String... args) {
                // the updated interface calls create(Context, Object...) instead
                throw new RuntimeException("Old sink builder for CustomDfsSink should not be exercised");
            }
        };
    }

    @Override
    public String getName() {
        return "CustomDfs";
    }

    @Override
    public ReportEvent getMetrics() {
        ReportEvent rpt = super.getMetrics();
        rpt.setStringMetric(A_OUTPUTFORMAT, format.getBuilder().getName());
        rpt.setLongMetric(ReportEvent.A_COUNT, count.get());
        return rpt;
    }

    @Deprecated
    @Override
    public ReportEvent getReport() {
        ReportEvent rpt = super.getReport();
        rpt.setStringMetric(A_OUTPUTFORMAT, format.getBuilder().getName());
        rpt.setLongMetric(ReportEvent.A_COUNT, count.get());
        return rpt;
    }
}
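
As a follow-up, here is a small sketch of the codec lookup on its own, assuming Hadoop's default codec list is on the classpath. The names passed below are examples of what codecMatches accepts; "None" is the sentinel that open() treats as "write uncompressed".

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;

import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.handlers.hdfs.CustomDfsSink;

public class CodecLookupSketch {
    public static void main(String[] args) {
        Configuration conf = FlumeConfiguration.get();
        // "gzip" matches GzipCodec: matching is case-insensitive and the "Codec"
        // suffix is optional. An unrecognized name throws IllegalArgumentException
        // listing the supported codecs.
        CompressionCodec gzip = CustomDfsSink.getCodec(conf, "gzip");
        CompressionCodec none = CustomDfsSink.getCodec(conf, "None");
        System.out.println("gzip -> " + gzip.getClass().getName());
        System.out.println("None -> " + (none == null ? "no compression" : none.getClass().getName()));
    }
}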