co.cask.hydrator.plugin.HDFSSink.java Source code

Java tutorial

Introduction

Here is the source code for co.cask.hydrator.plugin.HDFSSink.java

Source

/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.hydrator.common.ReferenceBatchSink;
import co.cask.hydrator.common.ReferencePluginConfig;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;

/**
 * Batch sink that writes {@link StructuredRecord}s to HDFS as text.
 *
 * <p>Each record becomes one output line made of its field values, in schema order, joined with
 * commas. Values are rendered with {@code toString()} and are not quoted or escaped, so a value
 * that itself contains a comma or a line break cannot be told apart from a separator in the output.
 */
@Plugin(type = "batchsink")
@Name("HDFS")
@Description("Batch HDFS Sink")
public class HDFSSink extends ReferenceBatchSink<StructuredRecord, Text, NullWritable> {
    /** Separator placed between field values on each output line. */
    private static final String FIELD_DELIMITER = ",";

    private HDFSSinkConfig config;

    /**
     * Creates the sink.
     *
     * @param config plugin configuration holding the reference name, destination path and
     *               optional time suffix pattern
     */
    public HDFSSink(HDFSSinkConfig config) {
        super(config);
        this.config = config;
    }

    /**
     * Validates the configuration when the pipeline is configured.
     *
     * <p>If a time suffix is set, it is compiled as a {@link SimpleDateFormat} pattern so that an
     * invalid pattern is rejected here rather than when a run is being prepared.
     *
     * @param pipelineConfigurer configurer passed through to the superclass
     * @throws IllegalArgumentException if the time suffix is not a valid {@link SimpleDateFormat} pattern
     */
    @Override
    public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
        super.configurePipeline(pipelineConfigurer);
        if (Strings.isNullOrEmpty(config.timeSufix)) {
            return;
        }
        try {
            // Constructed only for validation; the instance itself is discarded.
            new SimpleDateFormat(config.timeSufix);
        } catch (IllegalArgumentException e) {
            // Same exception type as before, but naming the offending property and pattern.
            throw new IllegalArgumentException(
                    String.format("Invalid time suffix format '%s' for property 'suffix': %s",
                            config.timeSufix, e.getMessage()), e);
        }
    }

    /**
     * Registers the output for this run under the configured reference name, aliased to the
     * configured path.
     *
     * @param context context of the run being prepared; also supplies the logical start time used
     *                for the time suffix
     * @throws Exception declared by the overridden method
     */
    @Override
    public void prepareRun(BatchSinkContext context) throws Exception {
        context.addOutput(
                Output.of(config.referenceName, new SinkOutputFormatProvider(config, context)).alias(config.path));
    }

    /**
     * Converts one record into a single comma-separated line of text.
     *
     * <p>Fields are written in schema order. A field whose value is {@code null} is written as an
     * empty column, which keeps column positions aligned across records instead of failing with a
     * {@link NullPointerException}.
     *
     * @param input   record to convert
     * @param emitter receives one key/value pair whose key is the text line and whose value is
     *                {@link NullWritable}
     * @throws Exception declared by the overridden method
     */
    @Override
    public void transform(StructuredRecord input, Emitter<KeyValue<Text, NullWritable>> emitter) throws Exception {
        List<Schema.Field> fields = input.getSchema().getFields();
        List<String> dataArray = new ArrayList<>(fields.size());
        for (Schema.Field field : fields) {
            Object value = input.get(field.getName());
            dataArray.add(value == null ? "" : value.toString());
        }
        emitter.emit(new KeyValue<>(new Text(Joiner.on(FIELD_DELIMITER).join(dataArray)), NullWritable.get()));
    }

    /**
     * HDFS Sink Output Provider. Supplies the output format class name and the Hadoop
     * configuration entry for the output directory.
     */
    public static class SinkOutputFormatProvider implements OutputFormatProvider {
        private final Map<String, String> conf;
        private final HDFSSinkConfig config;

        /**
         * Computes the output directory for a run as {@code <path>/<formatted suffix>}.
         *
         * <p>The suffix is the run's logical start time formatted with the configured pattern; when
         * no pattern is configured the suffix is empty, leaving {@code <path>/}.
         *
         * @param config  sink configuration providing the path and optional time suffix pattern
         * @param context run context providing the logical start time
         */
        public SinkOutputFormatProvider(HDFSSinkConfig config, BatchSinkContext context) {
            this.conf = new HashMap<>();
            this.config = config;
            String timeSuffix = !Strings.isNullOrEmpty(config.timeSufix)
                    ? new SimpleDateFormat(config.timeSufix).format(context.getLogicalStartTime())
                    : "";
            conf.put(FileOutputFormat.OUTDIR, String.format("%s/%s", config.path, timeSuffix));
        }

        /**
         * @return the class name of the output format to use; currently always the text format
         */
        @Override
        public String getOutputFormatClassName() {
            return outputFormatClassName("TEXT");
        }

        /**
         * @return the configuration map built in the constructor, containing the output directory
         */
        @Override
        public Map<String, String> getOutputFormatConfiguration() {
            return conf;
        }
    }

    /**
     * Resolves an output format option to an output format class name.
     *
     * @param option requested format; currently ignored, kept as the extension point for
     *               supporting more output formats
     * @return the class name of {@link TextOutputFormat}
     */
    private static String outputFormatClassName(String option) {
        // Use option to extend it to more output formats
        return TextOutputFormat.class.getName();
    }

    /**
     * Config for HDFSSinkConfig.
     */
    public static class HDFSSinkConfig extends ReferencePluginConfig {

        @Name("path")
        @Description("HDFS Destination Path Prefix. For example, 'hdfs://mycluster.net:8020/output")
        private String path;

        // The example uses 'yyyy' (calendar year). 'YYYY' is the week year in SimpleDateFormat and
        // yields the wrong year for dates around New Year.
        @Name("suffix")
        @Description("Time Suffix used for destination directory for each run. For example, 'yyyy-MM-dd-HH-mm'. "
                + "By default, no time suffix is used.")
        @Nullable
        private String timeSufix;

        /**
         * Creates a sink configuration.
         *
         * @param referenceName reference name passed to the superclass
         * @param path          destination path prefix
         * @param suffix        {@link SimpleDateFormat} pattern for the per-run directory suffix;
         *                      may be {@code null} or empty for no suffix
         * @param outputFormat  not used by this constructor; the sink always writes text
         */
        public HDFSSinkConfig(String referenceName, String path, String suffix, String outputFormat) {
            super(referenceName);
            this.path = path;
            this.timeSufix = suffix;
        }
    }
}