Java tutorial: writing MapReduce output to Kafka with a custom OutputFormat (kafka.bridge.hadoop2.KafkaOutputFormat)
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package kafka.bridge.hadoop2;

import java.io.IOException;
import java.net.URI;
import java.util.*;

import kafka.common.KafkaException;
import kafka.javaapi.producer.Producer;
import kafka.producer.ProducerConfig;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.log4j.Logger;

public class KafkaOutputFormat<K, V> extends OutputFormat<K, V>
{
  private Logger log = Logger.getLogger(KafkaOutputFormat.class);

  public static final String KAFKA_URL = "kafka.output.url";

  /**
   * Bytes to buffer before the OutputFormat does a send (i.e., the amortization window):
   * we set the default to a million bytes so that the server will not reject the batch of messages
   * with a MessageSizeTooLargeException. The actual size will be smaller after compression.
   */
  public static final int KAFKA_QUEUE_BYTES = 1000000;

  public static final String KAFKA_CONFIG_PREFIX = "kafka.output";

  private static final Map<String, String> kafkaConfigMap;
  static {
    Map<String, String> cMap = new HashMap<String, String>();

    // default Hadoop producer configs
    cMap.put("producer.type", "sync");
    cMap.put("compression.codec", Integer.toString(1));
    cMap.put("request.required.acks", Integer.toString(1));

    kafkaConfigMap = Collections.unmodifiableMap(cMap);
  }

  public KafkaOutputFormat()
  {
    super();
  }

  public static void setOutputPath(Job job, Path outputUrl)
  {
    job.getConfiguration().set(KafkaOutputFormat.KAFKA_URL, outputUrl.toString());

    // producing to Kafka is not idempotent, so disable speculative execution:
    // duplicate task attempts would publish duplicate messages
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  }

  public static Path getOutputPath(JobContext job)
  {
    String name = job.getConfiguration().get(KafkaOutputFormat.KAFKA_URL);
    return name == null ? null : new Path(name);
  }

  @Override
  public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException
  {
  }

  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException
  {
    // Is there a programmatic way to get the temp dir? I see it hardcoded everywhere in Hadoop, Hive, and Pig.
    return new FileOutputCommitter(new Path("/tmp/" + taskAttemptContext.getTaskAttemptID().getJobID().toString()),
                                   taskAttemptContext);
  }

  @Override
  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException
  {
    Path outputPath = getOutputPath(context);
    if (outputPath == null)
      throw new KafkaException("no kafka output url specified");
    URI uri = URI.create(outputPath.toString());
    Configuration job = context.getConfiguration();

    Properties props = new Properties();
    String topic;

    props.putAll(kafkaConfigMap);                       // inject default configuration
    for (Map.Entry<String, String> m : job) {           // handle any overrides
      if (!m.getKey().startsWith(KAFKA_CONFIG_PREFIX))
        continue;
      if (m.getKey().equals(KAFKA_URL))
        continue;

      String kafkaKeyName = m.getKey().substring(KAFKA_CONFIG_PREFIX.length() + 1);
      props.setProperty(kafkaKeyName, m.getValue());    // set Kafka producer property
    }

    // inject Kafka producer props back into jobconf for easier debugging
    for (Map.Entry<Object, Object> m : props.entrySet()) {
      job.set(KAFKA_CONFIG_PREFIX + "." + m.getKey().toString(), m.getValue().toString());
    }

    // KafkaOutputFormat specific parameters
    final int queueBytes = job.getInt(KAFKA_CONFIG_PREFIX + ".queue.bytes", KAFKA_QUEUE_BYTES);

    if (uri.getScheme().equals("kafka")) {
      // using the direct broker list
      // URL: kafka://<kafka host>/<topic>
      // e.g. kafka://kafka-server:9000,kafka-server2:9000/foobar
      String brokerList = uri.getAuthority();
      props.setProperty("metadata.broker.list", brokerList);
      job.set(KAFKA_CONFIG_PREFIX + ".metadata.broker.list", brokerList);

      if (uri.getPath() == null || uri.getPath().length() <= 1)
        throw new KafkaException("no topic specified in kafka uri");

      topic = uri.getPath().substring(1);               // ignore the initial '/' in the path
      job.set(KAFKA_CONFIG_PREFIX + ".topic", topic);
      log.info(String.format("using kafka broker %s (topic %s)", brokerList, topic));
    }
    else
      throw new KafkaException("missing scheme from kafka uri (must be kafka://)");

    Producer<Object, byte[]> producer = new Producer<Object, byte[]>(new ProducerConfig(props));
    return new KafkaRecordWriter<K, V>(producer, topic, queueBytes);
  }
}
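The writer returned on the last line, KafkaRecordWriter, lives in a separate file that is not part of this listing. Below is a minimal sketch consistent with the constructor call above (producer, topic, queueBytes): it buffers message payloads until roughly queueBytes bytes are queued, then sends them in one synchronous batch, which is the amortization behavior the KAFKA_QUEUE_BYTES comment describes. Treat this as an illustration of the contract, not the exact companion class.

package kafka.bridge.hadoop2;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class KafkaRecordWriter<K, V> extends RecordWriter<K, V>
{
  protected Producer<Object, byte[]> producer;
  protected String topic;
  protected int queueBytes;       // flush threshold, from kafka.output.queue.bytes

  protected List<KeyedMessage<Object, byte[]>> msgList = new LinkedList<KeyedMessage<Object, byte[]>>();
  protected int totalBytes = 0;

  public KafkaRecordWriter(Producer<Object, byte[]> producer, String topic, int queueBytes)
  {
    this.producer = producer;
    this.topic = topic;
    this.queueBytes = queueBytes;
  }

  protected void sendMsgList()
  {
    if (!msgList.isEmpty()) {
      producer.send(msgList);     // one synchronous batched send
      msgList.clear();
      totalBytes = 0;
    }
  }

  @Override
  public void write(K key, V value) throws IOException, InterruptedException
  {
    // the key is ignored; only the value bytes become the Kafka message payload
    byte[] valBytes;
    if (value instanceof byte[])
      valBytes = (byte[]) value;
    else if (value instanceof BytesWritable)
      valBytes = ((BytesWritable) value).copyBytes();
    else
      throw new IllegalArgumentException("KafkaRecordWriter expects byte[] or BytesWritable values");

    msgList.add(new KeyedMessage<Object, byte[]>(topic, valBytes));
    totalBytes += valBytes.length;
    if (totalBytes > queueBytes)  // amortize sends over ~queueBytes of buffered data
      sendMsgList();
  }

  @Override
  public void close(TaskAttemptContext context) throws IOException, InterruptedException
  {
    sendMsgList();                // flush anything still buffered
    producer.close();
  }
}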
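From the job-driver side, all that is needed is to select this output format and encode the broker list and topic in the kafka:// URL passed to setOutputPath(). The sketch below shows a map-only job that publishes every input line to Kafka; the class names PublishToKafka and LineToKafkaMapper, the input path argument, and the broker/topic in the URL are hypothetical placeholders.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class PublishToKafka
{
  // hypothetical mapper: emits each input line's bytes as the Kafka message payload
  public static class LineToKafkaMapper extends Mapper<LongWritable, Text, NullWritable, BytesWritable>
  {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
    {
      context.write(NullWritable.get(), new BytesWritable(value.copyBytes()));
    }
  }

  public static void main(String[] args) throws Exception
  {
    Job job = Job.getInstance(new Configuration(), "publish-to-kafka");
    job.setJarByClass(PublishToKafka.class);
    job.setMapperClass(LineToKafkaMapper.class);
    job.setNumReduceTasks(0);                            // map-only: records go straight to the RecordWriter
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(KafkaOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    // broker list and topic travel in the URL: kafka://<broker list>/<topic>
    KafkaOutputFormat.setOutputPath(job, new Path("kafka://kafka-server:9000/foobar"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}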
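One consequence of the prefix-stripping loop in getRecordWriter is that any Kafka producer property can be overridden per job from the driver, without touching this class. For example (the values here are only illustrative):

// "kafka.output." is stripped, so this becomes the producer property
// request.required.acks=-1, replacing the default of 1 from kafkaConfigMap
job.getConfiguration().set("kafka.output.request.required.acks", "-1");

// kafka.output.queue.bytes is read back by getRecordWriter as the flush
// threshold, lowering it from the 1 MB KAFKA_QUEUE_BYTES default
job.getConfiguration().setInt("kafka.output.queue.bytes", 100000);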