kafka.etl.impl.DataGenerator.java Source code

Here is the source code for kafka.etl.impl.DataGenerator.java. The class produces test events to a Kafka server, one random timestamp per message, and then writes an offset file (a SequenceFile containing a KafkaETLRequest) that a Kafka ETL Hadoop job can use as its input.

Source

/*
 * Copyright 2010 LinkedIn
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package kafka.etl.impl;

import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Random;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;

import kafka.etl.KafkaETLKey;
import kafka.etl.KafkaETLRequest;
import kafka.etl.Props;
import kafka.javaapi.message.ByteBufferMessageSet;
import kafka.message.Message;
import kafka.javaapi.producer.SyncProducer;
import kafka.producer.SyncProducerConfig;

/**
 * Use this class to produce test events to a Kafka server. Each event
 * contains a random timestamp in text format.
 */
@SuppressWarnings("deprecation")
public class DataGenerator {

    protected final static Random RANDOM = new Random(System.currentTimeMillis());

    protected Props _props;
    protected SyncProducer _producer = null;
    protected URI _uri = null;
    protected String _topic;
    protected int _count;
    protected String _offsetsDir;
    protected final int TCP_BUFFER_SIZE = 300 * 1000;
    protected final int CONNECT_TIMEOUT = 20000; // ms
    protected final int RECONNECT_INTERVAL = Integer.MAX_VALUE; // ms

    public DataGenerator(String id, Props props) throws Exception {
        _props = props;
        _topic = props.getProperty("kafka.etl.topic");
        System.out.println("topics=" + _topic);
        _count = props.getInt("event.count");

        _offsetsDir = _props.getProperty("input");

        // initialize kafka producer to generate count events
        String serverUri = _props.getProperty("kafka.server.uri");
        _uri = new URI(serverUri);

        System.out.println("server uri:" + _uri.toString());
        Properties producerProps = new Properties();
        producerProps.put("host", _uri.getHost());
        producerProps.put("port", String.valueOf(_uri.getPort()));
        producerProps.put("buffer.size", String.valueOf(TCP_BUFFER_SIZE));
        producerProps.put("connect.timeout.ms", String.valueOf(CONNECT_TIMEOUT));
        producerProps.put("reconnect.interval", String.valueOf(RECONNECT_INTERVAL));
        _producer = new SyncProducer(new SyncProducerConfig(producerProps));

    }

    /**
     * Generate the configured number of events, send them to the topic as a
     * single uncompressed message set, then write the offset file.
     */
    public void run() throws Exception {

        List<Message> list = new ArrayList<Message>();
        for (int i = 0; i < _count; i++) {
            // mask off the sign bit so the timestamp is always non-negative;
            // negating alone would leave Long.MIN_VALUE negative
            long timestamp = RANDOM.nextLong() & Long.MAX_VALUE;
            byte[] bytes = Long.toString(timestamp).getBytes("UTF-8");
            Message message = new Message(bytes);
            list.add(message);
        }
        // send events
        System.out.println(" send " + list.size() + " " + _topic + " count events to " + _uri);
        _producer.send(_topic, new ByteBufferMessageSet(kafka.message.NoCompressionCodec$.MODULE$, list));

        // close the producer
        _producer.close();

        // generate offset files
        generateOffsets();
    }

    /**
     * Write a single offset file ("1.dat") under the input directory: a
     * SequenceFile containing one KafkaETLRequest for this topic and server,
     * which a Kafka ETL job can use as its starting input.
     */
    protected void generateOffsets() throws Exception {
        JobConf conf = new JobConf();
        conf.set("hadoop.job.ugi", _props.getProperty("hadoop.job.ugi"));
        conf.setCompressMapOutput(false);
        Path outPath = new Path(_offsetsDir + Path.SEPARATOR + "1.dat");
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath))
            fs.delete(outPath, false);

        KafkaETLRequest request = new KafkaETLRequest(_topic, "tcp://" + _uri.getHost() + ":" + _uri.getPort(), 0);

        System.out.println("Dump " + request.toString() + " to " + outPath.toUri().toString());
        byte[] bytes = request.toString().getBytes("UTF-8");
        KafkaETLKey dummyKey = new KafkaETLKey();
        SequenceFile.setCompressionType(conf, SequenceFile.CompressionType.NONE);
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outPath, KafkaETLKey.class,
                BytesWritable.class);
        writer.append(dummyKey, new BytesWritable(bytes));
        writer.close();
    }

    public static void main(String[] args) throws Exception {

        if (args.length < 1)
            throw new Exception("Usage: DataGenerator <config_file>");

        Props props = new Props(args[0]);
        DataGenerator job = new DataGenerator("DataGenerator", props);
        job.run();
    }
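
    /**
     * Not part of the original class: a minimal sketch that reads back the
     * offset file written by generateOffsets(), to verify what was written.
     * It assumes the same "input" directory configured above.
     */
    protected void dumpOffsets() throws Exception {
        JobConf conf = new JobConf();
        Path inPath = new Path(_offsetsDir + Path.SEPARATOR + "1.dat");
        FileSystem fs = inPath.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inPath, conf);
        KafkaETLKey key = new KafkaETLKey();
        BytesWritable value = new BytesWritable();
        // each record pairs a KafkaETLKey with a UTF-8 encoded request string
        while (reader.next(key, value))
            System.out.println(new String(value.getBytes(), 0, value.getLength(), "UTF-8"));
        reader.close();
    }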

}
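
The class reads its configuration from a properties file passed as the only
command-line argument. The keys below are the ones the code actually reads;
the values are illustrative and depend on your broker and HDFS setup:

    kafka.etl.topic=test-topic
    event.count=100
    input=/tmp/kafka/data
    kafka.server.uri=tcp://localhost:9092
    hadoop.job.ugi=hadoop,hadoop

With the Kafka and Hadoop jars on the classpath, run it as, for example:

    java kafka.etl.impl.DataGenerator datagenerator.properties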