Java tutorial: indexing a Kafka stream into Apache Blur with Spark Streaming

The consumer below (org.apache.blur.spark.Consumer) receives messages from Kafka through one Spark receiver per topic partition, turns each message into a BlurMutate keyed by a digest of its payload, and writes each micro-batch into an Apache Blur table using Spark's saveAsNewAPIHadoopFile API.
package org.apache.blur.spark;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.blur.manager.BlurPartitioner;
import org.apache.blur.mapreduce.lib.BlurMapReduceUtil;
import org.apache.blur.mapreduce.lib.BlurMutate;
import org.apache.blur.mapreduce.lib.BlurOutputCommitter;
import org.apache.blur.mapreduce.lib.BlurOutputFormat;
import org.apache.blur.mapreduce.lib.DefaultBlurReducer;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

import scala.Tuple2;
import consumer.kafka.KafkaConfig;
import consumer.kafka.MessageAndMetadata;
import consumer.kafka.client.KafkaReceiver;

/*
 * This Consumer uses Spark RDD saveAsNewAPIHadoopFile API to index BlurMutate
 */
public class Consumer implements Serializable {

  private static final long serialVersionUID = 4332618245650072140L;

  private Properties _props;
  private KafkaConfig _kafkaConfig;

  public void start() throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    _kafkaConfig = new KafkaConfig(_props);
    run();
  }

  private void init(String[] args) throws Exception {
    Options options = new Options();
    this._props = new Properties();

    options.addOption("p", true, "properties filename from the classpath");
    options.addOption("P", true, "external properties filename");

    OptionBuilder.withArgName("property=value");
    OptionBuilder.hasArgs(2);
    OptionBuilder.withValueSeparator();
    OptionBuilder.withDescription("use value for given property");
    options.addOption(OptionBuilder.create("D"));

    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption('p')) {
      this._props.load(ClassLoader.getSystemClassLoader().getResourceAsStream(cmd.getOptionValue('p')));
    }
    if (cmd.hasOption('P')) {
      File file = new File(cmd.getOptionValue('P'));
      FileInputStream fStream = new FileInputStream(file);
      this._props.load(fStream);
    }
    this._props.putAll(cmd.getOptionProperties("D"));
  }
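  /*
   * The properties loaded by init() (via -p, -P or -D) configure the Kafka
   * receiver through consumer.kafka.KafkaConfig. The exact property keys are
   * defined by that library; the names below are illustrative only, so check
   * KafkaConfig for the authoritative list:
   *
   *   zookeeper.hosts=zkhost1,zkhost2
   *   zookeeper.port=2181
   *   kafka.topic=nrt-topic
   *   kafka.consumer.id=blur-nrt-consumer
   */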
  private void run() {

    String checkpointDirectory = "hdfs://10.252.5.113:9000/user/hadoop/spark";

    // Number of partitions of the Kafka topic; one receiver is created per partition.
    int _partitionCount = 5;

    List<JavaDStream<MessageAndMetadata>> streamsList = new ArrayList<JavaDStream<MessageAndMetadata>>(
        _partitionCount);
    JavaDStream<MessageAndMetadata> unionStreams;

    SparkConf conf = new SparkConf().setAppName("KafkaReceiver").set("spark.streaming.blockInterval", "200");

    // Path to the Blur libraries. They can be copied to each node of the Spark cluster.
    conf.set("spark.executor.extraClassPath", "/home/apache-blur-0.2.4/lib/*");

    // Use KryoSerializer for BlurMutate and Text.
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(3000));

    /*
     * Receive the Kafka stream. Create an individual receiver for each topic
     * partition.
     */
    for (int i = 0; i < _partitionCount; i++) {
      streamsList.add(ssc.receiverStream(new KafkaReceiver(_props, i)));
    }

    /*
     * Union all the streams if there is more than one stream.
     */
    if (streamsList.size() > 1) {
      unionStreams = ssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
    } else {
      // Otherwise, just use the one stream.
      unionStreams = streamsList.get(0);
    }

    /*
     * Generate the JavaPairDStream of (row id, BlurMutate).
     */
    JavaPairDStream<Text, BlurMutate> pairDStream = unionStreams
        .mapToPair(new PairFunction<MessageAndMetadata, Text, BlurMutate>() {

          private static final long serialVersionUID = 443235214978L;

          public Tuple2<Text, BlurMutate> call(MessageAndMetadata mmeta) {

            /*
             * Create the BlurMutate from the MessageAndMetadata.
             */
            String message = new String(mmeta.getPayload());
            String keyStr = DigestUtils.shaHex(message);
            Text key = new Text((keyStr).getBytes());
            BlurMutate mutate = new BlurMutate(BlurMutate.MUTATE_TYPE.REPLACE, keyStr, keyStr, "family");
            mutate.addColumn("message", message);

            return new Tuple2<Text, BlurMutate>(key, mutate);
          }
        });
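    /*
     * Note: the row id is the SHA-1 hex digest of the message payload and the
     * mutate type is REPLACE, so a re-delivered or duplicate message simply
     * overwrites the same Blur row instead of creating a new one.
     */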
    pairDStream.foreachRDD(new Function2<JavaPairRDD<Text, BlurMutate>, Time, Void>() {

      private static final long serialVersionUID = 88875777435L;

      @Override
      public Void call(JavaPairRDD<Text, BlurMutate> rdd, Time time) throws Exception {

        /*
         * Blur table details.
         */
        TableDescriptor tableDescriptor = new TableDescriptor();
        String tableUri = new Path("hdfs://10.252.5.113:9000/blur/tables/nrt").toString();
        tableDescriptor.tableUri = tableUri;
        tableDescriptor.cluster = "pearson";
        tableDescriptor.name = "nrt";
        tableDescriptor.shardCount = 9;
        Configuration conf = new Configuration();

        /*
         * Partition the RDD to match the Blur table shard count. A custom
         * partitioner is used to channel each BlurMutate to the correct shard.
         */
        final JavaPairRDD<Text, BlurMutate> pRdd = rdd
            .partitionBy(new BlurSparkPartitioner(tableDescriptor.shardCount))
            .persist(StorageLevel.MEMORY_ONLY_2());

        /*
         * Blur-specific configuration.
         */
        BlurOutputFormat.setIndexLocally(conf, false);
        BlurOutputFormat.setOptimizeInFlight(conf, false);
        conf.setClass("mapreduce.reduce.class", DefaultBlurReducer.class, Reducer.class);
        conf.setClass("mapreduce.outputformat.class", BlurOutputFormat.class, OutputFormat.class);
        conf.setClass("mapreduce.partitioner.class", BlurPartitioner.class, Partitioner.class);
        conf.set("mapred.output.committer.class", BlurOutputCommitter.class.getName());
        conf.setInt("blur.output.max.document.buffer.size", 10000);

        BlurOutputFormat.setTableDescriptor(conf, tableDescriptor);

        JobConf jobConf = new JobConf(conf);
        jobConf.setNumReduceTasks(tableDescriptor.shardCount);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(BlurMutate.class);

        BlurMapReduceUtil.addAllJarsInBlurLib(conf);
        BlurMapReduceUtil.addDependencyJars(conf, org.apache.zookeeper.ZooKeeper.class,
            org.apache.lucene.codecs.lucene42.Lucene42Codec.class, jobConf.getOutputKeyClass(),
            jobConf.getOutputValueClass());

        /*
         * Write the RDD to the Blur table.
         */
        if (pRdd.count() > 0) {
          pRdd.saveAsNewAPIHadoopFile(tableUri, Text.class, BlurMutate.class, BlurOutputFormat.class, jobConf);
        }

        return null;
      }
    });

    // ssc.checkpoint(checkpointDirectory);
    ssc.start();
    ssc.awaitTermination();
  }

  public static void main(String[] args) throws Exception {
    Consumer consumer = new Consumer();
    consumer.init(args);
    consumer.start();
  }
}
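The class above references BlurSparkPartitioner, which is not shown in this snippet. Below is a minimal sketch of what such a class could look like, assuming it only needs to map each Text row id onto one of the table's shardCount Spark partitions. The class name matches the reference above, but the hashing shown here is illustrative; the real implementation presumably delegates to Blur's own BlurPartitioner so that Spark partition numbers line up exactly with Blur shard ids.

package org.apache.blur.spark;

import org.apache.hadoop.io.Text;
import org.apache.spark.Partitioner;

/*
 * Hypothetical sketch, not the actual implementation: hashes the Text row id
 * into one of shardCount buckets so that rdd.partitionBy(...) yields one
 * Spark partition per Blur table shard.
 */
public class BlurSparkPartitioner extends Partitioner {

  private static final long serialVersionUID = 1L;

  private final int _shardCount;

  public BlurSparkPartitioner(int shardCount) {
    _shardCount = shardCount;
  }

  @Override
  public int numPartitions() {
    return _shardCount;
  }

  @Override
  public int getPartition(Object key) {
    // Mask off the sign bit so the result is always a valid partition index.
    return (((Text) key).hashCode() & Integer.MAX_VALUE) % _shardCount;
  }
}

To launch the job, submit the consumer with the Blur libraries on the executor class path and point it at a properties file, for example (the jar name and file path are illustrative):

spark-submit --class org.apache.blur.spark.Consumer --master spark://master:7077 blur-spark-consumer.jar -P /path/to/consumer.properties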