org.kitesdk.examples.spark.CorrelateEventsTask.java Source code

Java tutorial

Introduction

Here is the source code for org.kitesdk.examples.spark.CorrelateEventsTask.java

Source

/*
 * Copyright 2014 Cloudera, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kitesdk.examples.spark;

import com.google.common.base.Objects;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.kitesdk.data.event.CorrelatedEvents;
import org.kitesdk.data.event.StandardEvent;
import org.kitesdk.data.mapreduce.DatasetKeyInputFormat;
import org.kitesdk.data.mapreduce.DatasetKeyOutputFormat;
import scala.Tuple2;

/**
 * Spark job that reads {@link StandardEvent} records from one Kite dataset,
 * correlates "click" events with "alert" events coming from the same IP
 * address, and writes the resulting {@link CorrelatedEvents} records to
 * another Kite dataset.
 */
public class CorrelateEventsTask implements Serializable {

    /** Width of a correlation bucket and the maximum alert/click distance. */
    private static final long FIVE_MIN_MILLIS = TimeUnit.MINUTES.toMillis(5);
    String eventsUri;
    String correlatedEventsUri;

    /**
     * Creates a task bound to an input and an output dataset.
     *
     * @param eventsUri           URI of the dataset the events are read from
     * @param correlatedEventsUri URI of the dataset the correlated events are
     *                            written to
     */
    public CorrelateEventsTask(String eventsUri, String correlatedEventsUri) {
        this.eventsUri = eventsUri;
        this.correlatedEventsUri = correlatedEventsUri;
    }

    /**
     * This task correlates events based on IP address and timestamp. The goal is
     * to find any "click" events that come from the same IP address and occur
     * within 5 minutes of an "alert" event. The process works by first converting
     * timestamps into 5 minute increments. Each event is mapped to the 5 minute
     * mark at or before the event and to the next 5 minute mark after it, so
     * two events that are at most 5 minutes apart always share at least one
     * mark. These rounded timestamps are combined with the IP address of the
     * event to do an approximate self join of the data. The events are then
     * iterated over to check for two conditions:
     *
     *   1) There is an alert event in the same bucket
     *   2) That alert is actually no more than 5 minutes apart from the given click
     *
     * The task will write out all of the "alert" events that have at least one
     * "click" event from the same IP address and within 5 minutes along with the
     * list of "click" events that were correlated.
     *
     * NOTE: because every event is emitted under two keys, the same alert can
     * show up in two groups and may therefore be written more than once, each
     * time with the clicks found in that particular group.
     *
     * The Spark context created here is always stopped before this method
     * returns, whether the job succeeded or failed.
     *
     * @throws IOException if an I/O error is reported while setting up or
     *                     running the job
     */
    public void run() throws IOException {
        Configuration conf = new Configuration();
        DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
        DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);

        // Create our Spark configuration and get a Java context
        SparkConf sparkConf = new SparkConf().setAppName("Correlate Events")
                // Configure the use of Kryo serialization including our Avro registrator
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
                .set("spark.kryo.registrator", "org.kitesdk.examples.spark.AvroKyroRegistrator");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

        try {
            JavaPairRDD<StandardEvent, Void> events = sparkContext.newAPIHadoopRDD(conf,
                    DatasetKeyInputFormat.class, StandardEvent.class, Void.class);

            // Map each event to two correlation keys. One with the IP address and
            // the 5 minute mark at or before the event and one with the IP address
            // and the next 5 minute mark after the event
            JavaPairRDD<CorrelationKey, StandardEvent> mappedEvents = events.flatMapToPair(
                    new PairFlatMapFunction<Tuple2<StandardEvent, Void>, CorrelationKey, StandardEvent>() {
                        @Override
                        public Iterable<Tuple2<CorrelationKey, StandardEvent>> call(Tuple2<StandardEvent, Void> t)
                                throws Exception {
                            List<Tuple2<CorrelationKey, StandardEvent>> result =
                                    new ArrayList<Tuple2<CorrelationKey, StandardEvent>>(2);

                            StandardEvent event = t._1();
                            long loTimestamp = createLoTimestamp(event.getTimestamp());
                            long hiTimestamp = createHiTimestamp(event.getTimestamp());
                            String ip = event.getIp().toString();

                            result.add(new Tuple2<CorrelationKey, StandardEvent>(
                                    new CorrelationKey(loTimestamp, ip), event));
                            result.add(new Tuple2<CorrelationKey, StandardEvent>(
                                    new CorrelationKey(hiTimestamp, ip), event));

                            return result;
                        }
                    });

            // Group the events by their correlation key
            JavaPairRDD<CorrelationKey, Iterable<StandardEvent>> groupedEvents = mappedEvents.groupByKey();

            // Generate potential matches by creating a list of alerts along with
            // the matched list of clicks. If no alerts were found with this
            // correlation key, then output an empty pair
            JavaPairRDD<List<StandardEvent>, List<StandardEvent>> potentialMatches = groupedEvents.mapToPair(
                    new PairFunction<Tuple2<CorrelationKey, Iterable<StandardEvent>>, List<StandardEvent>, List<StandardEvent>>() {

                        @Override
                        public Tuple2<List<StandardEvent>, List<StandardEvent>> call(
                                Tuple2<CorrelationKey, Iterable<StandardEvent>> t) throws Exception {
                            List<StandardEvent> alerts = new ArrayList<StandardEvent>();
                            List<StandardEvent> clicks = new ArrayList<StandardEvent>();

                            // Events that are neither alerts nor clicks are dropped
                            for (StandardEvent event : t._2()) {
                                if (hasEventType(event, "alert")) {
                                    alerts.add(event);
                                } else if (hasEventType(event, "click")) {
                                    clicks.add(event);
                                }
                            }

                            if (alerts.isEmpty()) {
                                // Without an alert the clicks are irrelevant: emit two empty lists
                                return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, alerts);
                            }
                            return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, clicks);
                        }
                    });

            // Verify that the matched events are true matches (i.e. the timestamps
            // are really less than or equal to 5 minutes apart)
            JavaPairRDD<CorrelatedEvents, Void> matches = potentialMatches.flatMapToPair(
                    new PairFlatMapFunction<Tuple2<List<StandardEvent>, List<StandardEvent>>, CorrelatedEvents, Void>() {

                        @Override
                        public Iterable<Tuple2<CorrelatedEvents, Void>> call(
                                Tuple2<List<StandardEvent>, List<StandardEvent>> t) throws Exception {
                            List<Tuple2<CorrelatedEvents, Void>> results =
                                    new ArrayList<Tuple2<CorrelatedEvents, Void>>();
                            List<StandardEvent> alerts = t._1();
                            List<StandardEvent> clicks = t._2();

                            for (StandardEvent alert : alerts) {
                                List<StandardEvent> correlated = new ArrayList<StandardEvent>();
                                for (StandardEvent click : clicks) {
                                    if (Math.abs(alert.getTimestamp() - click.getTimestamp()) <= FIVE_MIN_MILLIS) {
                                        correlated.add(click);
                                    }
                                }
                                // Alerts without any correlated click are not written
                                if (!correlated.isEmpty()) {
                                    CorrelatedEvents record = CorrelatedEvents.newBuilder().setEvent(alert)
                                            .setCorrelated(correlated).build();
                                    results.add(new Tuple2<CorrelatedEvents, Void>(record, null));
                                }
                            }

                            return results;
                        }
                    });

            // Write the data to a Kite dataset. The destination was configured on
            // conf above via DatasetKeyOutputFormat; "dummy" is only the path
            // argument required by the method signature.
            matches.saveAsNewAPIHadoopFile("dummy", CorrelatedEvents.class, Void.class,
                    DatasetKeyOutputFormat.class, conf);
        } finally {
            // Release the context even when the job fails
            sparkContext.stop();
        }
    }

    /**
     * Tells whether the "type" entry of the event's details equals the given
     * value.
     *
     * @param event the event to inspect
     * @param type  the expected type, e.g. "alert" or "click"
     * @return true if the event has details with a non-null "type" entry whose
     *         string form equals {@code type}; false otherwise
     */
    private static boolean hasEventType(StandardEvent event, String type) {
        if (event.getEventDetails() == null) {
            return false;
        }
        // The details map is looked up with an Avro Utf8 key
        Object value = event.getEventDetails().get(new Utf8("type"));
        return value != null && type.equals(value.toString());
    }

    /**
     * Rounds a timestamp down to a multiple of five minutes. For non-negative
     * timestamps this is the 5 minute mark at or before the timestamp.
     *
     * @param timestamp event time in milliseconds
     * @return the lower bucket boundary in milliseconds
     */
    private static long createLoTimestamp(long timestamp) {
        return timestamp - (timestamp % FIVE_MIN_MILLIS);
    }

    /**
     * Returns the bucket boundary that lies five minutes after the one returned
     * by {@link #createLoTimestamp(long)} for the same timestamp.
     *
     * @param timestamp event time in milliseconds
     * @return the upper bucket boundary in milliseconds
     */
    private static long createHiTimestamp(long timestamp) {
        return timestamp - (timestamp % FIVE_MIN_MILLIS) + FIVE_MIN_MILLIS;
    }

    /**
     * Grouping key made of a rounded timestamp and an IP address. Two keys are
     * equal when both components are equal.
     */
    private static class CorrelationKey implements Serializable {
        Long timeStamp;
        String ip;

        /**
         * @param timeStamp rounded timestamp in milliseconds
         * @param ip        IP address of the event
         */
        public CorrelationKey(Long timeStamp, String ip) {
            this.timeStamp = timeStamp;
            this.ip = ip;
        }

        public String getIp() {
            return ip;
        }

        public void setIp(String ip) {
            this.ip = ip;
        }

        public Long getTimeStamp() {
            return timeStamp;
        }

        public void setTimeStamp(Long timeStamp) {
            this.timeStamp = timeStamp;
        }

        /** Compares class, timestamp and IP; null-safe on both fields. */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final CorrelationKey other = (CorrelationKey) obj;

            return Objects.equal(this.timeStamp, other.timeStamp) && Objects.equal(this.ip, other.ip);
        }

        /** Hashes the same two fields that {@link #equals(Object)} compares. */
        @Override
        public int hashCode() {
            return Objects.hashCode(timeStamp, ip);
        }
    }
}