Java tutorial
/*
 * Copyright 2014 Cloudera, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.examples.spark;

import com.google.common.base.Objects;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.kitesdk.data.event.CorrelatedEvents;
import org.kitesdk.data.event.StandardEvent;
import org.kitesdk.data.mapreduce.DatasetKeyInputFormat;
import org.kitesdk.data.mapreduce.DatasetKeyOutputFormat;
import scala.Tuple2;

public class CorrelateEventsTask implements Serializable {

  private static final long FIVE_MIN_MILLIS = TimeUnit.MINUTES.toMillis(5);

  String eventsUri;
  String correlatedEventsUri;

  public CorrelateEventsTask(String eventsUri, String correlatedEventsUri) {
    this.eventsUri = eventsUri;
    this.correlatedEventsUri = correlatedEventsUri;
  }

  /*
   * This task correlates events based on IP address and timestamp. The goal is
   * to find any "click" events that come from the same IP address and occur
   * within 5 minutes of an "alert" event. The process works by first converting
   * timestamps into 5 minute increments. This means each event will be mapped
   * to the nearest 5 minute mark before the event happened and the nearest
   * 5 minute mark after the event happened. These rounded timestamps are
   * combined with the IP address of the event to do an approximate self join of
   * the data. The events are then iterated over to check for two conditions:
   *
   * 1) There is an alert event in the same bucket
   * 2) That alert is actually less than 5 minutes apart from the given click
   *
   * The task will write out all of the "alert" events that have at least one
   * "click" event from the same IP address within 5 minutes, along with the
   * list of "click" events that were correlated.
   */
  public void run() throws IOException {
    Configuration conf = new Configuration();
    DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
    DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);

    // Create our Spark configuration and get a Java context
    SparkConf sparkConf = new SparkConf()
        .setAppName("Correlate Events")
        // Configure the use of Kryo serialization including our Avro registrator
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .set("spark.kryo.registrator", "org.kitesdk.examples.spark.AvroKyroRegistrator");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    JavaPairRDD<StandardEvent, Void> events = sparkContext.newAPIHadoopRDD(conf,
        DatasetKeyInputFormat.class, StandardEvent.class, Void.class);
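    // To illustrate the bucketing with hypothetical wall-clock times: an event
    // at 12:07 is keyed by the 12:05 mark (nearest 5 minute mark before it)
    // and by the 12:10 mark (nearest 5 minute mark after it). Any two events
    // from the same IP address that are within 5 minutes of each other will
    // therefore share at least one (mark, IP) correlation key.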
    // Map each event to two correlation keys: one with the IP address and the
    // nearest 5 minute interval that happened before the event, and one with
    // the IP address and the nearest 5 minute interval that happened after
    // the event
    JavaPairRDD<CorrelationKey, StandardEvent> mappedEvents = events.flatMapToPair(
        new PairFlatMapFunction<Tuple2<StandardEvent, Void>, CorrelationKey, StandardEvent>() {
          @Override
          public Iterable<Tuple2<CorrelationKey, StandardEvent>> call(Tuple2<StandardEvent, Void> t)
              throws Exception {
            List<Tuple2<CorrelationKey, StandardEvent>> result =
                new ArrayList<Tuple2<CorrelationKey, StandardEvent>>(2);
            StandardEvent event = t._1();
            long loTimestamp = createLoTimestamp(event.getTimestamp());
            long hiTimestamp = createHiTimestamp(event.getTimestamp());
            String ip = event.getIp().toString();

            result.add(new Tuple2<CorrelationKey, StandardEvent>(
                new CorrelationKey(loTimestamp, ip), event));
            result.add(new Tuple2<CorrelationKey, StandardEvent>(
                new CorrelationKey(hiTimestamp, ip), event));

            return result;
          }
        });

    // Group the events by their correlation key
    JavaPairRDD<CorrelationKey, Iterable<StandardEvent>> groupedEvents = mappedEvents.groupByKey();

    // Generate potential matches by creating a list of alerts along with the
    // matched list of clicks. If no alerts were found with this correlation
    // key, then output an empty pair
    JavaPairRDD<List<StandardEvent>, List<StandardEvent>> potentialMatches = groupedEvents.mapToPair(
        new PairFunction<Tuple2<CorrelationKey, Iterable<StandardEvent>>, List<StandardEvent>, List<StandardEvent>>() {
          @Override
          public Tuple2<List<StandardEvent>, List<StandardEvent>> call(
              Tuple2<CorrelationKey, Iterable<StandardEvent>> t) throws Exception {
            Iterable<StandardEvent> allEvents = t._2();
            List<StandardEvent> alerts = new ArrayList<StandardEvent>();
            List<StandardEvent> clicks = new ArrayList<StandardEvent>();

            // Split the bucket's events into alerts and clicks based on the
            // "type" entry in the event details map
            for (StandardEvent event : allEvents) {
              if (event.getEventDetails() != null
                  && event.getEventDetails().containsKey(new Utf8("type"))
                  && "alert".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                alerts.add(event);
              } else if (event.getEventDetails() != null
                  && event.getEventDetails().containsKey(new Utf8("type"))
                  && "click".equals(event.getEventDetails().get(new Utf8("type")).toString())) {
                clicks.add(event);
              }
            }

            // If there are no alerts, emit a pair of empty lists so the next
            // stage has nothing to correlate for this key
            if (alerts.isEmpty()) {
              return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, alerts);
            } else {
              return new Tuple2<List<StandardEvent>, List<StandardEvent>>(alerts, clicks);
            }
          }
        });
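    // Sharing a correlation key only means two events touch the same 5 minute
    // mark, not that they are within 5 minutes of each other. For example
    // (illustrative times), a click at 12:01 and an alert at 12:09 both map to
    // the 12:05 mark yet are 8 minutes apart, so an exact timestamp check is
    // still needed below.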
    // Verify that the matched events are true matches (i.e. the timestamps
    // are really less than or equal to 5 minutes apart)
    JavaPairRDD<CorrelatedEvents, Void> matches = potentialMatches.flatMapToPair(
        new PairFlatMapFunction<Tuple2<List<StandardEvent>, List<StandardEvent>>, CorrelatedEvents, Void>() {
          @Override
          public Iterable<Tuple2<CorrelatedEvents, Void>> call(
              Tuple2<List<StandardEvent>, List<StandardEvent>> t) throws Exception {
            List<Tuple2<CorrelatedEvents, Void>> results =
                new ArrayList<Tuple2<CorrelatedEvents, Void>>();
            List<StandardEvent> alerts = t._1();
            List<StandardEvent> clicks = t._2();

            for (StandardEvent alert : alerts) {
              List<StandardEvent> correlated = new ArrayList<StandardEvent>();
              for (StandardEvent click : clicks) {
                if (Math.abs(alert.getTimestamp() - click.getTimestamp()) <= FIVE_MIN_MILLIS) {
                  correlated.add(click);
                }
              }
              if (!correlated.isEmpty()) {
                results.add(new Tuple2<CorrelatedEvents, Void>(
                    CorrelatedEvents.newBuilder()
                        .setEvent(alert)
                        .setCorrelated(correlated)
                        .build(),
                    null));
              }
            }

            return results;
          }
        });

    // Write the data to a Kite dataset. The output path is a placeholder; the
    // target dataset comes from the DatasetKeyOutputFormat configuration above.
    matches.saveAsNewAPIHadoopFile("dummy", CorrelatedEvents.class, Void.class,
        DatasetKeyOutputFormat.class, conf);
  }

  // Round the timestamp down to the nearest 5 minute mark before the event
  private static long createLoTimestamp(long timestamp) {
    return timestamp - (timestamp % FIVE_MIN_MILLIS);
  }

  // Round the timestamp up to the nearest 5 minute mark after the event
  private static long createHiTimestamp(long timestamp) {
    return timestamp - (timestamp % FIVE_MIN_MILLIS) + FIVE_MIN_MILLIS;
  }

  private static class CorrelationKey implements Serializable {

    Long timeStamp;
    String ip;

    public CorrelationKey(Long timeStamp, String ip) {
      this.timeStamp = timeStamp;
      this.ip = ip;
    }

    public String getIp() {
      return ip;
    }

    public void setIp(String ip) {
      this.ip = ip;
    }

    public Long getTimeStamp() {
      return timeStamp;
    }

    public void setTimeStamp(Long timeStamp) {
      this.timeStamp = timeStamp;
    }

    @Override
    public boolean equals(Object obj) {
      if (obj == null) {
        return false;
      }
      if (getClass() != obj.getClass()) {
        return false;
      }
      final CorrelationKey other = (CorrelationKey) obj;
      return Objects.equal(this.timeStamp, other.timeStamp)
          && Objects.equal(this.ip, other.ip);
    }

    @Override
    public int hashCode() {
      return Objects.hashCode(timeStamp, ip);
    }
  }
}
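To launch the task you also need a small driver that supplies the two dataset URIs. The sketch below is a minimal, hypothetical example: the CorrelateEventsApp class name and the dataset URIs are placeholders, not part of the class above, and they assume the events and correlated_events datasets already exist.

package org.kitesdk.examples.spark;

/*
 * Hypothetical driver for CorrelateEventsTask (a sketch, not part of the
 * original example). The dataset URIs are placeholders; substitute the URIs
 * of your own datasets.
 */
public class CorrelateEventsApp {

  public static void main(String[] args) throws Exception {
    // Placeholder Kite dataset URIs
    String eventsUri = "dataset:hdfs:/tmp/data/events";
    String correlatedEventsUri = "dataset:hdfs:/tmp/data/correlated_events";

    new CorrelateEventsTask(eventsUri, correlatedEventsUri).run();
  }
}

Packaged into a job jar, a driver like this would typically be launched with spark-submit --class org.kitesdk.examples.spark.CorrelateEventsApp, which supplies the Spark master and other settings that the task's SparkConf does not hard-code.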