List of usage examples for org.apache.spark.api.java.function.MapGroupsWithStateFunction
MapGroupsWithStateFunction
From source file: gtl.spark.java.example.apache.sql.streaming.JavaStructuredSessionization.java
License: Apache License
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: JavaStructuredSessionization <hostname> <port>"); System.exit(1);//from www. ja v a 2 s. co m } String host = args[0]; int port = Integer.parseInt(args[1]); SparkSession spark = SparkSession.builder().appName("JavaStructuredSessionization").getOrCreate(); // Create DataFrame representing the stream of input lines from connection to host:port Dataset<Row> lines = spark.readStream().format("socket").option("host", host).option("port", port) .option("includeTimestamp", true).load(); FlatMapFunction<LineWithTimestamp, Event> linesToEvents = new FlatMapFunction<LineWithTimestamp, Event>() { @Override public Iterator<Event> call(LineWithTimestamp lineWithTimestamp) throws Exception { ArrayList<Event> eventList = new ArrayList<Event>(); for (String word : lineWithTimestamp.getLine().split(" ")) { eventList.add(new Event(word, lineWithTimestamp.getTimestamp())); } return eventList.iterator(); } }; // Split the lines into words, treat words as sessionId of events Dataset<Event> events = lines.withColumnRenamed("value", "line").as(Encoders.bean(LineWithTimestamp.class)) .flatMap(linesToEvents, Encoders.bean(Event.class)); // Sessionize the events. Track number of events, start and end timestamps of session, and // and report session updates. 
// // Step 1: Define the state update function MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate> stateUpdateFunc = new MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate>() { @Override public SessionUpdate call(String sessionId, Iterator<Event> events, GroupState<SessionInfo> state) throws Exception { // If timed out, then remove session and send final update if (state.hasTimedOut()) { SessionUpdate finalUpdate = new SessionUpdate(sessionId, state.get().calculateDuration(), state.get().getNumEvents(), true); state.remove(); return finalUpdate; } else { // Find max and min timestamps in events long maxTimestampMs = Long.MIN_VALUE; long minTimestampMs = Long.MAX_VALUE; int numNewEvents = 0; while (events.hasNext()) { Event e = events.next(); long timestampMs = e.getTimestamp().getTime(); maxTimestampMs = Math.max(timestampMs, maxTimestampMs); minTimestampMs = Math.min(timestampMs, minTimestampMs); numNewEvents += 1; } SessionInfo updatedSession = new SessionInfo(); // Update start and end timestamps in session if (state.exists()) { SessionInfo oldSession = state.get(); updatedSession.setNumEvents(oldSession.numEvents + numNewEvents); updatedSession.setStartTimestampMs(oldSession.startTimestampMs); updatedSession.setEndTimestampMs(Math.max(oldSession.endTimestampMs, maxTimestampMs)); } else { updatedSession.setNumEvents(numNewEvents); updatedSession.setStartTimestampMs(minTimestampMs); updatedSession.setEndTimestampMs(maxTimestampMs); } state.update(updatedSession); // Set timeout such that the session will be expired if no data received for 10 seconds state.setTimeoutDuration("10 seconds"); return new SessionUpdate(sessionId, state.get().calculateDuration(), state.get().getNumEvents(), false); } } }; // Step 2: Apply the state update function to the events streaming Dataset grouped by sessionId Dataset<SessionUpdate> sessionUpdates = events.groupByKey(new MapFunction<Event, String>() { @Override public String call(Event event) 
throws Exception { return event.getSessionId(); } }, Encoders.STRING()).mapGroupsWithState(stateUpdateFunc, Encoders.bean(SessionInfo.class), Encoders.bean(SessionUpdate.class), GroupStateTimeout.ProcessingTimeTimeout()); // Start running the query that prints the session updates to the console StreamingQuery query = sessionUpdates.writeStream().outputMode("update").format("console").start(); query.awaitTermination(); }