Example usage for org.apache.spark.api.java.function MapGroupsWithStateFunction MapGroupsWithStateFunction

List of usage examples for org.apache.spark.api.java.function MapGroupsWithStateFunction MapGroupsWithStateFunction

Introduction

On this page you can find example usage of MapGroupsWithStateFunction from the org.apache.spark.api.java.function package.

Prototype

MapGroupsWithStateFunction

Source Link

Usage

From source file: gtl.spark.java.example.apache.sql.streaming.JavaStructuredSessionization.java

License: Apache License

/**
 * Reads text lines from a socket, splits each line into words, treats every word as a
 * session id, and prints per-session updates (event count, duration, expiry flag) to the
 * console until the streaming query terminates.
 *
 * @param args {@code args[0]} is the host and {@code args[1]} the port of the socket source
 * @throws Exception if anything thrown while setting up or running the query propagates
 */
public static void main(String[] args) throws Exception {
    // Both the host and the port of the socket source are required.
    if (args.length < 2) {
        System.err.println("Usage: JavaStructuredSessionization <hostname> <port>");
        System.exit(1);
    }

    final String host = args[0];
    final int port = Integer.parseInt(args[1]);

    SparkSession spark = SparkSession.builder().appName("JavaStructuredSessionization").getOrCreate();

    // Streaming DataFrame of the input lines read from the connection to host:port
    Dataset<Row> lines = spark.readStream()
            .format("socket")
            .option("host", host)
            .option("port", port)
            .option("includeTimestamp", true)
            .load();

    // Produces one Event per space-separated token of a line, carrying the line's timestamp
    FlatMapFunction<LineWithTimestamp, Event> splitIntoEvents = new FlatMapFunction<LineWithTimestamp, Event>() {
        @Override
        public Iterator<Event> call(LineWithTimestamp input) throws Exception {
            String[] words = input.getLine().split(" ");
            ArrayList<Event> result = new ArrayList<Event>();
            for (int i = 0; i < words.length; i++) {
                result.add(new Event(words[i], input.getTimestamp()));
            }
            return result.iterator();
        }
    };

    // The "value" column is renamed to "line" before the rows are read as LineWithTimestamp
    // beans; every word of a line then becomes an event whose session id is that word.
    Dataset<LineWithTimestamp> timestampedLines = lines
            .withColumnRenamed("value", "line")
            .as(Encoders.bean(LineWithTimestamp.class));
    Dataset<Event> events = timestampedLines.flatMap(splitIntoEvents, Encoders.bean(Event.class));

    // Sessionize the events: track the number of events plus the start and end timestamps
    // of each session, and report session updates.
    //
    // Step 1: the per-session state update function
    MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate> stateUpdateFunc = new MapGroupsWithStateFunction<String, Event, SessionInfo, SessionUpdate>() {
        @Override
        public SessionUpdate call(String key, Iterator<Event> newEvents, GroupState<SessionInfo> state)
                throws Exception {
            // A timed-out session is dropped from the state store after one last update
            // flagged as expired.
            if (state.hasTimedOut()) {
                long duration = state.get().calculateDuration();
                int eventCount = state.get().getNumEvents();
                SessionUpdate expiredUpdate = new SessionUpdate(key, duration, eventCount, true);
                state.remove();
                return expiredUpdate;
            }

            // Scan the new events once for their count and timestamp range
            long latestMs = Long.MIN_VALUE;
            long earliestMs = Long.MAX_VALUE;
            int addedEvents = 0;
            while (newEvents.hasNext()) {
                long eventMs = newEvents.next().getTimestamp().getTime();
                if (eventMs > latestMs) {
                    latestMs = eventMs;
                }
                if (eventMs < earliestMs) {
                    earliestMs = eventMs;
                }
                addedEvents++;
            }

            SessionInfo merged = new SessionInfo();
            if (!state.exists()) {
                // First batch for this session: the new events define it entirely
                merged.setStartTimestampMs(earliestMs);
                merged.setEndTimestampMs(latestMs);
                merged.setNumEvents(addedEvents);
            } else {
                // Existing session: keep its start, extend its end, add to its count
                SessionInfo previous = state.get();
                long extendedEndMs = previous.endTimestampMs;
                if (latestMs > extendedEndMs) {
                    extendedEndMs = latestMs;
                }
                merged.setStartTimestampMs(previous.startTimestampMs);
                merged.setEndTimestampMs(extendedEndMs);
                merged.setNumEvents(previous.numEvents + addedEvents);
            }
            state.update(merged);

            // Expire the session when no data arrives for 10 seconds
            state.setTimeoutDuration("10 seconds");

            long duration = state.get().calculateDuration();
            int eventCount = state.get().getNumEvents();
            return new SessionUpdate(key, duration, eventCount, false);
        }
    };

    // Step 2: group the event stream by session id and apply the state update function
    MapFunction<Event, String> sessionIdOf = new MapFunction<Event, String>() {
        @Override
        public String call(Event e) throws Exception {
            return e.getSessionId();
        }
    };
    Dataset<SessionUpdate> sessionUpdates = events
            .groupByKey(sessionIdOf, Encoders.STRING())
            .mapGroupsWithState(
                    stateUpdateFunc,
                    Encoders.bean(SessionInfo.class),
                    Encoders.bean(SessionUpdate.class),
                    GroupStateTimeout.ProcessingTimeTimeout());

    // Start the query that prints session updates to the console, then block until it ends
    StreamingQuery query = sessionUpdates.writeStream()
            .outputMode("update")
            .format("console")
            .start();

    query.awaitTermination();
}