com.google.cloud.sparkdemo.HourlyTeamScore.java Source code

Introduction

Here is the source code for com.google.cloud.sparkdemo.HourlyTeamScore.java
Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 * This is not an official Google product.
 */

package com.google.cloud.sparkdemo;

import com.google.gson.JsonObject;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.joda.time.DateTimeZone;
import org.joda.time.Duration;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import scala.Tuple2;

import java.util.TimeZone;

/**
 * Batch Spark job that sums game scores per team over fixed time windows and writes the
 * per-window totals out through the Hadoop output configured by {@code configureBigQueryOutput}.
 *
 * <p>Events are read from the text input named by the options, restricted to the half-open
 * range {@code [startMin, stopMin)}, bucketed into windows of {@code windowDuration} minutes and
 * summed per {@code <team, window_start>} key.
 */
public class HourlyTeamScore extends UserScore {
    // Zone in which startMin/stopMin are interpreted and window starts are printed. Joda-Time
    // converts the legacy three-letter id "PST" to this same zone, so naming it directly keeps
    // the behavior while avoiding the deprecated java.util.TimeZone abbreviation.
    private static final DateTimeZone PACIFIC_ZONE = DateTimeZone.forID("America/Los_Angeles");

    /** Formats a window start (epoch milliseconds) for the {@code window_start} output column. */
    public static final DateTimeFormatter timestampFormatter =
            DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS").withZone(PACIFIC_ZONE);

    /** Parses the minute-granularity {@code startMin} / {@code stopMin} option values. */
    private static final DateTimeFormatter timestampParser =
            DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm").withZone(PACIFIC_ZONE);

    /**
     * Command-line options for {@link HourlyTeamScore}: the options registered by
     * {@code UserScoreOptions} plus the window size and the time range to analyze.
     */
    public static class HourlyTeamScoreOptions extends UserScoreOptions {
        public HourlyTeamScoreOptions() {
            options.addOption("windowDuration", true, "Numeric value of fixed window duration, in minutes");
            // The help fragments are concatenated, so each one carries its own trailing space.
            options.addOption("startMin", true,
                    "String representation of the first minute for which to generate results, "
                            + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST. "
                            + "Any input data timestamped prior to that minute won't be included in the sums.");
            options.addOption("stopMin", true,
                    "String representation of the first minute for which to not generate results, "
                            + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST. "
                            + "Any input data timestamped after that minute won't be included in the sums.");
        }

        /** @return the {@code startMin} option, or {@code 1970-01-01-00-00} as the fallback. */
        public String getStartMin() {
            return getOptionWithDefaultValue("startMin", "1970-01-01-00-00");
        }

        /** @return the {@code stopMin} option, or {@code 2100-01-01-00-00} as the fallback. */
        public String getStopMin() {
            return getOptionWithDefaultValue("stopMin", "2100-01-01-00-00");
        }

        /**
         * @return the {@code windowDuration} option in minutes, with {@code 60} as the fallback
         * @throws NumberFormatException if the supplied value is not a valid long
         */
        public Long getWindowDuration() {
            return Long.valueOf(getOptionWithDefaultValue("windowDuration", "60"));
        }

        /** @return the {@code tableName} option, or {@code Hourly} as the fallback. */
        public String getTableName() {
            return getOptionWithDefaultValue("tableName", "Hourly");
        }

        /** @return the output table schema: team, total_score and window_start columns. */
        public String getTableSchema() {
            return "[" + "{'name': 'team','type': 'STRING'}," + "{'name': 'total_score','type': 'INTEGER'},"
                    + "{'name': 'window_start','type': 'STRING'}" + "]";
        }
    }

    /**
     * A value paired with a timestamp, stored as a {@code Tuple2<T, Long>}.
     *
     * @param <T> type of the wrapped value
     */
    public static class WithTimestamp<T> extends Tuple2<T, Long> {
        WithTimestamp(T val, Long timestamp) {
            super(val, timestamp);
        }

        /** @return the wrapped value (first tuple element). */
        T val() {
            return _1();
        }

        /** @return the attached timestamp (second tuple element). */
        Long timestamp() {
            return _2();
        }

        /** Static factory equivalent to the constructor. */
        public static <T> WithTimestamp<T> create(T val, Long timestamp) {
            return new WithTimestamp<>(val, timestamp);
        }
    }

    // Converts <<team, window_start>, score> tuples into JSON objects for the output step.
    // The pair key is deliberately left null; only the JSON value carries data
    // (presumably the output format ignores the key - confirm against the BigQuery connector).
    private static final PairFunction<Tuple2<WithTimestamp<String>, Integer>, String, JsonObject> convertToJson = (
            pairs) -> {
        JsonObject jsonObject = new JsonObject();
        jsonObject.addProperty("team", pairs._1().val());
        jsonObject.addProperty("total_score", pairs._2());
        jsonObject.addProperty("window_start", timestampFormatter.print(pairs._1().timestamp()));
        return new Tuple2<>(null, jsonObject);
    };

    /**
     * Run a batch pipeline.
     *
     * @param args command-line arguments understood by {@link HourlyTeamScoreOptions}
     * @throws IllegalArgumentException if the window duration is not positive
     * @throws Exception if option parsing or the Spark job fails
     **/
    public static void main(String[] args) throws Exception {
        HourlyTeamScoreOptions options = new HourlyTeamScoreOptions();
        options.parse(args);

        // Resolve and validate the time parameters before acquiring any cluster resources.
        final long startMinTimestamp = timestampParser.parseMillis(options.getStartMin());
        final long stopMinTimestamp = timestampParser.parseMillis(options.getStopMin());
        final long windowDuration = Duration.standardMinutes(options.getWindowDuration()).getMillis();
        if (windowDuration <= 0) {
            // A zero window would otherwise surface as a division by zero inside the executors.
            throw new IllegalArgumentException(
                    "windowDuration must be a positive number of minutes, got: " + options.getWindowDuration());
        }

        SparkConf sc = new SparkConf().setAppName("HourlyTeamScore");
        JavaSparkContext jsc = new JavaSparkContext(sc);
        try {
            Configuration hadoopConf = jsc.hadoopConfiguration();
            configureBigQueryOutput(hadoopConf, options.getProject(), options.getDataset(), options.getTableName(),
                    options.getTableSchema());

            // Run a pipeline to analyze all the data in batch.
            // First, read events from a text file and parse them.
            JavaRDD<GameActionInfo> gameEvents = jsc.textFile(options.getInput()).flatMap(new ParseEventFn())
                    // Filter out data before and after the given times so that it is not included
                    // in the calculations. As we collect data in batches (say, by day), the batch for
                    // the day that we want to analyze could potentially include some late-arriving
                    // data from the previous day. If so, we want to weed it out. Similarly, if we
                    // include data from the following day (to scoop up late-arriving events from the
                    // day we're analyzing), we need to weed out events that fall after the time
                    // period we want to analyze.
                    // startMin is the first minute to report, so the lower bound is inclusive;
                    // stopMin is the first minute NOT to report, so the upper bound is exclusive.
                    .filter((GameActionInfo gInfo) -> gInfo.getTimestamp() >= startMinTimestamp)
                    .filter((GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp);

            JavaPairRDD<WithTimestamp<String>, Integer> hourlyTeamScores = gameEvents.mapToPair(event -> new Tuple2<>(
                    // Extract the composite key as <team, window_start_time>
                    WithTimestamp.create(event.getTeam(),
                            // Apply Fixed Window by rounding the timestamp down to the nearest
                            // multiple of the window size
                            (event.getTimestamp() / windowDuration) * windowDuration),
                    // Extract the scores as values
                    event.getScore()))
                    // Compute the sum of the scores per team per window
                    .reduceByKey(new SumScore());

            // Write to a BigQuery table
            JavaPairRDD<String, JsonObject> jsonPairs = hourlyTeamScores.mapToPair(convertToJson);
            jsonPairs.saveAsNewAPIHadoopDataset(hadoopConf);
        } finally {
            // Always release the Spark context, including when the job fails part-way.
            jsc.stop();
        }
    }
}