org.apache.beam.examples.tutorial.game.Exercise3.java Source code

Introduction

Here is the source code for org.apache.beam.examples.tutorial.game.Exercise3.java
Source

/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package org.apache.beam.examples.tutorial.game;

import org.apache.beam.examples.tutorial.game.GameActionInfo.KeyField;
import org.apache.beam.examples.tutorial.game.utils.ChangeMe;
import org.apache.beam.examples.tutorial.game.utils.ExerciseOptions;
import org.apache.beam.examples.tutorial.game.utils.Input;
import org.apache.beam.examples.tutorial.game.utils.Output;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

import org.joda.time.Duration;

/**
 * This pipeline extends {@link Exercise1} by windowing each of the input events
 * based on the event timestamp.
 *
 * <p>
 * This pipeline processes data collected from gaming events in batch, building
 * on {@link Exercise1} but using fixed windows. It calculates the sum of scores
 * per team, for each window, optionally allowing specification of two
 * timestamps before and after which data is filtered out. This allows a model
 * where late data collected after the intended analysis window can be included,
 * and any late-arriving data prior to the beginning of the analysis window can
 * be removed as well. By using windowing and adding element timestamps, we can
 * do finer-grained analysis than with the {@link Exercise1} pipeline. However,
 * our batch processing is high-latency, in that we don't get results from plays
 * at the beginning of the batch's time period until the batch is processed.
 */
public class Exercise3 {

    /**
     * A transform to compute the WindowedTeamScore.
     */
    public static class WindowedTeamScore
            extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {

        private Duration duration;

        public WindowedTeamScore(Duration duration) {
            this.duration = duration;
        }

        @Override
        public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> input) {
            // [START EXERCISE 3]:
            // JavaDoc: https://cloud.google.com/dataflow/java-sdk/JavaDoc
            // Developer Docs: https://cloud.google.com/dataflow/model/windowing
            //
            return input
                    // Window.into() takes a WindowFn and returns a PTransform that
                    // applies windowing to the PCollection. FixedWindows.of() returns a
                    // WindowFn that assigns elements to windows of a fixed size. Use
                    // these methods to apply fixed windows of size
                    // this.duration to the PCollection.
                    .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
                    // Remember the ExtractAndSumScore PTransform from Exercise 1? We
                    // parameterized it over the KeyField. Use it here to compute the team
                    // scores.
                    .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
            // [END EXERCISE 3]
        }
    }

    /**
     * Run a batch pipeline to do windowed analysis of the data.
     */
    public static void main(String[] args) throws Exception {
        // Begin constructing a pipeline configured by commandline flags.
        ExerciseOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(ExerciseOptions.class);
        Pipeline pipeline = Pipeline.create(options);

        pipeline
                // Read a bounded set of generated data
                .apply(new Input.BoundedGenerator())
                // Extract and sum the windowed teamname/scores
                .apply(new WindowedTeamScore(Duration.standardMinutes(1)))
                // Write the hourly team scores to the "hourly_team_score" table
                .apply(new Output.WriteHourlyTeamScore());

        pipeline.run();
    }

    /**
     * A transform to extract key/score information from GameActionInfo, and sum
     * the scores. The constructor arg determines whether 'team' or 'user' info is
     * extracted.
     */
    private static class ExtractAndSumScore
            extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {

        private final KeyField field;

        ExtractAndSumScore(KeyField field) {
            this.field = field;
        }

        @Override
        public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> gameInfo) {
            return gameInfo
                    .apply(MapElements.via((GameActionInfo gInfo) -> KV.of(field.extract(gInfo), gInfo.getScore()))
                            .withOutputType(
                                    TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())))
                    .apply(Sum.<String>integersPerKey());
        }
    }
}