com.google.cloud.dataflow.tutorials.game.Exercise2.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.dataflow.tutorials.game.Exercise2.java

Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.tutorials.game;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition;
import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.GcpOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.WithTimestamps;
import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.tutorials.game.utils.ChangeMe;
import com.google.cloud.dataflow.tutorials.game.utils.GameEvent;
import com.google.cloud.dataflow.tutorials.game.utils.Options;
import com.google.cloud.dataflow.tutorials.game.utils.ParseEventFn;
import java.util.ArrayList;
import java.util.List;
import org.joda.time.Duration;
import org.joda.time.Instant;

/**
 * Second in a series of coding exercises in a gaming domain.
 *
 * <p>This batch pipeline calculates the sum of scores per team per hour, over an entire batch of
 * gaming data and writes the per-team sums to BigQuery.
 *
 * <p>See README.md for details.
 */
public class Exercise2 {
    /** A transform to compute the WindowedTeamScore. */
    public static class WindowedTeamScore
            extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> {

        private Duration duration;

        public WindowedTeamScore(Duration duration) {
            this.duration = duration;
        }

        @Override
        public PCollection<KV<String, Integer>> apply(PCollection<GameEvent> input) {
            // [START EXERCISE 2]:
            // JavaDoc: https://cloud.google.com/dataflow/java-sdk/JavaDoc
            // Developer Docs: https://cloud.google.com/dataflow/model/windowing
            //
            return input
                    // Window.into() takes a WindowFn and returns a PTransform that
                    // applies windowing to the PCollection. FixedWindows.of() returns a
                    // WindowFn that assigns elements to windows of a fixed size. Use
                    // these methods to apply fixed windows of size
                    // this.duration to the PCollection.
                    .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */)
                    // Remember the ExtractAndSumScore PTransform from Exercise 1? We
                    // parameterized it over the key field. Use it here to compute the "team"
                    // scores.
                    .apply(new ChangeMe<>() /* TODO: YOUR CODE GOES HERE */);
            // [END EXERCISE 2]
        }
    }

    /** Format a KV of team and their score to a BigQuery TableRow. */
    public static class FormatTeamScoreSumsFn extends DoFn<KV<String, Integer>, TableRow>
            implements RequiresWindowAccess {
        @Override
        public void processElement(ProcessContext c) {
            TableRow row = new TableRow().set("team", c.element().getKey())
                    .set("total_score", c.element().getValue())
                    .set("window_start", ((IntervalWindow) c.window()).start().getMillis() / 1000);
            c.output(row);
        }

        /** Defines the BigQuery schema. */
        public static TableSchema getSchema() {
            List<TableFieldSchema> fields = new ArrayList<>();
            fields.add(new TableFieldSchema().setName("team").setType("STRING"));
            fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER"));
            fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP"));
            return new TableSchema().setFields(fields);
        }
    }

    /** Run a batch pipeline. */
    public static void main(String[] args) throws Exception {
        Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
        Pipeline pipeline = Pipeline.create(options);

        TableReference tableRef = new TableReference();
        tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
        tableRef.setProjectId(options.as(GcpOptions.class).getProject());
        tableRef.setTableId(options.getOutputTableName());

        // Read events from a CSV file and parse them.
        pipeline.apply(TextIO.Read.from(options.getInput()))
                .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
                .apply("AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())))
                .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60)))
                // Write the results to BigQuery.
                .apply(ParDo.named("FormatTeamScoreSums").of(new FormatTeamScoreSumsFn()))
                .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatTeamScoreSumsFn.getSchema())
                        .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                        .withWriteDisposition(WriteDisposition.WRITE_APPEND));

        pipeline.run();
    }
}