Java tutorial
/* * Copyright (C) 2015 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.google.cloud.dataflow.tutorials.game; import com.google.api.services.bigquery.model.TableFieldSchema; import com.google.api.services.bigquery.model.TableReference; import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import com.google.cloud.dataflow.sdk.Pipeline; import com.google.cloud.dataflow.sdk.PipelineResult; import com.google.cloud.dataflow.sdk.io.BigQueryIO; import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.CreateDisposition; import com.google.cloud.dataflow.sdk.io.BigQueryIO.Write.WriteDisposition; import com.google.cloud.dataflow.sdk.options.Default; import com.google.cloud.dataflow.sdk.options.Description; import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory; import com.google.cloud.dataflow.sdk.options.StreamingOptions; import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner; import com.google.cloud.dataflow.sdk.transforms.DoFn; import com.google.cloud.dataflow.sdk.transforms.DoFn.RequiresWindowAccess; import com.google.cloud.dataflow.sdk.transforms.PTransform; import com.google.cloud.dataflow.sdk.transforms.ParDo; import com.google.cloud.dataflow.sdk.transforms.windowing.IntervalWindow; import com.google.cloud.dataflow.sdk.values.KV; import com.google.cloud.dataflow.sdk.values.PCollection; import com.google.cloud.dataflow.tutorials.game.solutions.Exercise1; import com.google.cloud.dataflow.tutorials.game.solutions.Exercise3; import com.google.cloud.dataflow.tutorials.game.utils.GameEvent; import com.google.cloud.dataflow.tutorials.game.utils.Options; import com.google.common.annotations.VisibleForTesting; import java.util.ArrayList; import java.util.List; import org.joda.time.Duration; import org.joda.time.Instant; /** * Fourth in a series of coding exercises in a gaming domain. * * <p>This streaming pipeline calculates user and team scores for a window of time and writes them * to BigQuery. * * <p>See README.md for details. */ public class Exercise4 { static final Duration TEN_SECONDS = Duration.standardSeconds(10); static final Duration THIRTY_SECONDS = Duration.standardSeconds(30); /** Exercise4Options supported by {@link Exercise4}. */ interface Exercise4Options extends Options, StreamingOptions { @Description("Numeric value of fixed window duration for team analysis, in minutes") @Default.Integer(1) Integer getTeamWindowDuration(); void setTeamWindowDuration(Integer value); @Description("Numeric value of allowed data lateness, in minutes") @Default.Integer(2) Integer getAllowedLateness(); void setAllowedLateness(Integer value); } /** * Extract user/score pairs from the event stream using processing time, via global windowing. Get * periodic updates on all users' running scores. */ @VisibleForTesting static class CalculateUserScores extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> { private final Duration allowedLateness; CalculateUserScores(Duration allowedLateness) { this.allowedLateness = allowedLateness; } @Override public PCollection<KV<String, Integer>> apply(PCollection<GameEvent> input) { // [START EXERCISE 4 PART 1]: // JavaDoc: https://cloud.google.com/dataflow/java-sdk/JavaDoc // Developer Docs: https://cloud.google.com/dataflow/model/par-do // // Fill in the code to: // 1. Window the incoming input into global windows // 2. trigger every thirty seconds to emit speculative results. return input /* TODO: SOLUTION CODE HERE */ // Extract and sum username/score pairs from the event data. .apply("ExtractUserScore", new Exercise1.ExtractAndSumScore("user")); // [END EXERCISE 4 PART 1]: } } /** Calculates scores for each team within the configured window duration. */ // Extract team/score pairs from the event stream, using hour-long windows by default. @VisibleForTesting static class CalculateTeamScores extends PTransform<PCollection<GameEvent>, PCollection<KV<String, Integer>>> { private final Duration teamWindowDuration; private final Duration allowedLateness; CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) { this.teamWindowDuration = teamWindowDuration; this.allowedLateness = allowedLateness; } @Override public PCollection<KV<String, Integer>> apply(PCollection<GameEvent> infos) { // [START EXERCISE 4 PART 2]: // JavaDoc: https://cloud.google.com/dataflow/java-sdk/JavaDoc // Developer Docs: https://cloud.google.com/dataflow/model/par-do // // Fill in the code to: // 1. Window the incoming input into fixed windows of team window duration // 2. trigger on time results at the watermark // 3. trigger speculative results every ten seconds // 4. trigger late data results with a delay of thirty seconds return infos /* TODO: SOLUTION CODE HERE */ // Extract and sum teamname/score pairs from the event data. .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team")); // [END EXERCISE 4 PART 2]: } } public static void main(String[] args) throws Exception { Exercise4Options options = PipelineOptionsFactory.fromArgs(args).withValidation() .as(Exercise4Options.class); // Enforce that this pipeline is always run in streaming mode. options.setStreaming(true); // For example purposes, allow the pipeline to be easily cancelled instead of running // continuously. options.setRunner(DataflowPipelineRunner.class); Pipeline pipeline = Pipeline.create(options); TableReference teamTable = new TableReference(); teamTable.setDatasetId(options.getOutputDataset()); teamTable.setProjectId(options.getProject()); teamTable.setTableId(options.getOutputTableName() + "_team"); TableReference userTable = new TableReference(); userTable.setDatasetId(options.getOutputDataset()); userTable.setProjectId(options.getProject()); userTable.setTableId(options.getOutputTableName() + "_user"); PCollection<GameEvent> gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options)); gameEvents .apply("CalculateTeamScores", new CalculateTeamScores(Duration.standardMinutes(options.getTeamWindowDuration()), Duration.standardMinutes(options.getAllowedLateness()))) // Write the results to BigQuery. .apply(ParDo.named("FormatTeamScores").of(new FormatTeamScoreFn())) .apply(BigQueryIO.Write.to(teamTable).withSchema(FormatTeamScoreFn.getSchema()) .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(WriteDisposition.WRITE_APPEND)); gameEvents .apply("CalculateUserScores", new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness()))) // Write the results to BigQuery. .apply(ParDo.named("FormatUserScores").of(new FormatUserScoreFn())) .apply(BigQueryIO.Write.to(userTable).withSchema(FormatUserScoreFn.getSchema()) .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(WriteDisposition.WRITE_APPEND)); // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the // command line. PipelineResult result = pipeline.run(); } /** Format a KV of team and associated properties to a BigQuery TableRow. */ protected static class FormatTeamScoreFn extends DoFn<KV<String, Integer>, TableRow> implements RequiresWindowAccess { @Override public void processElement(ProcessContext c) { TableRow row = new TableRow().set("team", c.element().getKey()) .set("total_score", c.element().getValue()) .set("window_start", ((IntervalWindow) c.window()).start().getMillis() / 1000) .set("processing_time", Instant.now().getMillis() / 1000) .set("timing", c.pane().getTiming().toString()); c.output(row); } static TableSchema getSchema() { List<TableFieldSchema> fields = new ArrayList<>(); fields.add(new TableFieldSchema().setName("team").setType("STRING")); fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); fields.add(new TableFieldSchema().setName("window_start").setType("TIMESTAMP")); fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); fields.add(new TableFieldSchema().setName("timing").setType("STRING")); return new TableSchema().setFields(fields); } } /** Format a KV of user and associated properties to a BigQuery TableRow. */ static class FormatUserScoreFn extends DoFn<KV<String, Integer>, TableRow> { @Override public void processElement(ProcessContext c) { TableRow row = new TableRow().set("user", c.element().getKey()) .set("total_score", c.element().getValue()) .set("processing_time", Instant.now().getMillis() / 1000); c.output(row); } static TableSchema getSchema() { List<TableFieldSchema> fields = new ArrayList<>(); fields.add(new TableFieldSchema().setName("user").setType("STRING")); fields.add(new TableFieldSchema().setName("total_score").setType("INTEGER")); fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); return new TableSchema().setFields(fields); } } }