com.google.cloud.training.dataanalyst.sandiego.AverageSpeeds.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.training.dataanalyst.sandiego.AverageSpeeds.java

Source

/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.training.dataanalyst.sandiego;

import java.util.ArrayList;
import java.util.List;

import org.apache.beam.runners.dataflow.options.DataflowPipelineOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.PubsubIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Mean;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.appengine.repackaged.org.joda.time.Instant;

/**
 * A streaming Dataflow pipeline that computes the average speed reported for
 * each sensor key over a sliding time window and appends the averages to a
 * BigQuery table.
 *
 * <p>
 * Messages are read from the {@code sandiego} Pub/Sub topic of the configured
 * project, converted to {@code LaneInfo} objects, averaged per sensor key and
 * written to the {@code demos.average_speeds} table of the same project.
 *
 * @author vlakshmanan
 *
 */
public class AverageSpeeds {

    /**
     * Command-line options understood by this pipeline, in addition to the
     * standard Dataflow options.
     */
    public static interface MyOptions extends DataflowPipelineOptions {
        /**
         * @return length of the averaging period in minutes; defaults to 60
         */
        @Description("Over how long a time period should we average? (in minutes)")
        @Default.Double(60.0)
        Double getAveragingInterval();

        void setAveragingInterval(Double d);

        /**
         * @return factor by which the incoming stream is sped up; defaults to
         *         60, use 1.0 for no speedup
         */
        @Description("Simulation speedup factor. Use 1.0 if no speedup")
        @Default.Double(60.0)
        Double getSpeedupFactor();

        void setSpeedupFactor(Double d);
    }

    /**
     * Converts the requested averaging period into the window length to use on
     * the sped-up stream.
     *
     * @param averagingMinutes
     *            averaging period in minutes; must be a finite number greater
     *            than zero
     * @param speedupFactor
     *            simulation speedup factor; must be a finite number greater
     *            than zero
     * @return the window length, at least 2 ms long so that half of it is still
     *         a positive sliding period
     * @throws IllegalArgumentException
     *             if either argument is missing, not finite or not positive, or
     *             if the resulting window is shorter than 2 ms
     */
    private static Duration scaledAveragingInterval(Double averagingMinutes, Double speedupFactor) {
        if (averagingMinutes == null || speedupFactor == null) {
            throw new IllegalArgumentException("averagingInterval and speedupFactor must both be set");
        }
        // "!(x > 0)" instead of "x <= 0" so that NaN is rejected as well
        if (!(averagingMinutes > 0.0) || Double.isInfinite(averagingMinutes)) {
            throw new IllegalArgumentException(
                    "averagingInterval must be a positive, finite number of minutes: " + averagingMinutes);
        }
        if (!(speedupFactor > 0.0) || Double.isInfinite(speedupFactor)) {
            throw new IllegalArgumentException(
                    "speedupFactor must be a positive, finite number: " + speedupFactor);
        }
        long millis = Math.round(1000 * 60 * (averagingMinutes / speedupFactor));
        if (millis < 2) {
            // the sliding period is half the window and must not round down to zero
            throw new IllegalArgumentException("averagingInterval / speedupFactor yields a window of " + millis
                    + " ms; at least 2 ms is required");
        }
        return Duration.millis(millis);
    }

    /** Creates one column definition for the output table. */
    private static TableFieldSchema column(String name, String type) {
        return new TableFieldSchema().setName(name).setType(type);
    }

    /** Builds the schema of the BigQuery table that receives the averages. */
    private static TableSchema outputSchema() {
        List<TableFieldSchema> fields = new ArrayList<>();
        fields.add(column("timestamp", "TIMESTAMP"));
        fields.add(column("latitude", "FLOAT"));
        fields.add(column("longitude", "FLOAT"));
        fields.add(column("highway", "STRING"));
        fields.add(column("direction", "STRING"));
        fields.add(column("lane", "INTEGER"));
        fields.add(column("speed", "FLOAT"));
        fields.add(column("sensorId", "STRING"));
        return new TableSchema().setFields(fields);
    }

    /**
     * Builds the streaming pipeline from the command-line options and submits
     * it.
     *
     * @param args
     *            pipeline options, see {@link MyOptions}
     * @throws IllegalArgumentException
     *             if the averaging interval or speedup factor do not yield a
     *             usable window length
     */
    @SuppressWarnings("serial")
    public static void main(String[] args) {
        MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);
        options.setStreaming(true);
        Pipeline p = Pipeline.create(options);

        String topic = "projects/" + options.getProject() + "/topics/sandiego";
        String avgSpeedTable = options.getProject() + ":demos.average_speeds";

        // if we need to average over 60 minutes and speedup is 30x
        // then we need to average over 2 minutes of sped-up stream
        Duration averagingInterval = scaledAveragingInterval(options.getAveragingInterval(),
                options.getSpeedupFactor());
        // a new window starts every half interval, i.e. 2 times in each window
        Duration averagingFrequency = averagingInterval.dividedBy(2);
        System.out.println("Averaging interval = " + averagingInterval);
        System.out.println("Averaging freq = " + averagingFrequency);

        // Build the table schema for the output table.
        TableSchema schema = outputSchema();

        // Read raw messages, assign them to sliding windows and convert each one
        // to a LaneInfo.
        PCollection<LaneInfo> laneInfo = p //
                .apply("GetMessages", PubsubIO.<String>read().topic(topic).withCoder(StringUtf8Coder.of())) //
                .apply("TimeWindow", Window.into(SlidingWindows//
                        .of(averagingInterval)//
                        .every(averagingFrequency))) //
                .apply("ExtractData", ParDo.of(new DoFn<String, LaneInfo>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                        String line = c.element();
                        c.output(LaneInfo.newLaneInfo(line));
                    }
                }));

        // Key every reading by its sensor and take the mean speed per key and
        // window.
        PCollection<KV<String, Double>> avgSpeed = laneInfo //
                .apply("BySensor", ParDo.of(new DoFn<LaneInfo, KV<String, Double>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) throws Exception {
                        LaneInfo info = c.element();
                        String key = info.getSensorKey();
                        Double speed = info.getSpeed();
                        c.output(KV.of(key, speed));
                    }
                })) //
                .apply("AvgBySensor", Mean.perKey());

        avgSpeed.apply("ToBQRow", ParDo.of(new DoFn<KV<String, Double>, TableRow>() {
            @ProcessElement
            public void processElement(ProcessContext c) throws Exception {
                TableRow row = new TableRow();
                String stationKey = c.element().getKey();
                Double speed = c.element().getValue();
                // Joda-Time is named explicitly so that this block does not rely on
                // the App Engine "repackaged" copy of Instant imported by the file.
                // NOTE(review): the row is stamped with the current wall-clock time,
                // not with a time taken from the window - confirm this is intended.
                // NOTE(review): presumably the sensor key already holds the
                // comma-separated location fields LaneInfo expects between the
                // timestamp and the speed - verify against LaneInfo.
                String line = org.joda.time.Instant.now().toString() + "," + stationKey + "," + speed; // CSV
                LaneInfo info = LaneInfo.newLaneInfo(line);
                row.set("timestamp", info.getTimestamp());
                row.set("latitude", info.getLatitude());
                row.set("longitude", info.getLongitude());
                row.set("highway", info.getHighway());
                row.set("direction", info.getDirection());
                row.set("lane", info.getLane());
                row.set("speed", info.getSpeed());
                row.set("sensorId", info.getSensorKey());
                c.output(row);
            }
        })) //
                .apply(BigQueryIO.Write.to(avgSpeedTable)//
                        .withSchema(schema)//
                        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
                        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

        p.run();
    }
}