com.google.cloud.dataflow.examples.WindowedWordCount.java Source code

Introduction

Here is the source code for com.google.cloud.dataflow.examples.WindowedWordCount.java

Source

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.examples;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.dataflow.examples.common.DataflowExampleOptions;
import com.google.cloud.dataflow.examples.common.DataflowExampleUtils;
import com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions;
import com.google.cloud.dataflow.examples.common.ExamplePubsubTopicOptions;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.PipelineResult;
import com.google.cloud.dataflow.sdk.io.BigQueryIO;
import com.google.cloud.dataflow.sdk.io.PubsubIO;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;

import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * An example that counts words in text, and can run over either unbounded or bounded input
 * collections.
 *
 * <p>This class, {@link WindowedWordCount}, is the last in a series of four successively more
 * detailed 'word count' examples. First take a look at {@link MinimalWordCount},
 * {@link WordCount}, and {@link DebuggingWordCount}.
 *
 * <p>Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples:
 * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally
 * and using the Dataflow service; defining DoFns; creating a custom aggregator;
 * user-defined PTransforms; defining PipelineOptions.
 *
 * <p>New Concepts:
 * <pre>
 *   1. Unbounded and bounded pipeline input modes
 *   2. Adding timestamps to data
 *   3. PubSub topics as sources
 *   4. Windowing
 *   5. Re-using PTransforms over windowed PCollections
 *   6. Writing to BigQuery
 * </pre>
 *
 * <p>To execute this pipeline locally, specify general pipeline configuration:
 * <pre>{@code
 *   --project=YOUR_PROJECT_ID
 * }
 * </pre>
 *
 * <p>To execute this pipeline using the Dataflow service, specify pipeline configuration:
 * <pre>{@code
 *   --project=YOUR_PROJECT_ID
 *   --stagingLocation=gs://YOUR_STAGING_DIRECTORY
 *   --runner=BlockingDataflowPipelineRunner
 * }
 * </pre>
 *
 * <p>Optionally specify the input file path via:
 * {@code --inputFile=gs://INPUT_PATH},
 * which defaults to {@code gs://dataflow-samples/shakespeare/kinglear.txt}.
 *
 * <p>Specify an output BigQuery dataset and, optionally, a table for the output. If you don't
 * specify the table, one will be created for you using the job name. If you don't specify the
 * dataset, a dataset called {@code dataflow-examples} must already exist in your project.
 * {@code --bigQueryDataset=YOUR-DATASET --bigQueryTable=YOUR-NEW-TABLE-NAME}.
 *
 * <p>Decide whether you want your pipeline to run with 'bounded' (such as files in GCS) or
 * 'unbounded' input (such as a PubSub topic). To run with unbounded input, set
 * {@code --unbounded=true}. Then, optionally specify the Google Cloud PubSub topic to read from
 * via {@code --pubsubTopic=projects/PROJECT_ID/topics/YOUR_TOPIC_NAME}. If the topic does not
 * exist, the pipeline will create one for you. It will delete this topic when it terminates.
 * The pipeline will automatically launch an auxiliary batch pipeline to populate the given PubSub
 * topic with the contents of the {@code --inputFile}, in order to make the example easy to run.
 * If you want to use an independently-populated PubSub topic, indicate this by setting
 * {@code --inputFile=""}. In that case, the auxiliary pipeline will not be started.
 *
 * <p>By default, the pipeline applies fixed windowing, using 1-minute windows. You can
 * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=10}
 * for 10-minute windows.
 */
public class WindowedWordCount {
    private static final Logger LOG = LoggerFactory.getLogger(WindowedWordCount.class);
    static final int WINDOW_SIZE = 1; // Default window duration in minutes

    /**
     * Concept #2: A DoFn that sets the data element timestamp. This is a silly DoFn, just for
     * this example, for the bounded data case.
     *
     * <p>Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate
     * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a
     * 2-hour period.
     */
    static class AddTimestampFn extends DoFn<String, String> {
        private static final Duration RAND_RANGE = Duration.standardHours(2);
        private final Instant minTimestamp;

        AddTimestampFn() {
            this.minTimestamp = new Instant(System.currentTimeMillis());
        }

        @Override
        public void processElement(ProcessContext c) {
            // Generate a random timestamp within the two hours following minTimestamp.
            long randMillis = (long) (Math.random() * RAND_RANGE.getMillis());
            Instant randomTimestamp = minTimestamp.plus(randMillis);
            /**
             * Concept #2: Set the data element with that timestamp.
             */
            c.outputWithTimestamp(c.element(), randomTimestamp);
        }
    }

    /** A DoFn that converts a Word and Count into a BigQuery table row. */
    static class FormatAsTableRowFn extends DoFn<KV<String, Long>, TableRow> {
        @Override
        public void processElement(ProcessContext c) {
            TableRow row = new TableRow()
                    .set("word", c.element().getKey())
                    .set("count", c.element().getValue())
                    // include a field for the window timestamp
                    .set("window_timestamp", c.timestamp().toString());
            c.output(row);
        }
    }
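
    // Illustrative only: an input element such as KV.of("brave", 12L) is emitted by the
    // DoFn above as a TableRow of the form {word: "brave", count: 12, window_timestamp:
    // <a timestamp associated with the window in which the count was computed>}.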

    /**
     * Helper method that defines the BigQuery schema used for the output.
     */
    private static TableSchema getSchema() {
        List<TableFieldSchema> fields = new ArrayList<>();
        fields.add(new TableFieldSchema().setName("word").setType("STRING"));
        fields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
        fields.add(new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP"));
        return new TableSchema().setFields(fields);
    }
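
    // For reference only: the schema built above is equivalent to this BigQuery JSON schema:
    //   [ {"name": "word",             "type": "STRING"},
    //     {"name": "count",            "type": "INTEGER"},
    //     {"name": "window_timestamp", "type": "TIMESTAMP"} ]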

    /**
     * Concept #6: We'll stream the results to a BigQuery table. The BigQuery output source is one
     * that supports both bounded and unbounded data. This is a helper method that creates a
     * TableReference from input options, to tell the pipeline where to write its BigQuery results.
     */
    private static TableReference getTableReference(Options options) {
        TableReference tableRef = new TableReference();
        tableRef.setProjectId(options.getProject());
        tableRef.setDatasetId(options.getBigQueryDataset());
        tableRef.setTableId(options.getBigQueryTable());
        return tableRef;
    }

    /**
     * Options supported by {@link WindowedWordCount}.
     *
     * <p>Inherits standard example configuration options, which allow specification of the BigQuery
     * table and the PubSub topic, as well as the {@link WordCount.WordCountOptions} support for
     * specification of the input file.
     */
    public interface Options extends WordCount.WordCountOptions, DataflowExampleOptions,
            ExamplePubsubTopicOptions, ExampleBigQueryTableOptions {
        @Description("Fixed window duration, in minutes")
        @Default.Integer(WINDOW_SIZE)
        Integer getWindowSize();

        void setWindowSize(Integer value);

        @Description("Whether to run the pipeline with unbounded input")
        boolean isUnbounded();

        void setUnbounded(boolean value);
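
        // These options surface as command-line flags, for example (all values are
        // placeholders): --windowSize=10 --unbounded=true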
    }

    public static void main(String[] args) throws IOException {
        Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
        options.setBigQuerySchema(getSchema());
        // DataflowExampleUtils creates the necessary input sources to simplify execution of this
        // Pipeline.
        DataflowExampleUtils exampleDataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());

        Pipeline pipeline = Pipeline.create(options);

        /**
         * Concept #1: the Dataflow SDK lets us run the same pipeline with either a bounded or
         * unbounded input source.
         */
        PCollection<String> input;
        if (options.isUnbounded()) {
            LOG.info("Reading from PubSub.");
            /**
             * Concept #3: Read from the PubSub topic. A topic will be created if it wasn't
             * specified as an argument. The data elements' timestamps will come from the pubsub
             * injection.
             */
            input = pipeline.apply(PubsubIO.Read.topic(options.getPubsubTopic()));
        } else {
            /** Else, this is a bounded pipeline. Read from the GCS file. */
            input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
                    // Concept #2: Add an element timestamp, using an artificial time just to show windowing.
                    // See AddTimestampFn for more detail on this.
                    .apply(ParDo.of(new AddTimestampFn()));
        }

        /**
         * Concept #4: Window into fixed windows. The fixed window size for this example defaults to 1
         * minute (you can change this with a command-line option). See the documentation for more
         * information on how fixed windows work, and for information on the other types of windowing
         * available (e.g., sliding windows).
         */
        PCollection<String> windowedWords = input
                .apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
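
        /**
         * For comparison only, not part of this example: the same input could instead be
         * windowed with the SDK's SlidingWindows transform (imported from
         * com.google.cloud.dataflow.sdk.transforms.windowing), for example:
         *
         *   input.apply(Window.<String>into(
         *       SlidingWindows.of(Duration.standardMinutes(30))
         *           .every(Duration.standardMinutes(5))));
         */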

        /**
         * Concept #5: Re-use our existing CountWords transform, which knows nothing about
         * windows, over a PCollection containing windowed values.
         */
        PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

        /**
         * Concept #6: Format the results for a BigQuery table, then write to BigQuery.
         * The BigQuery output source supports both bounded and unbounded data.
         */
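        // Note: CREATE_IF_NEEDED creates the output table if it does not already exist, and
        // WRITE_APPEND appends new rows to any existing table contents rather than replacing them.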
        wordCounts.apply(ParDo.of(new FormatAsTableRowFn()))
                .apply(BigQueryIO.Write.to(getTableReference(options)).withSchema(getSchema())
                        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

        PipelineResult result = pipeline.run();

        /**
         * To mock unbounded input from PubSub, we'll now start an auxiliary 'injector' pipeline that
         * runs for a limited time, and publishes to the input PubSub topic.
         *
         * With an unbounded input source, you will need to explicitly shut down this pipeline when you
         * are done with it, so that you do not continue to be charged for the instances. You can do
         * this via a ctrl-C from the command line, or from the developer's console UI for Dataflow
         * pipelines. The PubSub topic will also be deleted at this time.
         */
        exampleDataflowUtils.mockUnboundedSource(options.getInputFile(), result);
    }
}
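
Usage

For reference, here is a minimal sketch of how the pipeline above might be invoked programmatically. The runner class name and every flag value below are placeholders, not values taken from the example; substitute your own project, staging bucket, and dataset.

import com.google.cloud.dataflow.examples.WindowedWordCount;

public class RunWindowedWordCount {
    public static void main(String[] args) throws Exception {
        // All values are placeholders; the flags mirror those documented in the
        // WindowedWordCount class Javadoc.
        WindowedWordCount.main(new String[] {
                "--project=YOUR_PROJECT_ID",
                "--stagingLocation=gs://YOUR_STAGING_DIRECTORY",
                "--runner=BlockingDataflowPipelineRunner",
                "--bigQueryDataset=YOUR_DATASET",
                "--windowSize=10"
        });
    }
}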