List of usage examples for org.joda.time Duration standardMinutes
public static Duration standardMinutes(long minutes)
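For orientation before the pipeline examples below, a minimal standalone sketch of what standardMinutes(long) returns: a fixed-length Duration of exactly minutes x 60,000 milliseconds, with no calendar or time-zone awareness. The class name StandardMinutesDemo is illustrative only.

import org.joda.time.Duration;

public class StandardMinutesDemo {
    public static void main(String[] args) {
        // A "standard" minute is exactly 60 seconds, so the result is a fixed
        // millisecond length, independent of time zones and daylight saving.
        Duration thirtyMinutes = Duration.standardMinutes(30);
        System.out.println(thirtyMinutes.getMillis());          // 1800000
        System.out.println(thirtyMinutes.getStandardSeconds()); // 1800
        System.out.println(thirtyMinutes.getStandardMinutes()); // 30
        // Equivalent construction from seconds:
        System.out.println(Duration.standardSeconds(30 * 60).equals(thirtyMinutes)); // true
    }
}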
From source file: com.google.cloud.dataflow.examples.AutoComplete.java
License: Apache License

public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    if (options.isStreaming()) {
        // In order to cancel the pipelines automatically,
        // {@literal DataflowPipelineRunner} is forced to be used.
        options.setRunner(DataflowPipelineRunner.class);
    }
    options.setBigQuerySchema(FormatForBigquery.getSchema());
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);

    // We support running the same pipeline in either
    // batch or windowed streaming mode.
    PTransform<? super PBegin, PCollection<String>> readSource;
    WindowFn<Object, ?> windowFn;
    if (options.isStreaming()) {
        Preconditions.checkArgument(!options.getOutputToDatastore(),
            "DatastoreIO is not supported in streaming.");
        dataflowUtils.setupPubsubTopic();
        readSource = PubsubIO.Read.topic(options.getPubsubTopic());
        windowFn = SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
    } else {
        readSource = TextIO.Read.from(options.getInputFile());
        windowFn = new GlobalWindows();
    }

    // Create the pipeline.
    Pipeline p = Pipeline.create(options);
    PCollection<KV<String, List<CompletionCandidate>>> toWrite = p.apply(readSource)
        .apply(ParDo.of(new ExtractHashtags()))
        .apply(Window.<String>into(windowFn))
        .apply(ComputeTopCompletions.top(10, options.getRecursive()));

    if (options.getOutputToDatastore()) {
        toWrite.apply(ParDo.named("FormatForDatastore").of(new FormatForDatastore(options.getKind())))
            .apply(DatastoreIO.writeTo(options.getProject()));
    }
    if (options.getOutputToBigQuery()) {
        dataflowUtils.setupBigQueryTable();
        TableReference tableRef = new TableReference();
        tableRef.setProjectId(options.getProject());
        tableRef.setDatasetId(options.getBigQueryDataset());
        tableRef.setTableId(options.getBigQueryTable());
        toWrite.apply(ParDo.of(new FormatForBigquery()))
            .apply(BigQueryIO.Write.to(tableRef)
                .withSchema(FormatForBigquery.getSchema())
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
    }

    // Run the pipeline.
    PipelineResult result = p.run();

    if (options.isStreaming() && !options.getInputFile().isEmpty()) {
        // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
        dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
    }

    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
    dataflowUtils.waitToFinish(result);
}
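The windowing choice in the streaming branch above can be read in isolation: 30-minute windows that slide every 5 seconds, so each element lands in many overlapping windows. A minimal sketch, assuming the same Dataflow SDK windowing classes the example imports; the class name SlidingWindowSketch is illustrative only.

import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
import org.joda.time.Duration;

public class SlidingWindowSketch {
    public static void main(String[] args) {
        // 30-minute windows that advance every 5 seconds, as in AutoComplete's streaming branch.
        SlidingWindows windowFn = SlidingWindows.of(Duration.standardMinutes(30)) // window size
            .every(Duration.standardSeconds(5));                                  // slide period

        // Each element falls into (window size / slide period) overlapping windows.
        long windowsPerElement =
            Duration.standardMinutes(30).getMillis() / Duration.standardSeconds(5).getMillis();
        System.out.println(windowsPerElement); // 360
    }
}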
From source file: com.google.cloud.dataflow.examples.complete.AutoComplete.java
License: Apache License

public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    if (options.isStreaming()) {
        // In order to cancel the pipelines automatically,
        // {@literal DataflowPipelineRunner} is forced to be used.
        options.setRunner(DataflowPipelineRunner.class);
    }
    options.setBigQuerySchema(FormatForBigquery.getSchema());
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);

    // We support running the same pipeline in either
    // batch or windowed streaming mode.
    PTransform<? super PBegin, PCollection<String>> readSource;
    WindowFn<Object, ?> windowFn;
    if (options.isStreaming()) {
        Preconditions.checkArgument(!options.getOutputToDatastore(),
            "DatastoreIO is not supported in streaming.");
        dataflowUtils.setupPubsub();
        readSource = PubsubIO.Read.topic(options.getPubsubTopic());
        windowFn = SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
    } else {
        readSource = TextIO.Read.from(options.getInputFile());
        windowFn = new GlobalWindows();
    }

    // Create the pipeline.
    Pipeline p = Pipeline.create(options);
    PCollection<KV<String, List<CompletionCandidate>>> toWrite = p.apply(readSource)
        .apply(ParDo.of(new ExtractHashtags()))
        .apply(Window.<String>into(windowFn))
        .apply(ComputeTopCompletions.top(10, options.getRecursive()));

    if (options.getOutputToDatastore()) {
        toWrite.apply(ParDo.named("FormatForDatastore")
                .of(new FormatForDatastore(options.getKind(), options.getDatastoreAncestorKey())))
            .apply(DatastoreIO.v1().write().withProjectId(
                MoreObjects.firstNonNull(options.getOutputDataset(), options.getProject())));
    }
    if (options.getOutputToBigQuery()) {
        dataflowUtils.setupBigQueryTable();
        TableReference tableRef = new TableReference();
        tableRef.setProjectId(options.getProject());
        tableRef.setDatasetId(options.getBigQueryDataset());
        tableRef.setTableId(options.getBigQueryTable());
        toWrite.apply(ParDo.of(new FormatForBigquery()))
            .apply(BigQueryIO.Write.to(tableRef)
                .withSchema(FormatForBigquery.getSchema())
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(options.isStreaming()
                    ? BigQueryIO.Write.WriteDisposition.WRITE_APPEND
                    : BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
    }

    // Run the pipeline.
    PipelineResult result = p.run();

    if (options.isStreaming() && !options.getInputFile().isEmpty()) {
        // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
        dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
    }

    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
    dataflowUtils.waitToFinish(result);
}
From source file: com.google.cloud.dataflow.examples.complete.game.GameStats.java
License: Apache License

public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // Allow the pipeline to be cancelled automatically.
    options.setRunner(DataflowPipelineRunner.class);
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    // Read Events from Pub/Sub using custom timestamps
    PCollection<GameActionInfo> rawEvents = pipeline
        .apply(PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()))
        .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()));

    // Extract username/score pairs from the event stream
    PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore",
        MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
            .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));

    // Calculate the total score per user over fixed windows, and
    // cumulative updates for late data.
    final PCollectionView<Map<String, Integer>> spammersView = userEvents
        .apply(Window.named("FixedWindowsUser").<KV<String, Integer>>into(
            FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
        // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
        // These might be robots/spammers.
        .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
        // Derive a view from the collection of spammer users. It will be used as a side input
        // in calculating the team score sums, below.
        .apply("CreateSpammersView", View.<String, Integer>asMap());

    // [START DocInclude_FilterAndCalc]
    // Calculate the total score per team over fixed windows,
    // and emit cumulative updates for late data. Uses the side input derived above -- the set of
    // suspected robots -- to filter out scores from those users from the sum.
    // Write the results to BigQuery.
    rawEvents
        .apply(Window.named("WindowIntoFixedWindows").<GameActionInfo>into(
            FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
        // Filter out the detected spammer users, using the side input derived above.
        .apply(ParDo.named("FilterOutSpammers").withSideInputs(spammersView)
            .of(new DoFn<GameActionInfo, GameActionInfo>() {
                @Override
                public void processElement(ProcessContext c) {
                    // If the user is not in the spammers Map, output the data element.
                    if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
                        c.output(c.element());
                    }
                }
            }))
        // Extract and sum teamname/score pairs from the event data.
        .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
        // [END DocInclude_FilterAndCalc]
        // Write the result to BigQuery
        .apply("WriteTeamSums", new WriteWindowedToBigQuery<KV<String, Integer>>(
            options.getTablePrefix() + "_team", configureWindowedWrite()));

    // [START DocInclude_SessionCalc]
    // Detect user sessions -- that is, a burst of activity separated by a gap from further
    // activity. Find and record the mean session lengths.
    // This information could help the game designers track the changing user engagement
    // as their set of games changes.
    userEvents
        .apply(Window.named("WindowIntoSessions")
            .<KV<String, Integer>>into(
                Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
            .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()))
        // For this use, we care only about the existence of the session, not any particular
        // information aggregated over it, so the following is an efficient way to do that.
        .apply(Combine.perKey(x -> 0))
        // Get the duration per session.
        .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
        // [END DocInclude_SessionCalc]
        // [START DocInclude_Rewindow]
        // Re-window to process groups of session sums according to when the sessions complete.
        .apply(Window.named("WindowToExtractSessionMean").<Integer>into(
            FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
        // Find the mean session duration in each window.
        .apply(Mean.<Integer>globally().withoutDefaults())
        // Write this info to a BigQuery table.
        .apply("WriteAvgSessionLength", new WriteWindowedToBigQuery<Double>(
            options.getTablePrefix() + "_sessions", configureSessionWindowWrite()));
    // [END DocInclude_Rewindow]

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    dataflowUtils.waitToFinish(result);
}
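The session windowing above turns on the gap duration: events for the same key that arrive more than getSessionGap() standard minutes apart land in separate sessions. A minimal sketch of that window function alone, assuming the same Dataflow SDK Sessions class and a hypothetical 10-minute gap in place of the value read from options.

import com.google.cloud.dataflow.sdk.transforms.windowing.Sessions;
import org.joda.time.Duration;

public class SessionWindowSketch {
    public static void main(String[] args) {
        // Hypothetical gap; GameStats reads this from options.getSessionGap().
        Duration sessionGap = Duration.standardMinutes(10);
        // Events for the same key separated by more than the gap start a new session window.
        // In a real pipeline this WindowFn would be passed to Window.into(...).
        Sessions sessionWindows = Sessions.withGapDuration(sessionGap);
        System.out.println(sessionGap); // PT600S (Joda's ISO-8601 rendering of 10 minutes)
    }
}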
From source file: com.google.cloud.dataflow.examples.complete.game.HourlyTeamScore.java
License: Apache License

/**
 * Run a batch pipeline to do windowed analysis of the data.
 */
// [START DocInclude_HTSMain]
public static void main(String[] args) throws Exception {
    // Begin constructing a pipeline configured by commandline flags.
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);

    final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin()));
    final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin()));

    // Read 'gaming' events from a text file.
    pipeline.apply(TextIO.Read.from(options.getInput()))
        // Parse the incoming data.
        .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))

        // Filter out data before and after the given times so that it is not included
        // in the calculations. As we collect data in batches (say, by day), the batch for the day
        // that we want to analyze could potentially include some late-arriving data from the
        // previous day. If so, we want to weed it out. Similarly, if we include data from the
        // following day (to scoop up late-arriving events from the day we're analyzing), we need
        // to weed out events that fall after the time period we want to analyze.
        // [START DocInclude_HTSFilters]
        .apply("FilterStartTime", Filter.byPredicate(
            (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
        .apply("FilterEndTime", Filter.byPredicate(
            (GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp.getMillis()))
        // [END DocInclude_HTSFilters]

        // [START DocInclude_HTSAddTsAndWindow]
        // Add an element timestamp based on the event log, and apply fixed windowing.
        .apply("AddEventTimestamps",
            WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp())))
        .apply(Window.named("FixedWindowsTeam").<GameActionInfo>into(
            FixedWindows.of(Duration.standardMinutes(options.getWindowDuration()))))
        // [END DocInclude_HTSAddTsAndWindow]

        // Extract and sum teamname/score pairs from the event data.
        .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
        .apply("WriteTeamScoreSums", new WriteWindowedToBigQuery<KV<String, Integer>>(
            options.getTableName(), configureWindowedTableWrite()));

    pipeline.run();
}
From source file: com.google.cloud.dataflow.examples.complete.game.LeaderBoard.java
License: Apache License

public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // For example purposes, allow the pipeline to be easily cancelled instead of running
    // continuously.
    options.setRunner(DataflowPipelineRunner.class);
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub
    // data elements, and parse the data.
    PCollection<GameActionInfo> gameEvents = pipeline
        .apply(PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()))
        .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()));

    // [START DocInclude_WindowAndTrigger]
    // Extract team/score pairs from the event stream, using hour-long windows by default.
    gameEvents
        .apply(Window.named("LeaderboardTeamFixedWindows")
            .<GameActionInfo>into(
                FixedWindows.of(Duration.standardMinutes(options.getTeamWindowDuration())))
            // We will get early (speculative) results as well as cumulative
            // processing of late data.
            .triggering(AfterWatermark.pastEndOfWindow()
                .withEarlyFirings(
                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(FIVE_MINUTES))
                .withLateFirings(
                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_MINUTES)))
            .withAllowedLateness(Duration.standardMinutes(options.getAllowedLateness()))
            .accumulatingFiredPanes())
        // Extract and sum teamname/score pairs from the event data.
        .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
        // Write the results to BigQuery.
        .apply("WriteTeamScoreSums", new WriteWindowedToBigQuery<KV<String, Integer>>(
            options.getTableName() + "_team", configureWindowedTableWrite()));
    // [END DocInclude_WindowAndTrigger]

    // [START DocInclude_ProcTimeTrigger]
    // Extract user/score pairs from the event stream using processing time, via global windowing.
    // Get periodic updates on all users' running scores.
    gameEvents
        .apply(Window.named("LeaderboardUserGlobalWindow").<GameActionInfo>into(new GlobalWindows())
            // Get periodic results every ten minutes.
            .triggering(Repeatedly.forever(
                AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_MINUTES)))
            .accumulatingFiredPanes()
            .withAllowedLateness(Duration.standardMinutes(options.getAllowedLateness())))
        // Extract and sum username/score pairs from the event data.
        .apply("ExtractUserScore", new ExtractAndSumScore("user"))
        // Write the results to BigQuery.
        .apply("WriteUserScoreSums", new WriteToBigQuery<KV<String, Integer>>(
            options.getTableName() + "_user", configureGlobalWindowBigQueryWrite()));
    // [END DocInclude_ProcTimeTrigger]

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    dataflowUtils.waitToFinish(result);
}
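Durations built with standardMinutes appear in three places in the trigger setup above: the fixed window size, the early/late firing delays (the FIVE_MINUTES and TEN_MINUTES constants are presumably defined the same way), and the allowed lateness. A minimal sketch of just that window/trigger construction, assuming the Dataflow SDK classes LeaderBoard already imports, with hypothetical fixed values in place of the option getters and String standing in for the example's element type.

import com.google.cloud.dataflow.sdk.transforms.windowing.AfterProcessingTime;
import com.google.cloud.dataflow.sdk.transforms.windowing.AfterWatermark;
import com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import org.joda.time.Duration;

public class LeaderboardWindowSketch {
    // Hypothetical stand-ins for the example's FIVE_MINUTES / TEN_MINUTES constants.
    private static final Duration FIVE_MINUTES = Duration.standardMinutes(5);
    private static final Duration TEN_MINUTES = Duration.standardMinutes(10);

    static Window.Bound<String> teamWindowing() {
        // Hour-long fixed windows (stand-in for options.getTeamWindowDuration()).
        return Window.<String>into(FixedWindows.of(Duration.standardMinutes(60)))
            // Early (speculative) firings five minutes after the first element in a pane,
            // late firings ten minutes after late data starts arriving.
            .triggering(AfterWatermark.pastEndOfWindow()
                .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(FIVE_MINUTES))
                .withLateFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(TEN_MINUTES)))
            // Keep accepting late data for two hours (stand-in for options.getAllowedLateness()).
            .withAllowedLateness(Duration.standardMinutes(120))
            .accumulatingFiredPanes();
    }

    public static void main(String[] args) {
        // Constructing the transform does not require a running pipeline.
        System.out.println(teamWindowing());
    }
}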
From source file: com.google.cloud.dataflow.examples.complete.TrafficMaxLaneFlow.java
License: Apache License

/**
 * Sets up and starts streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
    TrafficMaxLaneFlowOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(TrafficMaxLaneFlowOptions.class);
    options.setBigQuerySchema(FormatMaxesFn.getSchema());
    // Using DataflowExampleUtils to set up required resources.
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    PCollection<String> input;
    if (options.isUnbounded()) {
        // Read unbounded PubSubIO.
        input = pipeline.apply(PubsubIO.Read.timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
            .subscription(options.getPubsubSubscription()));
    } else {
        // Read bounded PubSubIO.
        input = pipeline.apply(PubsubIO.Read.timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
            .subscription(options.getPubsubSubscription()).maxNumRecords(VALID_INPUTS));
        // To read bounded TextIO files, use:
        // input = pipeline.apply(new ReadFileAndExtractTimestamps(options.getInputFile()));
    }

    input
        // row... => <station route, station speed> ...
        .apply(ParDo.of(new ExtractFlowInfoFn()))
        // map the incoming data stream into sliding windows. The default window duration values
        // work well if you're running the accompanying Pub/Sub generator script with the
        // --replay flag, which simulates pauses in the sensor data publication. You may want to
        // adjust them otherwise.
        .apply(Window.<KV<String, LaneInfo>>into(
            SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
        .apply(new MaxLaneFlow())
        .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatMaxesFn.getSchema()));

    // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
    if (!Strings.isNullOrEmpty(options.getInputFile()) && !Strings.isNullOrEmpty(options.getPubsubTopic())) {
        dataflowUtils.runInjectorPipeline(new ReadFileAndExtractTimestamps(options.getInputFile()),
            options.getPubsubTopic(), PUBSUB_TIMESTAMP_LABEL_KEY);
    }

    // Run the pipeline.
    PipelineResult result = pipeline.run();
    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
    dataflowUtils.waitToFinish(result);
}
From source file: com.google.cloud.dataflow.examples.complete.TrafficRoutes.java
License: Apache License

/**
 * Sets up and starts streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
    TrafficRoutesOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(TrafficRoutesOptions.class);
    options.setBigQuerySchema(FormatStatsFn.getSchema());
    // Using DataflowExampleUtils to set up required resources.
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    PCollection<String> input;
    if (options.isUnbounded()) {
        // Read unbounded PubSubIO.
        input = pipeline.apply(PubsubIO.Read.timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
            .subscription(options.getPubsubSubscription()));
    } else {
        // Read bounded PubSubIO.
        input = pipeline.apply(PubsubIO.Read.timestampLabel(PUBSUB_TIMESTAMP_LABEL_KEY)
            .subscription(options.getPubsubSubscription()).maxNumRecords(VALID_INPUTS));
        // To read bounded TextIO files, use:
        // input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
        //     .apply(ParDo.of(new ExtractTimestamps()));
    }

    input
        // row... => <station route, station speed> ...
        .apply(ParDo.of(new ExtractStationSpeedFn()))
        // map the incoming data stream into sliding windows.
        // The default window duration values work well if you're running the accompanying Pub/Sub
        // generator script without the --replay flag, so that there are no simulated pauses in
        // the sensor data publication. You may want to adjust the values otherwise.
        .apply(Window.<KV<String, StationSpeed>>into(
            SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
        .apply(new TrackSpeed())
        .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatStatsFn.getSchema()));

    // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
    if (!Strings.isNullOrEmpty(options.getInputFile()) && !Strings.isNullOrEmpty(options.getPubsubTopic())) {
        dataflowUtils.runInjectorPipeline(new ReadFileAndExtractTimestamps(options.getInputFile()),
            options.getPubsubTopic(), PUBSUB_TIMESTAMP_LABEL_KEY);
    }

    // Run the pipeline.
    PipelineResult result = pipeline.run();
    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
    dataflowUtils.waitToFinish(result);
}
From source file: com.google.cloud.dataflow.examples.TrafficMaxLaneFlow.java
License: Apache License

/**
 * Sets up and starts streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
    TrafficMaxLaneFlowOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(TrafficMaxLaneFlowOptions.class);
    if (options.isStreaming()) {
        // In order to cancel the pipelines automatically,
        // {@literal DataflowPipelineRunner} is forced to be used.
        options.setRunner(DataflowPipelineRunner.class);
    }
    options.setBigQuerySchema(FormatMaxesFn.getSchema());
    // Using DataflowExampleUtils to set up required resources.
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
    dataflowUtils.setup();

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    PCollection<KV<String, LaneInfo>> input;
    if (options.isStreaming()) {
        input = pipeline.apply(PubsubIO.Read.topic(options.getPubsubTopic()))
            // row... => <stationId, LaneInfo> ...
            .apply(ParDo.of(new ExtractFlowInfoFn(false /* outputTimestamp */)));
    } else {
        input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
            // row... => <stationId, LaneInfo> ...
            .apply(ParDo.of(new ExtractFlowInfoFn(true /* outputTimestamp */)));
    }

    // map the incoming data stream into sliding windows. The default window duration values
    // work well if you're running the accompanying Pub/Sub generator script with the
    // --replay flag, which simulates pauses in the sensor data publication. You may want to
    // adjust them otherwise.
    input.apply(Window.<KV<String, LaneInfo>>into(
            SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
        .apply(new MaxLaneFlow())
        .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatMaxesFn.getSchema()));

    PipelineResult result = pipeline.run();

    if (options.isStreaming() && !options.getInputFile().isEmpty()) {
        // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
        dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
    }

    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
    dataflowUtils.waitToFinish(result);
}
From source file: com.google.cloud.dataflow.examples.TrafficRoutes.java
License: Apache License

/**
 * Sets up and starts streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
    TrafficRoutesOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(TrafficRoutesOptions.class);
    if (options.isStreaming()) {
        // In order to cancel the pipelines automatically,
        // {@literal DataflowPipelineRunner} is forced to be used.
        options.setRunner(DataflowPipelineRunner.class);
    }
    options.setBigQuerySchema(FormatStatsFn.getSchema());
    // Using DataflowExampleUtils to set up required resources.
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
    dataflowUtils.setup();

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    PCollection<KV<String, StationSpeed>> input;
    if (options.isStreaming()) {
        input = pipeline.apply(PubsubIO.Read.topic(options.getPubsubTopic()))
            // row... => <station route, station speed> ...
            .apply(ParDo.of(new ExtractStationSpeedFn(false /* outputTimestamp */)));
    } else {
        input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
            .apply(ParDo.of(new ExtractStationSpeedFn(true /* outputTimestamp */)));
    }

    // map the incoming data stream into sliding windows.
    // The default window duration values work well if you're running the accompanying Pub/Sub
    // generator script without the --replay flag, so that there are no simulated pauses in
    // the sensor data publication. You may want to adjust the values otherwise.
    input.apply(Window.<KV<String, StationSpeed>>into(
            SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
        .apply(new TrackSpeed())
        .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatStatsFn.getSchema()));

    PipelineResult result = pipeline.run();

    if (options.isStreaming() && !options.getInputFile().isEmpty()) {
        // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
        dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
    }

    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
    dataflowUtils.waitToFinish(result);
}
From source file: com.google.cloud.dataflow.examples.TrafficStreamingMaxLaneFlow.java
License: Apache License

/**
 * Sets up and starts streaming pipeline.
 */
public static void main(String[] args) {
    TrafficStreamingMaxLaneFlowOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(TrafficStreamingMaxLaneFlowOptions.class);
    DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    dataflowOptions.setStreaming(true);

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(dataflowOptions.getProject());
    tableRef.setDatasetId(options.getDataset());
    tableRef.setTableId(options.getTable());

    pipeline.apply(PubsubIO.Read.topic(options.getInputTopic()))
        /* map the incoming data stream into sliding windows. The default window duration values
           work well if you're running the accompanying PubSub generator script with the --replay
           flag, which simulates pauses in the sensor data publication. You may want to adjust
           them otherwise. */
        .apply(Window.<String>into(
            SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
        .apply(new MaxLaneFlow())
        .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatMaxesFn.getSchema()));

    /* When you are done running the example, cancel your pipeline so that you do not continue to
       be charged for its instances. You can do this by visiting
       https://console.developers.google.com/project/your-project-name/dataflow/job-id
       in the Developers Console. You should also terminate the generator script so that you do
       not use unnecessary PubSub quota. */
    pipeline.run();
}