List of usage examples for org.joda.time.Duration.standardMinutes
public static Duration standardMinutes(long minutes)
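Before the full pipeline examples, a minimal standalone sketch of what the method returns (the printed values follow from Joda-Time's fixed 60,000 ms standard minute; the class name is ours):

import org.joda.time.Duration;

public class StandardMinutesDemo {
    public static void main(String[] args) {
        // standardMinutes(n) builds a Duration of exactly n * 60,000 milliseconds,
        // assuming standard-length minutes (no daylight-saving or leap adjustments).
        Duration fiveMinutes = Duration.standardMinutes(5);
        System.out.println(fiveMinutes.getMillis());             // 300000
        System.out.println(fiveMinutes.getStandardSeconds());    // 300
        // Durations compare naturally, which is how the Dataflow examples
        // below use them for window sizing:
        System.out.println(fiveMinutes.isLongerThan(Duration.standardSeconds(200))); // true
    }
}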
From source file: com.google.cloud.dataflow.examples.TrafficStreamingRoutes.java
License: Apache License

/**
 * Sets up and starts streaming pipeline.
 */
public static void main(String[] args) {
    TrafficStreamingRoutesOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(TrafficStreamingRoutesOptions.class);
    DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    dataflowOptions.setStreaming(true);
    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(dataflowOptions.getProject());
    tableRef.setDatasetId(options.getDataset());
    tableRef.setTableId(options.getTable());
    pipeline.apply(PubsubIO.Read.topic(options.getInputTopic()))
            /* Map the incoming data stream into sliding windows. The default window duration
             * values work well if you're running the accompanying PubSub generator script
             * without the --replay flag, so that there are no simulated pauses in the sensor
             * data publication. You may want to adjust the values otherwise. */
            .apply(Window.<String>into(SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                    .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
            .apply(new TrackSpeed())
            .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatStatsFn.getSchema()));
    /* When you are done running the example, cancel your pipeline so that you do not continue
     * to be charged for its instances. You can do this by visiting
     * https://console.developers.google.com/project/your-project-name/dataflow/job-id
     * in the Developers Console. You should also terminate the generator script so that you
     * do not use unnecessary PubSub quota. */
    pipeline.run();
}
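For intuition about the window/slide pair above, a small standalone sketch (the 60-minute/5-minute values are illustrative, not the example's actual defaults):

import org.joda.time.Duration;

public class SlidingWindowArithmetic {
    public static void main(String[] args) {
        // Illustrative values: a 60-minute window that slides every 5 minutes.
        Duration windowDuration = Duration.standardMinutes(60);
        Duration slideEvery = Duration.standardMinutes(5);
        // Each element then belongs to windowDuration / slideEvery = 12
        // overlapping windows, so downstream aggregates are recomputed for
        // each of those windows.
        System.out.println(windowDuration.getMillis() / slideEvery.getMillis()); // 12
    }
}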
From source file: com.google.cloud.dataflow.examples.WindowedWordCount.java
License: Apache License

public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    options.setBigQuerySchema(getSchema());
    // DataflowExampleUtils creates the necessary input sources to simplify execution of this
    // Pipeline.
    DataflowExampleUtils exampleDataflowUtils = new DataflowExampleUtils(options, options.isUnbounded());

    Pipeline pipeline = Pipeline.create(options);

    /**
     * Concept #1: the Dataflow SDK lets us run the same pipeline with either a bounded or
     * unbounded input source.
     */
    PCollection<String> input;
    if (options.isUnbounded()) {
        LOG.info("Reading from PubSub.");
        /**
         * Concept #3: Read from the PubSub topic. A topic will be created if it wasn't
         * specified as an argument. The data elements' timestamps will come from the pubsub
         * injection.
         */
        input = pipeline.apply(PubsubIO.Read.topic(options.getPubsubTopic()));
    } else {
        /** Else, this is a bounded pipeline. Read from the GCS file. */
        input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
                // Concept #2: Add an element timestamp, using an artificial time just to show
                // windowing. See AddTimestampFn for more detail on this.
                .apply(ParDo.of(new AddTimestampFn()));
    }

    /**
     * Concept #4: Window into fixed windows. The fixed window size for this example defaults
     * to 1 minute (you can change this with a command-line option). See the documentation for
     * more information on how fixed windows work, and for information on the other types of
     * windowing available (e.g., sliding windows).
     */
    PCollection<String> windowedWords = input
            .apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));

    /**
     * Concept #5: Re-use our existing CountWords transform that does not have knowledge of
     * windows over a PCollection containing windowed values.
     */
    PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

    /**
     * Concept #6: Format the results for a BigQuery table, then write to BigQuery.
     * The BigQuery output source supports both bounded and unbounded data.
     */
    wordCounts.apply(ParDo.of(new FormatAsTableRowFn()))
            .apply(BigQueryIO.Write.to(getTableReference(options)).withSchema(getSchema())
                    .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

    PipelineResult result = pipeline.run();

    /**
     * To mock unbounded input from PubSub, we'll now start an auxiliary 'injector' pipeline
     * that runs for a limited time, and publishes to the input PubSub topic.
     *
     * With an unbounded input source, you will need to explicitly shut down this pipeline when
     * you are done with it, so that you do not continue to be charged for the instances. You
     * can do this via a ctrl-C from the command line, or from the developer's console UI for
     * Dataflow pipelines. The PubSub topic will also be deleted at this time.
     */
    exampleDataflowUtils.mockUnboundedSource(options.getInputFile(), result);
}
From source file: com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows.java
License: Apache License

static Duration getDefaultPeriod(Duration size) {
    if (size.isLongerThan(Duration.standardHours(1))) {
        return Duration.standardHours(1);
    }
    if (size.isLongerThan(Duration.standardMinutes(1))) {
        return Duration.standardMinutes(1);
    }
    if (size.isLongerThan(Duration.standardSeconds(1))) {
        return Duration.standardSeconds(1);
    }
    return Duration.millis(1);
}
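A short standalone sketch of what this tiering yields for a few window sizes (defaultPeriod below re-implements the package-private method above so the example runs outside the SDK; the sample sizes are our own illustration):

import org.joda.time.Duration;

public class DefaultPeriodDemo {
    // Standalone copy of getDefaultPeriod above, since the SDK method is package-private.
    static Duration defaultPeriod(Duration size) {
        if (size.isLongerThan(Duration.standardHours(1))) {
            return Duration.standardHours(1);
        }
        if (size.isLongerThan(Duration.standardMinutes(1))) {
            return Duration.standardMinutes(1);
        }
        if (size.isLongerThan(Duration.standardSeconds(1))) {
            return Duration.standardSeconds(1);
        }
        return Duration.millis(1);
    }

    public static void main(String[] args) {
        // A 30-minute window defaults to a 1-minute period...
        System.out.println(defaultPeriod(Duration.standardMinutes(30))); // PT60S
        // ...while anything longer than an hour defaults to a 1-hour period.
        System.out.println(defaultPeriod(Duration.standardHours(24)));   // PT3600S
        // A window of exactly 1 minute is not *longer than* 1 minute, so it
        // falls through to the 1-second tier.
        System.out.println(defaultPeriod(Duration.standardMinutes(1)));  // PT1S
    }
}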
From source file: com.google.cloud.dataflow.starter.TrafficMaxLaneFlow.java
License: Apache License

/**
 * Sets up and starts streaming pipeline.
 *
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
    TrafficMaxLaneFlowOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(TrafficMaxLaneFlowOptions.class);
    if (options.isStreaming()) {
        // In order to cancel the pipelines automatically,
        // {@literal DataflowPipelineRunner} is forced to be used.
        options.setRunner(DataflowPipelineRunner.class);
    }
    options.setBigQuerySchema(FormatMaxesFn.getSchema());
    // Using DataflowExampleUtils to set up required resources.
    DataflowExampleUtils dataflowUtils = new DataflowExampleUtils(options);
    dataflowUtils.setup();

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    PCollection<KV<String, LaneInfo>> input;
    if (options.isStreaming()) {
        input = pipeline.apply(PubsubIO.Read.topic(options.getPubsubTopic()))
                // row... => <stationId, LaneInfo> ...
                .apply(ParDo.of(new ExtractFlowInfoFn(false /* outputTimestamp */)));
    } else {
        input = pipeline.apply(TextIO.Read.from(options.getInputFile()))
                // row... => <stationId, LaneInfo> ...
                .apply(ParDo.of(new ExtractFlowInfoFn(true /* outputTimestamp */)));
    }
    // Map the incoming data stream into sliding windows. The default window duration values
    // work well if you're running the accompanying Pub/Sub generator script with the
    // --replay flag, which simulates pauses in the sensor data publication. You may want to
    // adjust them otherwise.
    input.apply(Window
            .<KV<String, LaneInfo>>into(SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                    .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
            .apply(new MaxLaneFlow())
            .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatMaxesFn.getSchema()));

    PipelineResult result = pipeline.run();
    if (options.isStreaming() && !options.getInputFile().isEmpty()) {
        // Inject the data into the Pub/Sub topic with a Dataflow batch pipeline.
        // === UNCOMMENT IF GCE DATALOADER IS NOT RUNNING ====
        dataflowUtils.runInjectorPipeline(options.getInputFile(), options.getPubsubTopic());
    }
    // dataflowUtils will try to cancel the pipeline and the injector before the program exits.
    dataflowUtils.waitToFinish(result);
}
From source file: com.google.cloud.dataflow.tutorials.game.Exercise2.java
License: Apache License

/** Run a batch pipeline. */
public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference tableRef = new TableReference();
    tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
    tableRef.setProjectId(options.as(GcpOptions.class).getProject());
    tableRef.setTableId(options.getOutputTableName());

    // Read events from a CSV file and parse them.
    pipeline.apply(TextIO.Read.from(options.getInput()))
            .apply(ParDo.named("ParseGameEvent").of(new ParseEventFn()))
            .apply("AddEventTimestamps", WithTimestamps.of((GameEvent i) -> new Instant(i.getTimestamp())))
            .apply("WindowedTeamScore", new WindowedTeamScore(Duration.standardMinutes(60)))
            // Write the results to BigQuery.
            .apply(ParDo.named("FormatTeamScoreSums").of(new FormatTeamScoreSumsFn()))
            .apply(BigQueryIO.Write.to(tableRef).withSchema(FormatTeamScoreSumsFn.getSchema())
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    pipeline.run();
}
From source file: com.google.cloud.dataflow.tutorials.game.Exercise3.java
License: Apache License

/** Run a batch or streaming pipeline. */
public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference tableRef = new TableReference();
    tableRef.setDatasetId(options.as(Options.class).getOutputDataset());
    tableRef.setProjectId(options.as(GcpOptions.class).getProject());
    tableRef.setTableId(options.getOutputTableName());

    // Read events from either a CSV file or PubSub stream.
    pipeline.apply(new ReadGameEvents(options))
            .apply("WindowedTeamScore", new Exercise2.WindowedTeamScore(Duration.standardMinutes(60)))
            // Write the results to BigQuery.
            .apply(ParDo.named("FormatTeamScoreSums").of(new Exercise2.FormatTeamScoreSumsFn()))
            .apply(BigQueryIO.Write.to(tableRef).withSchema(Exercise2.FormatTeamScoreSumsFn.getSchema())
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    pipeline.run();
}
From source file: com.google.cloud.dataflow.tutorials.game.Exercise4.java
License: Apache License

public static void main(String[] args) throws Exception {
    Exercise4Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(Exercise4Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // For example purposes, allow the pipeline to be easily cancelled instead of running
    // continuously.
    options.setRunner(DataflowPipelineRunner.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference teamTable = new TableReference();
    teamTable.setDatasetId(options.getOutputDataset());
    teamTable.setProjectId(options.getProject());
    teamTable.setTableId(options.getOutputTableName() + "_team");

    TableReference userTable = new TableReference();
    userTable.setDatasetId(options.getOutputDataset());
    userTable.setProjectId(options.getProject());
    userTable.setTableId(options.getOutputTableName() + "_user");

    PCollection<GameEvent> gameEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));

    gameEvents
            .apply("CalculateTeamScores",
                    new CalculateTeamScores(Duration.standardMinutes(options.getTeamWindowDuration()),
                            Duration.standardMinutes(options.getAllowedLateness())))
            // Write the results to BigQuery.
            .apply(ParDo.named("FormatTeamScores").of(new FormatTeamScoreFn()))
            .apply(BigQueryIO.Write.to(teamTable).withSchema(FormatTeamScoreFn.getSchema())
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    gameEvents
            .apply("CalculateUserScores",
                    new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
            // Write the results to BigQuery.
            .apply(ParDo.named("FormatUserScores").of(new FormatUserScoreFn()))
            .apply(BigQueryIO.Write.to(userTable).withSchema(FormatUserScoreFn.getSchema())
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from
    // the command line.
    PipelineResult result = pipeline.run();
}
From source file: com.google.cloud.dataflow.tutorials.game.Exercise5.java
License: Apache License

public static void main(String[] args) throws Exception {
    Exercise5Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(Exercise5Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // Allow the pipeline to be cancelled automatically.
    options.setRunner(DataflowPipelineRunner.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference teamTable = new TableReference();
    teamTable.setDatasetId(options.getOutputDataset());
    teamTable.setProjectId(options.getProject());
    teamTable.setTableId(options.getOutputTableName());

    PCollection<GameEvent> rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));

    // Extract username/score pairs from the event stream.
    PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore",
            MapElements.via((GameEvent gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
                    .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));

    // Calculate the total score per user over fixed windows, and
    // cumulative updates for late data.
    final PCollectionView<Map<String, Integer>> spammersView = userEvents
            .apply(Window.named("FixedWindowsUser").<KV<String, Integer>>into(
                    FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
            // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
            // These might be robots/spammers.
            .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
            // Derive a view from the collection of spammer users. It will be used as a side
            // input in calculating the team score sums, below.
            .apply("CreateSpammersView", View.<String, Integer>asMap());

    // [START EXERCISE 5 PART b]:
    // Calculate the total score per team over fixed windows,
    // and emit cumulative updates for late data. Uses the side input derived above -- the set
    // of suspected robots -- to filter out scores from those users from the sum.
    // Write the results to BigQuery.
    rawEvents
            .apply(Window.named("WindowIntoFixedWindows").<GameEvent>into(
                    FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
            // Filter out the detected spammer users, using the side input derived above.
            // Use ParDo with spammersView side input to filter out spammers.
            .apply(/* TODO: YOUR CODE GOES HERE */ new ChangeMe<PCollection<GameEvent>, GameEvent>())
            // Extract and sum teamname/score pairs from the event data.
            .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"))
            // Write the result to BigQuery.
            .apply(ParDo.named("FormatTeamWindows").of(new FormatTeamWindowFn()))
            .apply(BigQueryIO.Write.to(teamTable).withSchema(FormatTeamWindowFn.getSchema())
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));
    // [END EXERCISE 5 PART b]

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from
    // the command line.
    PipelineResult result = pipeline.run();
}
From source file: com.google.cloud.dataflow.tutorials.game.Exercise8.java
License: Apache License

public static void main(String[] args) throws Exception {
    Exercise8Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(Exercise8Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // Allow the pipeline to be cancelled automatically.
    options.setRunner(DataflowPipelineRunner.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference badUserTable = new TableReference();
    badUserTable.setDatasetId(options.getOutputDataset());
    badUserTable.setProjectId(options.getProject());
    badUserTable.setTableId(options.getOutputTableName() + "_bad_users");

    // Read Events from Pub/Sub using custom timestamps and custom message id label.
    PCollection<KV<String, GameEvent>> sessionedEvents = pipeline
            .apply("ReadGameScoreEvents",
                    PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).idLabel(MESSAGE_ID_ATTRIBUTE)
                            .topic(options.getTopic()))
            .apply("ParseGameScoreEvents", ParDo.of(new BuggyParseEventFn()))
            .apply("KeyGameScoreByEventId",
                    WithKeys.of((GameEvent event) -> event.getEventId())
                            .withKeyType(TypeDescriptor.of(String.class)))
            .apply("SessionizeGameScoreEvents",
                    Window.<KV<String, GameEvent>>into(
                            Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
                            .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()));

    // Read PlayEvents from Pub/Sub using custom timestamps and custom message id label.
    PCollection<KV<String, PlayEvent>> sessionedPlayEvents = pipeline
            .apply("ReadGamePlayEvents",
                    PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).idLabel(MESSAGE_ID_ATTRIBUTE)
                            .topic(options.getPlayEventsTopic()))
            .apply("ParseGamePlayEvents", ParDo.of(new BuggyParsePlayEventFn()))
            .apply("KeyGamePlayByEventId",
                    WithKeys.of((PlayEvent play) -> play.getEventId())
                            .withKeyType(TypeDescriptor.of(String.class)))
            .apply("SessionizeGamePlayEvents",
                    Window.<KV<String, PlayEvent>>into(
                            Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
                            .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()));

    // Compute per-user latency.
    PCollection<KV<String, Long>> userLatency = KeyedPCollectionTuple.of(playTag, sessionedPlayEvents)
            .and(eventTag, sessionedEvents)
            .apply("JoinScorePlayEvents", CoGroupByKey.create())
            .apply("ComputeLatency", ParDo.of(new ComputeLatencyFn()));

    // Create a view onto quantiles of the global latency distribution.
    PCollectionView<List<Long>> globalQuantiles = userLatency.apply("GetLatencies", Values.create())
            // Re-window session results into a global window, and trigger periodically making
            // sure to use the full accumulated window contents.
            .apply("GlobalWindowRetrigger",
                    Window.<Long>into(new GlobalWindows())
                            .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(Duration.standardSeconds(GLOBAL_AGGREGATE_TRIGGER_SEC))))
                            .accumulatingFiredPanes())
            .apply(((Combine.Globally<Long, List<Long>>) ApproximateQuantiles
                    .<Long>globally(GLOBAL_LATENCY_QUANTILES)).withFanout(GLOBAL_AGGREGATE_FANOUT)
                    .asSingletonView());

    userLatency
            // Use the computed latency distribution as a side-input to filter out likely bad
            // users.
            .apply("DetectBadUsers",
                    ParDo.withSideInputs(globalQuantiles).of(new DoFn<KV<String, Long>, String>() {
                        public void processElement(ProcessContext c) {
                            String user = c.element().getKey();
                            Long latency = c.element().getValue();
                            List<Long> quantiles = c.sideInput(globalQuantiles);
                            // Users in the first quantile are considered spammers, since their
                            // score to play event latency is too low, suggesting a robot.
                            if (latency < quantiles.get(1)) {
                                c.output(user);
                            }
                        }
                    }))
            // We want to only emit a single BigQuery row for every bad user. To do this, we
            // re-key by user, then window globally and trigger on the first element for each
            // key.
            .apply("KeyByUser",
                    WithKeys.of((String user) -> user).withKeyType(TypeDescriptor.of(String.class)))
            .apply("GlobalWindowsTriggerOnFirst",
                    Window.<KV<String, String>>into(new GlobalWindows())
                            .triggering(AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(Duration.standardSeconds(10)))
                            .accumulatingFiredPanes())
            .apply("GroupByUser", GroupByKey.<String, String>create())
            .apply("FormatBadUsers", ParDo.of(new FormatBadUserFn()))
            .apply("WriteBadUsers",
                    BigQueryIO.Write.to(badUserTable).withSchema(FormatBadUserFn.getSchema())
                            .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                            .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    userLatency
            .apply("ReKeyFn",
                    // BUG4: We have a hot key. Especially when the cost of the downstream fn is
                    // high, we must ensure we have good sharding.
                    WithKeys.of((KV<String, Long> item) -> "").withKeyType(TypeDescriptor.of(String.class)))
            .apply("WindowAndTriggerOften",
                    Window.<KV<String, KV<String, Long>>>into(new GlobalWindows())
                            .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(Duration.standardSeconds(10))))
                            .discardingFiredPanes())
            .apply("GroupByNewKey", GroupByKey.<String, KV<String, Long>>create())
            .apply("DoExpensiveWork", ParDo.of(new ExpensiveWorkPerElement()));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from
    // the command line.
    PipelineResult result = pipeline.run();
}
From source file: com.google.cloud.dataflow.tutorials.game.solutions.Exercise5.java
License: Apache License

public static void main(String[] args) throws Exception {
    Exercise5Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(Exercise5Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // Allow the pipeline to be cancelled automatically.
    options.setRunner(DataflowPipelineRunner.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference teamTable = new TableReference();
    teamTable.setDatasetId(options.getOutputDataset());
    teamTable.setProjectId(options.getProject());
    teamTable.setTableId(options.getOutputTableName());

    PCollection<GameEvent> rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));

    // Extract username/score pairs from the event stream.
    PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore",
            MapElements.via((GameEvent gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
                    .withOutputType(new TypeDescriptor<KV<String, Integer>>() {}));

    // Calculate the total score per user over fixed windows, and
    // cumulative updates for late data.
    final PCollectionView<Map<String, Integer>> spammersView = userEvents
            .apply(Window.named("FixedWindowsUser").<KV<String, Integer>>into(
                    FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
            // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
            // These might be robots/spammers.
            .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
            // Derive a view from the collection of spammer users. It will be used as a side
            // input in calculating the team score sums, below.
            .apply("CreateSpammersView", View.<String, Integer>asMap());

    // Calculate the total score per team over fixed windows,
    // and emit cumulative updates for late data. Uses the side input derived above -- the set
    // of suspected robots -- to filter out scores from those users from the sum.
    // Write the results to BigQuery.
    rawEvents
            .apply(Window.named("WindowIntoFixedWindows").<GameEvent>into(
                    FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
            // Filter out the detected spammer users, using the side input derived above.
            .apply(ParDo.named("FilterOutSpammers").withSideInputs(spammersView)
                    .of(new DoFn<GameEvent, GameEvent>() {
                        @Override
                        public void processElement(ProcessContext c) {
                            // If the user is not in the spammers Map, output the data element.
                            if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
                                c.output(c.element());
                            }
                        }
                    }))
            // Extract and sum teamname/score pairs from the event data.
            .apply("ExtractTeamScore", new Exercise1.ExtractAndSumScore("team"))
            // Write the result to BigQuery.
            .apply(ParDo.named("FormatTeamWindows").of(new FormatTeamWindowFn()))
            .apply(BigQueryIO.Write.to(teamTable).withSchema(FormatTeamWindowFn.getSchema())
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from
    // the command line.
    PipelineResult result = pipeline.run();
}