List of usage examples for org.joda.time Duration standardMinutes
public static Duration standardMinutes(long minutes)
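The method returns a fixed-length Duration of minutes * 60,000 milliseconds; a "standard" minute is always 60 seconds, with no calendar or DST adjustment. Before the full pipelines below, here is a minimal, self-contained sketch of the call in isolation (the class name and variables are illustrative, not taken from the sources below):

import org.joda.time.Duration;
import org.joda.time.Instant;

public class StandardMinutesDemo {
    public static void main(String[] args) {
        // A fixed-length duration of exactly 5 minutes (5 * 60,000 ms).
        Duration gap = Duration.standardMinutes(5);
        System.out.println(gap.getMillis());           // 300000
        System.out.println(gap.getStandardMinutes());  // 5

        // Durations combine with instants for simple time arithmetic.
        Instant now = Instant.now();
        Instant later = now.plus(gap);
        System.out.println(later.isAfter(now));        // true
    }
}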
From source file:com.google.cloud.dataflow.tutorials.game.solutions.Exercise6.java
License:Apache License
public static void main(String[] args) throws Exception {
    Exercise6Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(Exercise6Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // Allow the pipeline to be cancelled automatically.
    options.setRunner(DataflowPipelineRunner.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference sessionsTable = new TableReference();
    sessionsTable.setDatasetId(options.getOutputDataset());
    sessionsTable.setProjectId(options.getProject());
    sessionsTable.setTableId(options.getOutputTableName());

    PCollection<GameEvent> rawEvents = pipeline.apply(new Exercise3.ReadGameEvents(options));

    // Extract username/score pairs from the event stream
    PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore",
            MapElements.via((GameEvent gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))
                    .withOutputType(new TypeDescriptor<KV<String, Integer>>() {
                    }));

    // Detect user sessions -- that is, a burst of activity separated by a gap from further
    // activity. Find and record the mean session lengths.
    // This information could help the game designers track the changing user engagement
    // as their set of games changes.
    userEvents
            .apply(Window.named("WindowIntoSessions")
                    .<KV<String, Integer>>into(
                            Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
                    .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()))
            // For this use, we care only about the existence of the session, not any particular
            // information aggregated over it, so the following is an efficient way to do that.
            .apply(Combine.perKey(x -> 0))
            // Get the duration per session.
            .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
            // Re-window to process groups of session sums according to when the sessions complete.
            .apply(Window.named("WindowToExtractSessionMean").<Integer>into(
                    FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
            // Find the mean session duration in each window.
            .apply(Mean.<Integer>globally().withoutDefaults())
            // Write this info to a BigQuery table.
            .apply(ParDo.named("FormatSessions").of(new FormatSessionWindowFn()))
            .apply(BigQueryIO.Write.to(sessionsTable).withSchema(FormatSessionWindowFn.getSchema())
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
}
From source file:com.google.cloud.dataflow.tutorials.game.solutions.Exercise7.java
License:Apache License
public static void main(String[] args) throws Exception {
    Exercise7Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(Exercise7Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    // Allow the pipeline to be cancelled automatically.
    options.setRunner(DataflowPipelineRunner.class);
    Pipeline pipeline = Pipeline.create(options);

    TableReference badUserTable = new TableReference();
    badUserTable.setDatasetId(options.getOutputDataset());
    badUserTable.setProjectId(options.getProject());
    badUserTable.setTableId(options.getOutputTableName() + "_bad_users");

    // Read Events from Pub/Sub using custom timestamps and custom message id label.
    PCollection<KV<String, GameEvent>> sessionedEvents = pipeline
            .apply("ReadGameScoreEvents",
                    PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).idLabel(MESSAGE_ID_ATTRIBUTE)
                            .topic(options.getTopic()))
            .apply("ParseGameScoreEvents", ParDo.of(new ParseEventFn()))
            .apply("KeyGameScoreByEventId",
                    WithKeys.of((GameEvent event) -> event.getEventId())
                            .withKeyType(TypeDescriptor.of(String.class)))
            .apply("SessionizeGameScoreEvents",
                    Window.<KV<String, GameEvent>>into(
                            Sessions.withGapDuration(Duration.standardMinutes(SESSION_GAP_MINUTES)))
                            .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()));

    // Read PlayEvents from Pub/Sub using custom timestamps and custom message id label.
    PCollection<KV<String, PlayEvent>> sessionedPlayEvents = pipeline
            .apply("ReadGamePlayEvents",
                    PubsubIO.Read.timestampLabel(TIMESTAMP_ATTRIBUTE).idLabel(MESSAGE_ID_ATTRIBUTE)
                            .topic(options.getPlayEventsTopic()))
            .apply("ParseGamePlayEvents", ParDo.of(new ParsePlayEventFn()))
            .apply("KeyGamePlayByEventId",
                    WithKeys.of((PlayEvent play) -> play.getEventId())
                            .withKeyType(TypeDescriptor.of(String.class)))
            .apply("SessionizeGamePlayEvents",
                    Window.<KV<String, PlayEvent>>into(
                            Sessions.withGapDuration(Duration.standardMinutes(SESSION_GAP_MINUTES)))
                            .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow()));

    // Compute per-user latency.
    PCollection<KV<String, Long>> userLatency = KeyedPCollectionTuple.of(playTag, sessionedPlayEvents)
            .and(eventTag, sessionedEvents)
            .apply("JoinScorePlayEvents", CoGroupByKey.create())
            .apply("ComputeLatency", ParDo.of(new ComputeLatencyFn()));

    // Create a view onto quantiles of the global latency distribution.
    PCollectionView<List<Long>> globalQuantiles = userLatency.apply("GetLatencies", Values.create())
            // Re-window session results into a global window, and trigger periodically making sure
            // to use the full accumulated window contents.
            .apply("GlobalWindowRetrigger",
                    Window.<Long>into(new GlobalWindows())
                            .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(100)))
                            .accumulatingFiredPanes())
            .apply(((Combine.Globally<Long, List<Long>>) ApproximateQuantiles
                    .<Long>globally(GLOBAL_LATENCY_QUANTILES)).withFanout(GLOBAL_AGGREGATE_FANOUT)
                            .asSingletonView());

    userLatency
            // Use the computed latency distribution as a side-input to filter out likely bad users.
            .apply("DetectBadUsers",
                    ParDo.withSideInputs(globalQuantiles).of(new DoFn<KV<String, Long>, String>() {
                        public void processElement(ProcessContext c) {
                            String user = c.element().getKey();
                            Long latency = c.element().getValue();
                            List<Long> quantiles = c.sideInput(globalQuantiles);
                            // Users in the first quantile are considered spammers, since their
                            // score to play event latency is too low, suggesting a robot.
                            if (latency < quantiles.get(1)) {
                                c.output(user);
                            }
                        }
                    }))
            // We want to emit only a single BigQuery row for every bad user. To do this, we
            // re-key by user, then window globally and trigger on the first element for each key.
            .apply("KeyByUser",
                    WithKeys.of((String user) -> user).withKeyType(TypeDescriptor.of(String.class)))
            .apply("GlobalWindowsTriggerOnFirst",
                    Window.<KV<String, String>>into(new GlobalWindows())
                            .triggering(AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(Duration.standardSeconds(10)))
                            .accumulatingFiredPanes())
            .apply("GroupByUser", GroupByKey.<String, String>create())
            .apply("FormatBadUsers", ParDo.of(new FormatBadUserFn()))
            .apply("WriteBadUsers",
                    BigQueryIO.Write.to(badUserTable).withSchema(FormatBadUserFn.getSchema())
                            .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                            .withWriteDisposition(WriteDisposition.WRITE_APPEND));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
}
From source file:com.google.cloud.sparkdemo.HourlyTeamScore.java
License:Apache License
/**
 * Run a batch pipeline.
 **/
public static void main(String[] args) throws Exception {
    HourlyTeamScoreOptions options = new HourlyTeamScoreOptions();
    options.parse(args);

    SparkConf sc = new SparkConf().setAppName("HourlyTeamScore");
    JavaSparkContext jsc = new JavaSparkContext(sc);
    Configuration hadoopConf = jsc.hadoopConfiguration();
    configureBigQueryOutput(hadoopConf, options.getProject(), options.getDataset(), options.getTableName(),
            options.getTableSchema());

    final Long startMinTimestamp = timestampParser.parseMillis(options.getStartMin());
    final Long stopMinTimestamp = timestampParser.parseMillis(options.getStopMin());
    final Long windowDuration = Duration.standardMinutes(options.getWindowDuration()).getMillis();

    // Run a pipeline to analyze all the data in batch.
    // First, read events from a text file and parse them.
    JavaRDD<GameActionInfo> gameEvents = jsc.textFile(options.getInput()).flatMap(new ParseEventFn())
            // Filter out data before and after the given times so that it is not included
            // in the calculations. As we collect data in batches (say, by day), the batch for
            // the day that we want to analyze could potentially include some late-arriving
            // data from the previous day. If so, we want to weed it out. Similarly, if we include
            // data from the following day (to scoop up late-arriving events from the day we're
            // analyzing), we need to weed out events that fall after the time period we want to
            // analyze.
            .filter((GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp)
            .filter((GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp);

    JavaPairRDD<WithTimestamp<String>, Integer> hourlyTeamScores = gameEvents
            .mapToPair(event -> new Tuple2<>(
                    // Extract the composite key as <team, window_start_time>
                    WithTimestamp.create(event.getTeam(),
                            // Apply Fixed Window by rounding the timestamp down to the nearest
                            // multiple of the window size
                            (event.getTimestamp() / windowDuration) * windowDuration),
                    // Extract the scores as values
                    event.getScore()))
            // Compute the sum of the scores per team per window
            .reduceByKey(new SumScore());

    // Write to a BigQuery table
    JavaPairRDD<String, JsonObject> jsonPairs = hourlyTeamScores.mapToPair(convertToJson);
    jsonPairs.saveAsNewAPIHadoopDataset(hadoopConf);
}
From source file:com.google.cloud.training.dataanalyst.flights.PredictRealtime.java
License:Apache License
@SuppressWarnings("serial") public static void main(String[] args) { // create pipeline from options MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class); boolean streaming = options.getInput().contains("/topics/"); if (streaming) { LOG.info("Creating real-time pipeline that reads from Pub/Sub I/O"); options.setStreaming(true);/*from ww w.ja v a 2 s . c o m*/ options.setRunner(DataflowPipelineRunner.class); } Pipeline p = Pipeline.create(options); // read delays-*.csv into memory for use as a side-input PCollectionView<Map<String, Double>> delays = getAverageDelays(p, options.getDelayPath()); // read flights, either batch or in 1-hr windows every minute PCollection<String> lines; if (streaming) { // real-time for pub-sub lines = p.apply("ReadLines", PubsubIO.Read.topic(options.getInput())) // .apply("window", Window.into(SlidingWindows// .of(Duration.standardMinutes(60))// .every(Duration.standardMinutes(1)))); } else { // batch, from text lines = p.apply("ReadLines", TextIO.Read.from(options.getInput())); } PCollection<Flight> flights = lines.apply("ParseFlights", ParDo.withSideInputs(delays).of(new ParseFlights(delays, streaming))) // ; PCollectionView<Map<String, Double>> arrDelay = flights .apply("airport:hour", ParDo.of(new DoFn<Flight, KV<String, Double>>() { @Override public void processElement(ProcessContext c) throws Exception { Flight f = c.element(); if (f.arrHour != ParseFlights.INVALID_HOUR) { String key = "arr_" + f.toAirport + ":" + f.date + ":" + f.arrHour; double value = f.arrivalDelay; c.output(KV.of(key, value)); } } })) // .apply(Mean.perKey()) // .apply(View.asMap()); PCollection<String> pred = flights.apply("Predict", ParDo.withSideInputs(arrDelay).of(new DoFn<Flight, String>() { // FIXME: distribute predictions to different machines transient TensorflowModel tfModel = new TensorflowModel(options.getModelfile(), options.getGraphfile()); @Override public void processElement(ProcessContext c) throws Exception { Flight f = c.element(); if (f.arrHour == ParseFlights.INVALID_HOUR) { // don't know when this flight is arriving, so predict ... f = f.newCopy(); // get average arrival delay String key = "arr_" + f.toAirport + ":" + f.date + ":" + (f.depHour - 1); Double delay = c.sideInput(arrDelay).get(key); f.averageArrivalDelay = (delay == null) ? 0 : delay; // predict boolean ontime = tfModel.predict(f.getInputFeatures()) > 0.5; // output c.output(f.line + "," + ontime); } } })); if (streaming) { pred.apply("WriteFlights", PubsubIO.Write.topic(options.getOutput())); } else { pred.apply("WriteFlights", TextIO.Write.to(options.getOutput() + "flights").withSuffix(".csv")); } p.run(); }
From source file:com.google.cloud.training.dataanalyst.flights.PredictRealtime.java
License:Apache License
public static Instant toInstant(String date, String hourmin) {
    // e.g.: 2015-01-01 and 0837
    int hrmin = Integer.parseInt(hourmin);
    int hr = hrmin / 100;
    int min = hrmin % 100;
    return Instant.parse(date)
            .plus(Duration.standardHours(hr))
            .plus(Duration.standardMinutes(min));
}
From source file:com.google.cloud.training.dataanalyst.javahelp.f1_11_streaming_pipeline.java
License:Apache License
@SuppressWarnings("serial") public static void main(String[] args) { MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class); options.setStreaming(true);//from www . j av a 2 s.c o m Pipeline p = Pipeline.create(options); String topic = options.getInput(); String output = options.getOutput(); // Build the table schema for the output table. List<TableFieldSchema> fields = new ArrayList<>(); fields.add(new TableFieldSchema().setName("timestamp").setType("TIMESTAMP")); fields.add(new TableFieldSchema().setName("num_words").setType("INTEGER")); TableSchema schema = new TableSchema().setFields(fields); p // .apply("GetMessages", PubsubIO.Read.topic(topic)) // .apply("window", Window.into(SlidingWindows// .of(Duration.standardMinutes(2))// .every(Duration.standardSeconds(30)))) // .apply("WordsPerLine", ParDo.of(new DoFn<String, Integer>() { @Override public void processElement(ProcessContext c) throws Exception { String line = c.element(); c.output(line.split(" ").length); } }))// .apply("WordsInTimeWindow", Sum.integersGlobally().withoutDefaults()) // .apply("ToBQRow", ParDo.of(new DoFn<Integer, TableRow>() { @Override public void processElement(ProcessContext c) throws Exception { TableRow row = new TableRow(); row.set("timestamp", new Date().getTime()); row.set("num_words", c.element()); c.output(row); } })) // .apply(BigQueryIO.Write.to(output)// .withSchema(schema)// .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)); p.run(); }
From source file:com.google.cloud.training.dataanalyst.javahelp.StreamDemoConsumer.java
License:Apache License
@SuppressWarnings("serial") public static void main(String[] args) { MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class); options.setRunner(DataflowPipelineRunner.class); options.setStreaming(true);/*from w w w.j ava 2 s . c o m*/ Pipeline p = Pipeline.create(options); String topic = options.getInput(); String output = options.getOutput(); p // .apply("GetMessages", PubsubIO.Read.topic(topic)) // .apply("window", Window.into(SlidingWindows// .of(Duration.standardMinutes(2))// .every(Duration.standardSeconds(30)))) // .apply("LineLength", ParDo.of(new DoFn<String, Integer>() { @Override public void processElement(ProcessContext c) throws Exception { String line = c.element(); c.output(line.length()); } }))// .apply(Sum.integersGlobally().withoutDefaults()) // .apply("ToString", ParDo.of(new DoFn<Integer, String>() { @Override public void processElement(ProcessContext c) throws Exception { c.output(c.element().toString()); } })) // .apply(PubsubIO.Write.topic(output)); p.run(); }
From source file:com.google.codelabs.dataflow.ExactDollarRides.java
License:Apache License
public static void main(String[] args) {
    CustomPipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(CustomPipelineOptions.class);
    Pipeline p = Pipeline.create(options);

    p.apply(PubsubIO.Read.named("read from PubSub")
            .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
            .timestampLabel("ts").withCoder(TableRowJsonCoder.of()))

            .apply("extract dollars",
                    MapElements.via((TableRow x) -> Double.parseDouble(x.get("meter_increment").toString()))
                            .withOutputType(TypeDescriptor.of(Double.class)))

            .apply("fixed window", Window.into(FixedWindows.of(Duration.standardMinutes(1))))
            .apply("trigger",
                    Window.<Double>triggering(AfterWatermark.pastEndOfWindow()
                            .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(Duration.standardSeconds(1)))
                            .withLateFirings(AfterPane.elementCountAtLeast(1)))
                            .accumulatingFiredPanes().withAllowedLateness(Duration.standardMinutes(5)))

            .apply("sum whole window", Sum.doublesGlobally().withoutDefaults())
            .apply("format rides", ParDo.of(new TransformRides()))

            .apply(PubsubIO.Write.named("WriteToPubsub")
                    .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
                    .withCoder(TableRowJsonCoder.of()));
    p.run();
}
From source file:com.google.codelabs.dataflow.LatestRides.java
License:Apache License
public static void main(String[] args) {
    CustomPipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(CustomPipelineOptions.class);
    Pipeline p = Pipeline.create(options);

    p.apply(PubsubIO.Read.named("read from PubSub")
            .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
            .timestampLabel("ts").withCoder(TableRowJsonCoder.of()))

            .apply("key rides by rideid",
                    MapElements.via((TableRow ride) -> KV.of(ride.get("ride_id").toString(), ride))
                            .withOutputType(new TypeDescriptor<KV<String, TableRow>>() {
                            }))

            .apply("session windows on rides with early firings",
                    Window.<KV<String, TableRow>>into(Sessions.withGapDuration(Duration.standardMinutes(60)))
                            .triggering(AfterWatermark.pastEndOfWindow().withEarlyFirings(
                                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.millis(2000))))
                            .accumulatingFiredPanes().withAllowedLateness(Duration.ZERO))

            .apply("group ride points on same ride", Combine.perKey(new LatestPointCombine()))
            .apply("discard key",
                    MapElements.via((KV<String, TableRow> a) -> a.getValue())
                            .withOutputType(TypeDescriptor.of(TableRow.class)))

            .apply(PubsubIO.Write.named("WriteToPubsub")
                    .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
                    .withCoder(TableRowJsonCoder.of()));
    p.run();
}
From source file:com.google.codelabs.dataflow.PickupRides.java
License:Apache License
public static void main(String[] args) {
    CustomPipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
            .as(CustomPipelineOptions.class);
    Pipeline p = Pipeline.create(options);

    p.apply(PubsubIO.Read.named("read from PubSub")
            .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
            .timestampLabel("ts").withCoder(TableRowJsonCoder.of()))

            .apply("key rides by rideid",
                    MapElements.via((TableRow ride) -> KV.of(ride.get("ride_id").toString(), ride))
                            .withOutputType(new TypeDescriptor<KV<String, TableRow>>() {
                            }))

            .apply("session windows on rides with early firings",
                    Window.<KV<String, TableRow>>into(Sessions.withGapDuration(Duration.standardMinutes(1)))
                            .triggering(AfterWatermark.pastEndOfWindow().withEarlyFirings(
                                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.millis(1000))))
                            .accumulatingFiredPanes().withAllowedLateness(Duration.ZERO))

            .apply("group ride points on same ride", Combine.perKey(new PickupPointCombine()))
            .apply("discard key",
                    MapElements.via((KV<String, TableRow> a) -> a.getValue())
                            .withOutputType(TypeDescriptor.of(TableRow.class)))

            .apply("filter if no pickup",
                    Filter.byPredicate((TableRow a) -> a.get("ride_status").equals("pickup")))

            .apply(PubsubIO.Write.named("WriteToPubsub")
                    .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
                    .withCoder(TableRowJsonCoder.of()));
    p.run();
}