List of usage examples for org.joda.time Duration standardMinutes
public static Duration standardMinutes(long minutes)
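Creates a Duration spanning the given number of standard minutes, where each minute is exactly 60,000 milliseconds, with no adjustment for daylight-saving or other calendar anomalies. A minimal standalone sketch of the conversion (the class name StandardMinutesDemo is illustrative only, not taken from any example below):

import org.joda.time.Duration;

public class StandardMinutesDemo {
    public static void main(String[] args) {
        // A standard minute is exactly 60,000 milliseconds.
        Duration fiveMinutes = Duration.standardMinutes(5);
        System.out.println(fiveMinutes.getMillis());          // 300000
        System.out.println(fiveMinutes.getStandardSeconds()); // 300
    }
}

In the Apex and Beam examples that follow, this is the idiomatic way to express window sizes, slide intervals, session gaps, and allowed lateness.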
From source file: org.apache.apex.malhar.stream.sample.WindowedWordCount.java
License: Apache License

/**
 * Populate dag with High-Level API.
 * @param dag
 * @param conf
 */
@Override
public void populateDAG(DAG dag, Configuration conf) {
    TextInput input = new TextInput();
    Collector collector = new Collector();

    // Create stream from the TextInput operator.
    ApexStream<Tuple.TimestampedTuple<String>> stream = StreamFactory
            .fromInput(input, input.output, name("input"))

            // Extract all the words from the input line of text.
            .flatMap(new Function.FlatMapFunction<String, String>() {
                @Override
                public Iterable<String> f(String input) {
                    return Arrays.asList(input.split("[\\p{Punct}\\s]+"));
                }
            }, name("ExtractWords"))

            // Wrap the word with a randomly generated timestamp.
            .map(new AddTimestampFn(), name("AddTimestampFn"));

    // Apply window and trigger option.
    // TODO: change trigger option to atWaterMark when available.
    WindowedStream<Tuple.TimestampedTuple<String>> windowedWords = stream.window(
            new WindowOption.TimeWindows(Duration.standardMinutes(WINDOW_SIZE)),
            new TriggerOption().accumulatingFiredPanes().withEarlyFiringsAtEvery(1));

    WindowedStream<PojoEvent> wordCounts =
            // Perform a countByKey transformation to count the appearance of each word in every time window.
            windowedWords.countByKey(new Function.ToKeyValue<Tuple.TimestampedTuple<String>, String, Long>() {
                @Override
                public Tuple<KeyValPair<String, Long>> f(Tuple.TimestampedTuple<String> input) {
                    return new Tuple.TimestampedTuple<KeyValPair<String, Long>>(input.getTimestamp(),
                            new KeyValPair<String, Long>(input.getValue(), 1L));
                }
            }, name("count words"))

            // Format the output and print out the result.
            .map(new FormatAsTableRowFn(), name("FormatAsTableRowFn")).print(name("console"));

    wordCounts.endWith(collector, collector.input, name("Collector")).populateDag(dag);
}
From source file: org.apache.beam.examples.complete.AutoComplete.java
License: Apache License

public static void runAutocompletePipeline(Options options) throws IOException {
    options.setBigQuerySchema(FormatForBigquery.getSchema());
    ExampleUtils exampleUtils = new ExampleUtils(options);

    // We support running the same pipeline in either
    // batch or windowed streaming mode.
    WindowFn<Object, ?> windowFn;
    if (options.isStreaming()) {
        checkArgument(!options.getOutputToDatastore(), "DatastoreIO is not supported in streaming.");
        windowFn = SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
    } else {
        windowFn = new GlobalWindows();
    }

    // Create the pipeline.
    Pipeline p = Pipeline.create(options);
    PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
            .apply(TextIO.read().from(options.getInputFile()))
            .apply(ParDo.of(new ExtractHashtags()))
            .apply(Window.into(windowFn))
            .apply(ComputeTopCompletions.top(10, options.getRecursive()));

    if (options.getOutputToDatastore()) {
        toWrite.apply("FormatForDatastore",
                ParDo.of(new FormatForDatastore(options.getKind(), options.getDatastoreAncestorKey())))
                .apply(DatastoreIO.v1().write().withProjectId(
                        MoreObjects.firstNonNull(options.getOutputProject(), options.getProject())));
    }

    if (options.getOutputToBigQuery()) {
        exampleUtils.setupBigQueryTable();

        TableReference tableRef = new TableReference();
        tableRef.setProjectId(options.getProject());
        tableRef.setDatasetId(options.getBigQueryDataset());
        tableRef.setTableId(options.getBigQueryTable());

        toWrite.apply(ParDo.of(new FormatForBigquery()))
                .apply(BigQueryIO.writeTableRows().to(tableRef)
                        .withSchema(FormatForBigquery.getSchema())
                        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                        .withWriteDisposition(options.isStreaming()
                                ? BigQueryIO.Write.WriteDisposition.WRITE_APPEND
                                : BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
    }

    if (options.getOutputToChecksum()) {
        PCollection<Long> checksum = toWrite
                .apply(ParDo.of(new DoFn<KV<String, List<CompletionCandidate>>, Long>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                        KV<String, List<CompletionCandidate>> elm = c.element();
                        Long listHash = c.element().getValue().stream().mapToLong(cc -> cc.hashCode()).sum();
                        c.output(Long.valueOf(elm.getKey().hashCode()) + listHash);
                    }
                }))
                .apply(Sum.longsGlobally());

        PAssert.that(checksum).containsInAnyOrder(options.getExpectedChecksum());
    }

    // Run the pipeline.
    PipelineResult result = p.run();

    // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
    exampleUtils.waitToFinish(result);
}
From source file: org.apache.beam.examples.complete.game.GameStats.java
License: Apache License

public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    // Read Events from Pub/Sub using custom timestamps
    PCollection<GameActionInfo> rawEvents = pipeline
            .apply(PubsubIO.readStrings()
                    .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
                    .fromTopic(options.getTopic()))
            .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));

    // Extract username/score pairs from the event stream
    PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore",
            MapElements.into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                    .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

    // Calculate the total score per user over fixed windows, and
    // cumulative updates for late data.
    final PCollectionView<Map<String, Integer>> spammersView = userEvents
            .apply("FixedWindowsUser",
                    Window.into(FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
            // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
            // These might be robots/spammers.
            .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
            // Derive a view from the collection of spammer users. It will be used as a side input
            // in calculating the team score sums, below.
            .apply("CreateSpammersView", View.asMap());

    // [START DocInclude_FilterAndCalc]
    // Calculate the total score per team over fixed windows,
    // and emit cumulative updates for late data. Uses the side input derived above-- the set of
    // suspected robots-- to filter out scores from those users from the sum.
    // Write the results to BigQuery.
    rawEvents
            .apply("WindowIntoFixedWindows",
                    Window.into(FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
            // Filter out the detected spammer users, using the side input derived above.
            .apply("FilterOutSpammers", ParDo.of(new DoFn<GameActionInfo, GameActionInfo>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    // If the user is not in the spammers Map, output the data element.
                    if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
                        c.output(c.element());
                    }
                }
            }).withSideInputs(spammersView))
            // Extract and sum teamname/score pairs from the event data.
            .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
            // [END DocInclude_FilterAndCalc]
            // Write the result to BigQuery
            .apply("WriteTeamSums",
                    new WriteWindowedToBigQuery<>(options.as(GcpOptions.class).getProject(),
                            options.getDataset(), options.getGameStatsTablePrefix() + "_team",
                            configureWindowedWrite()));

    // [START DocInclude_SessionCalc]
    // Detect user sessions-- that is, a burst of activity separated by a gap from further
    // activity. Find and record the mean session lengths.
    // This information could help the game designers track the changing user engagement
    // as their set of games changes.
    userEvents
            .apply("WindowIntoSessions",
                    Window.<KV<String, Integer>>into(
                            Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
                            .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW))
            // For this use, we care only about the existence of the session, not any particular
            // information aggregated over it, so the following is an efficient way to do that.
            .apply(Combine.perKey(x -> 0))
            // Get the duration per session.
            .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
            // [END DocInclude_SessionCalc]
            // [START DocInclude_Rewindow]
            // Re-window to process groups of session sums according to when the sessions complete.
            .apply("WindowToExtractSessionMean",
                    Window.into(FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
            // Find the mean session duration in each window.
            .apply(Mean.<Integer>globally().withoutDefaults())
            // Write this info to a BigQuery table.
            .apply("WriteAvgSessionLength",
                    new WriteWindowedToBigQuery<>(options.as(GcpOptions.class).getProject(),
                            options.getDataset(), options.getGameStatsTablePrefix() + "_sessions",
                            configureSessionWindowWrite()));
    // [END DocInclude_Rewindow]

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
}
From source file: org.apache.beam.examples.complete.game.HourlyTeamScore.java
License: Apache License

/** Run a batch pipeline to do windowed analysis of the data. */
// [START DocInclude_HTSMain]
public static void main(String[] args) throws Exception {
    // Begin constructing a pipeline configured by commandline flags.
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);

    final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin()));
    final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin()));

    // Read 'gaming' events from a text file.
    pipeline.apply(TextIO.read().from(options.getInput()))
            // Parse the incoming data.
            .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))

            // Filter out data before and after the given times so that it is not included
            // in the calculations. As we collect data in batches (say, by day), the batch for the day
            // that we want to analyze could potentially include some late-arriving data from the
            // previous day.
            // If so, we want to weed it out. Similarly, if we include data from the following day
            // (to scoop up late-arriving events from the day we're analyzing), we need to weed out
            // events that fall after the time period we want to analyze.
            // [START DocInclude_HTSFilters]
            .apply("FilterStartTime",
                    Filter.by((GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
            .apply("FilterEndTime",
                    Filter.by((GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp.getMillis()))
            // [END DocInclude_HTSFilters]

            // [START DocInclude_HTSAddTsAndWindow]
            // Add an element timestamp based on the event log, and apply fixed windowing.
            .apply("AddEventTimestamps", WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp())))
            .apply("FixedWindowsTeam",
                    Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowDuration()))))
            // [END DocInclude_HTSAddTsAndWindow]

            // Extract and sum teamname/score pairs from the event data.
            .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
            .apply("WriteTeamScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), true));

    pipeline.run().waitUntilFinish();
}
From source file: org.apache.beam.examples.complete.game.LeaderBoard.java
License: Apache License

public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub
    // data elements, and parse the data.
    PCollection<GameActionInfo> gameEvents = pipeline
            .apply(PubsubIO.readStrings()
                    .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
                    .fromTopic(options.getTopic()))
            .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));

    gameEvents
            .apply("CalculateTeamScores",
                    new CalculateTeamScores(Duration.standardMinutes(options.getTeamWindowDuration()),
                            Duration.standardMinutes(options.getAllowedLateness())))
            // Write the results to BigQuery.
            .apply("WriteTeamScoreSums",
                    new WriteWindowedToBigQuery<>(options.as(GcpOptions.class).getProject(),
                            options.getDataset(), options.getLeaderBoardTableName() + "_team",
                            configureWindowedTableWrite()));

    gameEvents
            .apply("CalculateUserScores",
                    new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness())))
            // Write the results to BigQuery.
            .apply("WriteUserScoreSums",
                    new WriteToBigQuery<>(options.as(GcpOptions.class).getProject(), options.getDataset(),
                            options.getLeaderBoardTableName() + "_user", configureGlobalWindowBigQueryWrite()));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
}
From source file: org.apache.beam.examples.complete.TrafficMaxLaneFlow.java
License: Apache License

public static void runTrafficMaxLaneFlow(TrafficMaxLaneFlowOptions options) throws IOException {
    // Using ExampleUtils to set up required resources.
    ExampleUtils exampleUtils = new ExampleUtils(options);
    exampleUtils.setup();

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    pipeline.apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
            // row... => <station route, station speed> ...
            .apply(ParDo.of(new ExtractFlowInfoFn()))
            // Map the incoming data stream into sliding windows.
            .apply(Window.into(SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                    .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
            .apply(new MaxLaneFlow())
            .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatMaxesFn.getSchema()));

    // Run the pipeline.
    PipelineResult result = pipeline.run();

    // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
    exampleUtils.waitToFinish(result);
}
From source file: org.apache.beam.examples.complete.TrafficRoutes.java
License: Apache License

public static void runTrafficRoutes(TrafficRoutesOptions options) throws IOException {
    // Using ExampleUtils to set up required resources.
    ExampleUtils exampleUtils = new ExampleUtils(options);
    exampleUtils.setup();

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    pipeline.apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
            // row... => <station route, station speed> ...
            .apply(ParDo.of(new ExtractStationSpeedFn()))
            // Map the incoming data stream into sliding windows.
            .apply(Window.into(SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                    .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
            .apply(new TrackSpeed())
            .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatStatsFn.getSchema()));

    // Run the pipeline.
    PipelineResult result = pipeline.run();

    // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
    exampleUtils.waitToFinish(result);
}
From source file: org.apache.beam.examples.snippets.Snippets.java
License: Apache License

public static void fileProcessPattern() throws Exception {
    Pipeline p = Pipeline.create();

    // [START FileProcessPatternProcessNewFilesSnip1]
    // This produces PCollection<MatchResult.Metadata>
    p.apply(FileIO.match().filepattern("...").continuously(Duration.standardSeconds(30),
            Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
    // [END FileProcessPatternProcessNewFilesSnip1]

    // [START FileProcessPatternProcessNewFilesSnip2]
    // This produces PCollection<String>
    p.apply(TextIO.read().from("<path-to-files>/*").watchForNewFiles(
            // Check for new files every minute.
            Duration.standardMinutes(1),
            // Stop watching the file pattern if no new files appear for an hour.
            Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));
    // [END FileProcessPatternProcessNewFilesSnip2]

    // [START FileProcessPatternAccessMetadataSnip1]
    p.apply(FileIO.match().filepattern("hdfs://path/to/*.gz"))
            // The withCompression method is optional. By default, the Beam SDK detects compression from
            // the filename.
            .apply(FileIO.readMatches().withCompression(Compression.GZIP))
            .apply(ParDo.of(new DoFn<FileIO.ReadableFile, String>() {
                @ProcessElement
                public void process(@Element FileIO.ReadableFile file) {
                    // We can now access the file and its metadata.
                    LOG.info("File Metadata resourceId is {} ", file.getMetadata().resourceId());
                }
            }));
    // [END FileProcessPatternAccessMetadataSnip1]
}
From source file: org.apache.beam.examples.tutorial.game.Exercise3.java
License: Apache License

/**
 * Run a batch pipeline to do windowed analysis of the data.
 */
public static void main(String[] args) throws Exception {
    // Begin constructing a pipeline configured by commandline flags.
    ExerciseOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(ExerciseOptions.class);
    Pipeline pipeline = Pipeline.create(options);

    pipeline
            // Read a bounded set of generated data
            .apply(new Input.BoundedGenerator())
            // Extract and sum the windowed teamname/scores
            .apply(new WindowedTeamScore(Duration.standardMinutes(1)))
            // Write the hourly team scores to the "hourly_team_score" table
            .apply(new Output.WriteHourlyTeamScore());

    pipeline.run();
}
From source file: org.apache.beam.examples.WindowedWordCount.java
License: Apache License

static void runWindowedWordCount(Options options) throws IOException {
    final String output = options.getOutput();
    final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
    final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());

    Pipeline pipeline = Pipeline.create(options);

    /*
     * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
     * unbounded input source.
     */
    PCollection<String> input = pipeline
            /* Read from the GCS file. */
            .apply(TextIO.read().from(options.getInputFile()))
            // Concept #2: Add an element timestamp, using an artificial time just to show
            // windowing.
            // See AddTimestampFn for more detail on this.
            .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));

    /*
     * Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1
     * minute (you can change this with a command-line option). See the documentation for more
     * information on how fixed windows work, and for information on the other types of windowing
     * available (e.g., sliding windows).
     */
    PCollection<String> windowedWords = input
            .apply(Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));

    /*
     * Concept #4: Re-use our existing CountWords transform that does not have knowledge of
     * windows over a PCollection containing windowed values.
     */
    PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

    /*
     * Concept #5: Format the results and write to a sharded file partitioned by window, using a
     * simple ParDo operation. Because there may be failures followed by retries, the
     * writes must be idempotent, but the details of writing to files are elided here.
     */
    wordCounts.apply(MapElements.via(new WordCount.FormatAsTextFn()))
            .apply(new WriteOneFilePerWindow(output, options.getNumShards()));

    PipelineResult result = pipeline.run();
    try {
        result.waitUntilFinish();
    } catch (Exception exc) {
        result.cancel();
    }
}