Java tutorial: Apache Beam code snippets (org.apache.beam.examples.snippets.Snippets) used in the Beam web documentation.
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.snippets;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TimePartitioning;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.coders.DoubleCoder;
import org.apache.beam.sdk.io.Compression;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.Watch;
import org.apache.beam.sdk.transforms.join.CoGbkResult;
import org.apache.beam.sdk.transforms.join.CoGroupByKey;
import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.Repeatedly;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.transforms.windowing.WindowFn;
import org.apache.beam.sdk.transforms.windowing.WindowFn.MergeContext;
import org.apache.beam.sdk.transforms.windowing.WindowMappingFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TimestampedValue;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.apache.beam.sdk.values.ValueInSingleWindow;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Code snippets used in webdocs. */
public class Snippets {

  @DefaultCoder(AvroCoder.class)
  static class Quote {
    final String source;
    final String quote;

    public Quote() {
      this.source = "";
      this.quote = "";
    }

    public Quote(String source, String quote) {
      this.source = source;
      this.quote = quote;
    }
  }

  @DefaultCoder(AvroCoder.class)
  static class WeatherData {
    final long year;
    final long month;
    final long day;
    final double maxTemp;

    public WeatherData() {
      this.year = 0;
      this.month = 0;
      this.day = 0;
      this.maxTemp = 0.0f;
    }

    public WeatherData(long year, long month, long day, double maxTemp) {
      this.year = year;
      this.month = month;
      this.day = day;
      this.maxTemp = maxTemp;
    }
  }

  /** Using a Read and Write transform to read/write from/to BigQuery. */
  public static void modelBigQueryIO(Pipeline p) {
    modelBigQueryIO(p, "", "", "");
  }

  public static void modelBigQueryIO(
      Pipeline p, String writeProject, String writeDataset, String writeTable) {
    {
      // [START BigQueryTableSpec]
      String tableSpec = "clouddataflow-readonly:samples.weather_stations";
      // [END BigQueryTableSpec]
    }

    {
      // [START BigQueryTableSpecWithoutProject]
      String tableSpec = "samples.weather_stations";
      // [END BigQueryTableSpecWithoutProject]
    }

    {
      // [START BigQueryTableSpecObject]
      TableReference tableSpec =
          new TableReference()
              .setProjectId("clouddataflow-readonly")
              .setDatasetId("samples")
              .setTableId("weather_stations");
      // [END BigQueryTableSpecObject]
    }

    {
      // [START BigQueryDataTypes]
      TableRow row = new TableRow();
      row.set("string", "abc");
      byte[] rawbytes = {(byte) 0xab, (byte) 0xac};
      row.set("bytes", Base64.getEncoder().encodeToString(rawbytes));
      row.set("integer", 5);
      row.set("float", 0.5);
      row.set("numeric", 5);
      row.set("boolean", true);
      row.set("timestamp", "2018-12-31 12:44:31.744957 UTC");
      row.set("date", "2018-12-31");
      row.set("time", "12:44:31");
      row.set("datetime", "2019-06-11T14:44:31");
      row.set("geography", "POINT(30 10)");
      // [END BigQueryDataTypes]
    }

    {
      String tableSpec = "clouddataflow-readonly:samples.weather_stations";
      // [START BigQueryReadTable]
      PCollection<Double> maxTemperatures =
          p.apply(BigQueryIO.readTableRows().from(tableSpec))
              // Each row is of type TableRow
              .apply(
                  MapElements.into(TypeDescriptors.doubles())
                      .via((TableRow row) -> (Double) row.get("max_temperature")));
      // [END BigQueryReadTable]
    }

    {
      String tableSpec = "clouddataflow-readonly:samples.weather_stations";
      // [START BigQueryReadFunction]
      PCollection<Double> maxTemperatures =
          p.apply(
              BigQueryIO.read(
                      (SchemaAndRecord elem) -> (Double) elem.getRecord().get("max_temperature"))
                  .from(tableSpec)
                  .withCoder(DoubleCoder.of()));
      // [END BigQueryReadFunction]
    }

    {
      // [START BigQueryReadQuery]
      PCollection<Double> maxTemperatures =
          p.apply(
              BigQueryIO.read(
                      (SchemaAndRecord elem) -> (Double) elem.getRecord().get("max_temperature"))
                  .fromQuery(
                      "SELECT max_temperature FROM [clouddataflow-readonly:samples.weather_stations]")
                  .withCoder(DoubleCoder.of()));
      // [END BigQueryReadQuery]
    }

    {
      // [START BigQueryReadQueryStdSQL]
      PCollection<Double> maxTemperatures =
          p.apply(
              BigQueryIO.read(
                      (SchemaAndRecord elem) -> (Double) elem.getRecord().get("max_temperature"))
                  .fromQuery(
                      "SELECT max_temperature FROM `clouddataflow-readonly.samples.weather_stations`")
                  .usingStandardSql()
                  .withCoder(DoubleCoder.of()));
      // [END BigQueryReadQueryStdSQL]
    }

    // [START BigQuerySchemaJson]
    String tableSchemaJson =
        ""
            + "{"
            + " \"fields\": ["
            + " {"
            + " \"name\": \"source\","
            + " \"type\": \"STRING\","
            + " \"mode\": \"NULLABLE\""
            + " },"
            + " {"
            + " \"name\": \"quote\","
            + " \"type\": \"STRING\","
            + " \"mode\": \"REQUIRED\""
            + " }"
            + " ]"
            + "}";
    // [END BigQuerySchemaJson]

    {
      String tableSpec = "clouddataflow-readonly:samples.weather_stations";
      if (!writeProject.isEmpty() && !writeDataset.isEmpty() && !writeTable.isEmpty()) {
        tableSpec = writeProject + ":" + writeDataset + "." + writeTable;
      }

      // [START BigQuerySchemaObject]
      TableSchema tableSchema =
          new TableSchema()
              .setFields(
                  ImmutableList.of(
                      new TableFieldSchema().setName("source").setType("STRING").setMode("NULLABLE"),
                      new TableFieldSchema().setName("quote").setType("STRING").setMode("REQUIRED")));
      // [END BigQuerySchemaObject]

      // [START BigQueryWriteInput]
      /*
      @DefaultCoder(AvroCoder.class)
      static class Quote {
        final String source;
        final String quote;

        public Quote() {
          this.source = "";
          this.quote = "";
        }

        public Quote(String source, String quote) {
          this.source = source;
          this.quote = quote;
        }
      }
      */

      PCollection<Quote> quotes =
          p.apply(
              Create.of(
                  new Quote("Mahatma Gandhi", "My life is my message."),
                  new Quote("Yoda", "Do, or do not. There is no 'try'.")));
      // [END BigQueryWriteInput]

      // [START BigQueryWriteTable]
      quotes
          .apply(
              MapElements.into(TypeDescriptor.of(TableRow.class))
                  .via(
                      (Quote elem) ->
                          new TableRow().set("source", elem.source).set("quote", elem.quote)))
          .apply(
              BigQueryIO.writeTableRows()
                  .to(tableSpec)
                  .withSchema(tableSchema)
                  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                  .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE));
      // [END BigQueryWriteTable]

      // [START BigQueryWriteFunction]
      quotes.apply(
          BigQueryIO.<Quote>write()
              .to(tableSpec)
              .withSchema(tableSchema)
              .withFormatFunction(
                  (Quote elem) ->
                      new TableRow().set("source", elem.source).set("quote", elem.quote))
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE));
      // [END BigQueryWriteFunction]

      // [START BigQueryWriteJsonSchema]
      quotes.apply(
          BigQueryIO.<Quote>write()
              .to(tableSpec)
              .withJsonSchema(tableSchemaJson)
              .withFormatFunction(
                  (Quote elem) ->
                      new TableRow().set("source", elem.source).set("quote", elem.quote))
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE));
      // [END BigQueryWriteJsonSchema]
    }

    {
      // [START BigQueryWriteDynamicDestinations]
      /*
      @DefaultCoder(AvroCoder.class)
      static class WeatherData {
        final long year;
        final long month;
        final long day;
        final double maxTemp;

        public WeatherData() {
          this.year = 0;
          this.month = 0;
          this.day = 0;
          this.maxTemp = 0.0f;
        }

        public WeatherData(long year, long month, long day, double maxTemp) {
          this.year = year;
          this.month = month;
          this.day = day;
          this.maxTemp = maxTemp;
        }
      }
      */

      PCollection<WeatherData> weatherData =
          p.apply(
              BigQueryIO.read(
                      (SchemaAndRecord elem) -> {
                        GenericRecord record = elem.getRecord();
                        return new WeatherData(
                            (Long) record.get("year"),
                            (Long) record.get("month"),
                            (Long) record.get("day"),
                            (Double) record.get("max_temperature"));
}).fromQuery("SELECT year, month, day, max_temperature " + "FROM [clouddataflow-readonly:samples.weather_stations] " + "WHERE year BETWEEN 2007 AND 2009").withCoder(AvroCoder.of(WeatherData.class))); // We will send the weather data into different tables for every year. weatherData.apply(BigQueryIO.<WeatherData>write().to(new DynamicDestinations<WeatherData, Long>() { @Override public Long getDestination(ValueInSingleWindow<WeatherData> elem) { return elem.getValue().year; } @Override public TableDestination getTable(Long destination) { return new TableDestination(new TableReference().setProjectId(writeProject) .setDatasetId(writeDataset).setTableId(writeTable + "_" + destination), "Table for year " + destination); } @Override public TableSchema getSchema(Long destination) { return new TableSchema().setFields(ImmutableList.of( new TableFieldSchema().setName("year").setType("INTEGER").setMode("REQUIRED"), new TableFieldSchema().setName("month").setType("INTEGER").setMode("REQUIRED"), new TableFieldSchema().setName("day").setType("INTEGER").setMode("REQUIRED"), new TableFieldSchema().setName("maxTemp").setType("FLOAT").setMode("NULLABLE"))); } }).withFormatFunction((WeatherData elem) -> new TableRow().set("year", elem.year) .set("month", elem.month).set("day", elem.day).set("maxTemp", elem.maxTemp)) .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)); // [END BigQueryWriteDynamicDestinations] String tableSpec = "clouddataflow-readonly:samples.weather_stations"; if (!writeProject.isEmpty() && !writeDataset.isEmpty() && !writeTable.isEmpty()) { tableSpec = writeProject + ":" + writeDataset + "." + writeTable + "_partitioning"; } TableSchema tableSchema = new TableSchema().setFields( ImmutableList.of(new TableFieldSchema().setName("year").setType("INTEGER").setMode("REQUIRED"), new TableFieldSchema().setName("month").setType("INTEGER").setMode("REQUIRED"), new TableFieldSchema().setName("day").setType("INTEGER").setMode("REQUIRED"), new TableFieldSchema().setName("maxTemp").setType("FLOAT").setMode("NULLABLE"))); // [START BigQueryTimePartitioning] weatherData .apply(BigQueryIO.<WeatherData>write().to(tableSpec + "_partitioning").withSchema(tableSchema) .withFormatFunction((WeatherData elem) -> new TableRow().set("year", elem.year) .set("month", elem.month).set("day", elem.day).set("maxTemp", elem.maxTemp)) // NOTE: an existing table without time partitioning set up will not work .withTimePartitioning(new TimePartitioning().setType("DAY")) .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)); // [END BigQueryTimePartitioning] } } /** Helper function to format results in coGroupByKeyTuple. */ public static String formatCoGbkResults(String name, Iterable<String> emails, Iterable<String> phones) { List<String> emailsList = new ArrayList<>(); for (String elem : emails) { emailsList.add("'" + elem + "'"); } Collections.sort(emailsList); String emailsStr = "[" + String.join(", ", emailsList) + "]"; List<String> phonesList = new ArrayList<>(); for (String elem : phones) { phonesList.add("'" + elem + "'"); } Collections.sort(phonesList); String phonesStr = "[" + String.join(", ", phonesList) + "]"; return name + "; " + emailsStr + "; " + phonesStr; } /** Using a CoGroupByKey transform. 
*/ public static PCollection<String> coGroupByKeyTuple(TupleTag<String> emailsTag, TupleTag<String> phonesTag, PCollection<KV<String, String>> emails, PCollection<KV<String, String>> phones) { // [START CoGroupByKeyTuple] PCollection<KV<String, CoGbkResult>> results = KeyedPCollectionTuple.of(emailsTag, emails) .and(phonesTag, phones).apply(CoGroupByKey.create()); PCollection<String> contactLines = results.apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() { @ProcessElement public void processElement(ProcessContext c) { KV<String, CoGbkResult> e = c.element(); String name = e.getKey(); Iterable<String> emailsIter = e.getValue().getAll(emailsTag); Iterable<String> phonesIter = e.getValue().getAll(phonesTag); String formattedResult = Snippets.formatCoGbkResults(name, emailsIter, phonesIter); c.output(formattedResult); } })); // [END CoGroupByKeyTuple] return contactLines; } public static void fileProcessPattern() throws Exception { Pipeline p = Pipeline.create(); // [START FileProcessPatternProcessNewFilesSnip1] // This produces PCollection<MatchResult.Metadata> p.apply(FileIO.match().filepattern("...").continuously(Duration.standardSeconds(30), Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1)))); // [END FileProcessPatternProcessNewFilesSnip1] // [START FileProcessPatternProcessNewFilesSnip2] // This produces PCollection<String> p.apply(TextIO.read().from("<path-to-files>/*").watchForNewFiles( // Check for new files every minute. Duration.standardMinutes(1), // Stop watching the file pattern if no new files appear for an hour. Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1)))); // [END FileProcessPatternProcessNewFilesSnip2] // [START FileProcessPatternAccessMetadataSnip1] p.apply(FileIO.match().filepattern("hdfs://path/to/*.gz")) // The withCompression method is optional. By default, the Beam SDK detects compression from // the filename. .apply(FileIO.readMatches().withCompression(Compression.GZIP)) .apply(ParDo.of(new DoFn<FileIO.ReadableFile, String>() { @ProcessElement public void process(@Element FileIO.ReadableFile file) { // We can now access the file and its metadata. LOG.info("File Metadata resourceId is {} ", file.getMetadata().resourceId()); } })); // [END FileProcessPatternAccessMetadataSnip1] } private static final Logger LOG = LoggerFactory.getLogger(Snippets.class); // [START SideInputPatternSlowUpdateGlobalWindowSnip1] public static void sideInputPatterns() { // This pipeline uses View.asSingleton for a placeholder external service. // Run in debug mode to see the output. Pipeline p = Pipeline.create(); // Create a side input that updates each second. PCollectionView<Map<String, String>> map = p .apply(GenerateSequence.from(0).withRate(1, Duration.standardSeconds(5L))) .apply(Window.<Long>into(new GlobalWindows()) .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane())) .discardingFiredPanes()) .apply(ParDo.of(new DoFn<Long, Map<String, String>>() { @ProcessElement public void process(@Element Long input, OutputReceiver<Map<String, String>> o) { // Replace map with test data from the placeholder external service. // Add external reads here. o.output(PlaceholderExternalService.readTestData()); } })).apply(View.asSingleton()); // Consume side input. GenerateSequence generates test data. // Use a real source (like PubSubIO or KafkaIO) in production. 
    p.apply(GenerateSequence.from(0).withRate(1, Duration.standardSeconds(1L)))
        .apply(Window.into(FixedWindows.of(Duration.standardSeconds(1))))
        .apply(Sum.longsGlobally().withoutDefaults())
        .apply(
            ParDo.of(
                    new DoFn<Long, KV<Long, Long>>() {
                      @ProcessElement
                      public void process(ProcessContext c) {
                        Map<String, String> keyMap = c.sideInput(map);
                        c.outputWithTimestamp(KV.of(1L, c.element()), Instant.now());

                        LOG.debug(
                            "Value is {}, key A is {}, and key B is {}.",
                            c.element(),
                            keyMap.get("Key_A"),
                            keyMap.get("Key_B"));
                      }
                    })
                .withSideInputs(map));
  }

  /** Placeholder class that represents an external service generating test data. */
  public static class PlaceholderExternalService {

    public static Map<String, String> readTestData() {
      Map<String, String> map = new HashMap<>();
      Instant now = Instant.now();
      // Joda-Time pattern for hour:minute:second.
      DateTimeFormatter dtf = DateTimeFormat.forPattern("HH:mm:ss");
      map.put("Key_A", now.minus(Duration.standardSeconds(30)).toString(dtf));
      map.put("Key_B", now.minus(Duration.standardSeconds(30)).toString());
      return map;
    }
  }
  // [END SideInputPatternSlowUpdateGlobalWindowSnip1]

  // [START AccessingValueProviderInfoAfterRunSnip1]
  /** Sample of PipelineOptions with a ValueProvider option argument. */
  public interface MyOptions extends PipelineOptions {
    @Description("My option")
    @Default.String("Hello world!")
    ValueProvider<String> getStringValue();

    void setStringValue(ValueProvider<String> value);
  }

  public static void accessingValueProviderInfoAfterRunSnip1(String[] args) {

    MyOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);

    // Create pipeline.
    Pipeline p = Pipeline.create(options);

    // Add a branch for logging the ValueProvider value.
    p.apply(Create.of(1))
        .apply(
            ParDo.of(
                new DoFn<Integer, Integer>() {

                  // Define the DoFn that logs the ValueProvider value.
                  @ProcessElement
                  public void process(ProcessContext c) {
                    MyOptions ops = c.getPipelineOptions().as(MyOptions.class);
                    // This example logs the ValueProvider value, but you could store it by
                    // pushing it to an external database.
                    LOG.info("Option StringValue was {}", ops.getStringValue());
                  }
                }));

    // The main pipeline.
    p.apply(Create.of(1, 2, 3, 4)).apply(Sum.integersGlobally());

    p.run();
  }
  // [END AccessingValueProviderInfoAfterRunSnip1]

  private static final Duration gapDuration = Duration.standardSeconds(10L);

  // [START CustomSessionWindow1]
  public Collection<IntervalWindow> assignWindows(WindowFn.AssignContext c) {
    // Assign each element into a window from its timestamp until gapDuration in the
    // future. Overlapping windows (representing elements within gapDuration of
    // each other) will be merged.
    return Arrays.asList(new IntervalWindow(c.timestamp(), gapDuration));
  }
  // [END CustomSessionWindow1]

  // [START CustomSessionWindow2]
  public static class DynamicSessions extends WindowFn<TableRow, IntervalWindow> {

    /** Duration of the gaps between sessions. */
    private final Duration gapDuration;

    /** Creates a {@code DynamicSessions} {@link WindowFn} with the specified gap duration. */
    private DynamicSessions(Duration gapDuration) {
      this.gapDuration = gapDuration;
    }
    // [END CustomSessionWindow2]

    // [START CustomSessionWindow3]
    @Override
    public Collection<IntervalWindow> assignWindows(AssignContext c) {
      // Assign each element into a window from its timestamp until gapDuration in the
      // future. Overlapping windows (representing elements within gapDuration of
      // each other) will be merged.
      Duration dataDrivenGap;
      TableRow message = c.element();

      try {
        dataDrivenGap = Duration.standardSeconds(Long.parseLong(message.get("gap").toString()));
      } catch (Exception e) {
        dataDrivenGap = gapDuration;
      }
      return Arrays.asList(new IntervalWindow(c.timestamp(), dataDrivenGap));
    }
    // [END CustomSessionWindow3]

    // [START CustomSessionWindow4]
    /** Creates a {@code DynamicSessions} {@link WindowFn} with the specified gap duration. */
    public static DynamicSessions withDefaultGapDuration(Duration gapDuration) {
      return new DynamicSessions(gapDuration);
    }
    // [END CustomSessionWindow4]

    @Override
    public void mergeWindows(MergeContext c) throws Exception {
      // Merge overlapping interval windows so that elements within the gap form one session.
      org.apache.beam.sdk.transforms.windowing.MergeOverlappingIntervalWindows.mergeWindows(c);
    }

    @Override
    public boolean isCompatible(WindowFn<?, ?> other) {
      return other instanceof DynamicSessions;
    }

    @Override
    public Coder<IntervalWindow> windowCoder() {
      return IntervalWindow.getCoder();
    }

    @Override
    public WindowMappingFn<IntervalWindow> getDefaultWindowMappingFn() {
      // A merging WindowFn cannot be used for side input windows.
      throw new UnsupportedOperationException("DynamicSessions is not allowed in side inputs");
    }
  }

  public static class CustomSessionPipeline {

    public static void main(String[] args) {

      // [START CustomSessionWindow5]
      PCollection<TableRow> p =
          Pipeline.create()
              .apply(
                  "Create data",
                  Create.timestamped(
                      TimestampedValue.of(
                          new TableRow().set("user", "mobile").set("score", 12).set("gap", 5),
                          new Instant()),
                      TimestampedValue.of(
                          new TableRow().set("user", "desktop").set("score", 4), new Instant()),
                      TimestampedValue.of(
                          new TableRow().set("user", "mobile").set("score", -3).set("gap", 5),
                          new Instant().plus(2000)),
                      TimestampedValue.of(
                          new TableRow().set("user", "mobile").set("score", 2).set("gap", 5),
                          new Instant().plus(9000)),
                      TimestampedValue.of(
                          new TableRow().set("user", "mobile").set("score", 7).set("gap", 5),
                          new Instant().plus(12000)),
                      TimestampedValue.of(
                          new TableRow().set("user", "desktop").set("score", 10),
                          new Instant().plus(12000))));
      // [END CustomSessionWindow5]

      // [START CustomSessionWindow6]
      p.apply(
          "Window into sessions",
          Window.<TableRow>into(
              DynamicSessions.withDefaultGapDuration(Duration.standardSeconds(10))));
      // [END CustomSessionWindow6]
    }
  }
}