/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.api.datastream;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.SerializationException;
import org.apache.commons.lang3.SerializationUtils;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.Function;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.functions.RichReduceFunction;
import org.apache.flink.api.common.typeinfo.BasicArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.JobGraphBuilder;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.function.aggregation.AggregationFunction;
import org.apache.flink.streaming.api.function.aggregation.AggregationFunction.AggregationType;
import org.apache.flink.streaming.api.function.aggregation.ComparableAggregator;
import org.apache.flink.streaming.api.function.aggregation.SumAggregator;
import org.apache.flink.streaming.api.function.sink.PrintSinkFunction;
import org.apache.flink.streaming.api.function.sink.SinkFunction;
import org.apache.flink.streaming.api.function.sink.WriteFormatAsCsv;
import org.apache.flink.streaming.api.function.sink.WriteFormatAsText;
import org.apache.flink.streaming.api.function.sink.WriteSinkFunctionByBatches;
import org.apache.flink.streaming.api.function.sink.WriteSinkFunctionByMillis;
import org.apache.flink.streaming.api.invokable.SinkInvokable;
import org.apache.flink.streaming.api.invokable.StreamInvokable;
import org.apache.flink.streaming.api.invokable.operator.CounterInvokable;
import org.apache.flink.streaming.api.invokable.operator.FilterInvokable;
import org.apache.flink.streaming.api.invokable.operator.FlatMapInvokable;
import org.apache.flink.streaming.api.invokable.operator.MapInvokable;
import org.apache.flink.streaming.api.invokable.operator.StreamReduceInvokable;
import org.apache.flink.streaming.api.invokable.util.DefaultTimeStamp;
import org.apache.flink.streaming.api.invokable.util.TimeStamp;
import org.apache.flink.streaming.partitioner.BroadcastPartitioner;
import org.apache.flink.streaming.partitioner.DistributePartitioner;
import org.apache.flink.streaming.partitioner.FieldsPartitioner;
import org.apache.flink.streaming.partitioner.ForwardPartitioner;
import org.apache.flink.streaming.partitioner.ShufflePartitioner;
import org.apache.flink.streaming.partitioner.StreamPartitioner;
import org.apache.flink.streaming.util.keys.FieldsKeySelector;
import org.apache.flink.streaming.util.keys.PojoKeySelector;
import org.apache.flink.streaming.util.serialization.FunctionTypeWrapper;
import org.apache.flink.streaming.util.serialization.ObjectTypeWrapper;
import org.apache.flink.streaming.util.serialization.TypeWrapper;

/**
 * A DataStream represents a stream of elements of the same type. A DataStream
 * can be transformed into another DataStream by applying a transformation, for
 * example:
 * <ul>
 * <li>{@link DataStream#map},</li>
 * <li>{@link DataStream#filter}, or</li>
 * <li>{@link DataStream#aggregate}.</li>
 * </ul>
 *
 * @param <OUT>
 *            The type of the DataStream, i.e., the type of the elements of the
 *            DataStream.
 */
public class DataStream<OUT> {

    protected static Integer counter = 0;
    protected final StreamExecutionEnvironment environment;
    protected final String id;
    protected int degreeOfParallelism;
    protected List<String> userDefinedNames;
    protected boolean selectAll;
    protected StreamPartitioner<OUT> partitioner;
    protected final TypeWrapper<OUT> outTypeWrapper;
    protected List<DataStream<OUT>> mergedStreams;

    protected final JobGraphBuilder jobGraphBuilder;

    /**
     * Create a new {@link DataStream} in the given execution environment with
     * partitioning set to forward by default.
     *
     * @param environment
     *            StreamExecutionEnvironment
     * @param operatorType
     *            The type of the operator in the component
     * @param outTypeWrapper
     *            Type of the output
     */
    public DataStream(StreamExecutionEnvironment environment, String operatorType,
            TypeWrapper<OUT> outTypeWrapper) {
        if (environment == null) {
            throw new NullPointerException("context is null");
        }

        counter++;
        this.id = operatorType + "-" + counter.toString();
        this.environment = environment;
        this.degreeOfParallelism = environment.getDegreeOfParallelism();
        this.jobGraphBuilder = environment.getJobGraphBuilder();
        this.userDefinedNames = new ArrayList<String>();
        this.selectAll = false;
        this.partitioner = new ForwardPartitioner<OUT>();
        this.outTypeWrapper = outTypeWrapper;
        this.mergedStreams = new ArrayList<DataStream<OUT>>();
        this.mergedStreams.add(this);
    }

    /**
     * Create a new DataStream by creating a copy of another DataStream.
     *
     * @param dataStream
     *            The DataStream that will be copied.
     */
    public DataStream(DataStream<OUT> dataStream) {
        this.environment = dataStream.environment;
        this.id = dataStream.id;
        this.degreeOfParallelism = dataStream.degreeOfParallelism;
        this.userDefinedNames = new ArrayList<String>(dataStream.userDefinedNames);
        this.selectAll = dataStream.selectAll;
        this.partitioner = dataStream.partitioner;
        this.jobGraphBuilder = dataStream.jobGraphBuilder;
        this.outTypeWrapper = dataStream.outTypeWrapper;
        this.mergedStreams = new ArrayList<DataStream<OUT>>();
        this.mergedStreams.add(this);
        if (dataStream.mergedStreams.size() > 1) {
            for (int i = 1; i < dataStream.mergedStreams.size(); i++) {
                this.mergedStreams.add(new DataStream<OUT>(dataStream.mergedStreams.get(i)));
            }
        }
    }

    /**
     * Partitioning strategy on the stream.
     */
    public static enum ConnectionType {
        SHUFFLE, BROADCAST, FIELD, FORWARD, DISTRIBUTE
    }
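    // A DataStream is normally obtained from a StreamExecutionEnvironment
    // source rather than through the constructors above. A minimal sketch,
    // under the assumption that a fromElements source is available on the
    // environment in this version (hypothetical variable names):
    //
    //   StreamExecutionEnvironment env = StreamExecutionEnvironment
    //           .getExecutionEnvironment();
    //   DataStream<Integer> source = env.fromElements(1, 2, 3);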
    /**
     * Returns the ID of the {@link DataStream}.
     *
     * @return ID of the DataStream
     */
    public String getId() {
        return id;
    }

    /**
     * Gets the degree of parallelism for this operator.
     *
     * @return The parallelism set for this operator.
     */
    public int getParallelism() {
        return this.degreeOfParallelism;
    }

    /**
     * Gets the output type.
     *
     * @return The output type.
     */
    public TypeInformation<OUT> getOutputType() {
        return this.outTypeWrapper.getTypeInfo();
    }

    /**
     * Gets the class of the field at the given position.
     *
     * @param pos
     *            Position of the field
     * @return The class of the field
     */
    @SuppressWarnings("rawtypes")
    protected Class<?> getClassAtPos(int pos) {
        Class<?> type;
        TypeInformation<OUT> outTypeInfo = outTypeWrapper.getTypeInfo();
        if (outTypeInfo.isTupleType()) {
            type = ((TupleTypeInfo) outTypeInfo).getTypeAt(pos).getTypeClass();
        } else if (outTypeInfo instanceof BasicArrayTypeInfo) {
            type = ((BasicArrayTypeInfo) outTypeInfo).getComponentTypeClass();
        } else if (outTypeInfo instanceof PrimitiveArrayTypeInfo) {
            Class<?> clazz = outTypeInfo.getTypeClass();
            if (clazz == boolean[].class) {
                type = Boolean.class;
            } else if (clazz == byte[].class) {
                // byte[] was missing from the original branch list, which made
                // byte array streams fall through to the exception below.
                type = Byte.class;
            } else if (clazz == short[].class) {
                type = Short.class;
            } else if (clazz == int[].class) {
                type = Integer.class;
            } else if (clazz == long[].class) {
                type = Long.class;
            } else if (clazz == float[].class) {
                type = Float.class;
            } else if (clazz == double[].class) {
                type = Double.class;
            } else if (clazz == char[].class) {
                type = Character.class;
            } else {
                throw new IndexOutOfBoundsException("Type could not be determined for array");
            }
        } else if (pos == 0) {
            type = outTypeInfo.getTypeClass();
        } else {
            throw new IndexOutOfBoundsException("Position is out of range");
        }
        return type;
    }

    /**
     * Checks if the given field position is allowed for the output type.
     *
     * @param pos
     *            Position to check
     */
    protected void checkFieldRange(int pos) {
        try {
            getClassAtPos(pos);
        } catch (IndexOutOfBoundsException e) {
            throw new RuntimeException("Selected field is out of range");
        }
    }

    /**
     * Creates a new {@link DataStream} by merging {@link DataStream} outputs
     * of the same type with each other. The DataStreams merged using this
     * operator will be transformed simultaneously.
     *
     * @param streams
     *            The DataStreams to merge output with.
     * @return The merged {@link DataStream}.
     */
    public DataStream<OUT> merge(DataStream<OUT>... streams) {
        DataStream<OUT> returnStream = this.copy();

        for (DataStream<OUT> stream : streams) {
            for (DataStream<OUT> ds : stream.mergedStreams) {
                validateMerge(ds.getId());
                returnStream.mergedStreams.add(ds.copy());
            }
        }
        return returnStream;
    }

    private void validateMerge(String id) {
        for (DataStream<OUT> ds : this.mergedStreams) {
            if (ds.getId().equals(id)) {
                throw new RuntimeException("A DataStream cannot be merged with itself");
            }
        }
    }

    /**
     * Creates a new {@link ConnectedDataStream} by connecting
     * {@link DataStream} outputs of different types with each other. The
     * DataStreams connected using this operator can be used with CoFunctions.
     *
     * @param dataStream
     *            The DataStream with which this stream will be joined.
     * @return The {@link ConnectedDataStream}.
     */
    public <R> ConnectedDataStream<OUT, R> connect(DataStream<R> dataStream) {
        return new ConnectedDataStream<OUT, R>(this, dataStream);
    }
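    // Usage sketch for merge (above), assuming three DataStream<String>
    // instances created elsewhere (hypothetical names):
    //
    //   DataStream<String> all = first.merge(second, third);
    //
    // Subsequent transformations on the returned stream are applied to the
    // elements of all three merged inputs simultaneously.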
    /**
     * Creates a cross (Cartesian product) of a data stream window. The user
     * can implement their own time stamps or use the system time by default.
     *
     * @param dataStreamToCross
     *            {@link DataStream} to cross with.
     * @param windowSize
     *            Size of the windows that will be aligned for both streams in
     *            milliseconds.
     * @param slideInterval
     *            After every function call the windows will be slid by this
     *            interval.
     * @return The transformed {@link DataStream}.
     */
    public <IN2> SingleOutputStreamOperator<Tuple2<OUT, IN2>, ?> windowCross(
            DataStream<IN2> dataStreamToCross, long windowSize, long slideInterval) {
        return this.windowCross(dataStreamToCross, windowSize, slideInterval,
                new DefaultTimeStamp<OUT>(), new DefaultTimeStamp<IN2>());
    }

    /**
     * Creates a cross (Cartesian product) of a data stream window.
     *
     * @param dataStreamToCross
     *            {@link DataStream} to cross with.
     * @param windowSize
     *            Size of the windows that will be aligned for both streams in
     *            milliseconds.
     * @param slideInterval
     *            After every function call the windows will be slid by this
     *            interval.
     * @param timestamp1
     *            User defined time stamps for the first input.
     * @param timestamp2
     *            User defined time stamps for the second input.
     * @return The transformed {@link DataStream}.
     */
    public <IN2> SingleOutputStreamOperator<Tuple2<OUT, IN2>, ?> windowCross(
            DataStream<IN2> dataStreamToCross, long windowSize, long slideInterval,
            TimeStamp<OUT> timestamp1, TimeStamp<IN2> timestamp2) {
        return this.connect(dataStreamToCross).windowCross(windowSize, slideInterval, timestamp1,
                timestamp2);
    }

    /**
     * Creates a join of a data stream based on the given positions.
     *
     * @param dataStreamToJoin
     *            {@link DataStream} to join with.
     * @param windowSize
     *            Size of the windows that will be aligned for both streams in
     *            milliseconds.
     * @param slideInterval
     *            After every function call the windows will be slid by this
     *            interval.
     * @param fieldIn1
     *            The field in the first stream to be matched.
     * @param fieldIn2
     *            The field in the second stream to be matched.
     * @return The transformed {@link DataStream}.
     */
    public <IN2> SingleOutputStreamOperator<Tuple2<OUT, IN2>, ?> windowJoin(
            DataStream<IN2> dataStreamToJoin, long windowSize, long slideInterval, int fieldIn1,
            int fieldIn2) {
        return this.windowJoin(dataStreamToJoin, windowSize, slideInterval,
                new DefaultTimeStamp<OUT>(), new DefaultTimeStamp<IN2>(), fieldIn1, fieldIn2);
    }

    /**
     * Creates a join of a data stream based on the given field expressions.
     *
     * @param dataStreamToJoin
     *            {@link DataStream} to join with.
     * @param windowSize
     *            Size of the windows that will be aligned for both streams in
     *            milliseconds.
     * @param slideInterval
     *            After every function call the windows will be slid by this
     *            interval.
     * @param fieldIn1
     *            The field in the first stream to be matched.
     * @param fieldIn2
     *            The field in the second stream to be matched.
     * @return The transformed {@link DataStream}.
     */
    public <IN2> SingleOutputStreamOperator<Tuple2<OUT, IN2>, ?> windowJoin(
            DataStream<IN2> dataStreamToJoin, long windowSize, long slideInterval,
            String fieldIn1, String fieldIn2) {
        return this.windowJoin(dataStreamToJoin, windowSize, slideInterval,
                new DefaultTimeStamp<OUT>(), new DefaultTimeStamp<IN2>(), fieldIn1, fieldIn2);
    }
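    // Usage sketch for windowJoin, assuming two tuple streams created
    // elsewhere (hypothetical names). The join matches field 0 of both
    // streams over 5000 ms windows slid by 1000 ms:
    //
    //   DataStream<Tuple2<String, Integer>> grades = ...;
    //   DataStream<Tuple2<String, Double>> salaries = ...;
    //   DataStream<Tuple2<Tuple2<String, Integer>, Tuple2<String, Double>>> joined =
    //           grades.windowJoin(salaries, 5000, 1000, 0, 0);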
    /**
     * Creates a join of a data stream based on the given positions.
     *
     * @param dataStreamToJoin
     *            {@link DataStream} to join with.
     * @param windowSize
     *            Size of the windows that will be aligned for both streams in
     *            milliseconds.
     * @param slideInterval
     *            After every function call the windows will be slid by this
     *            interval.
     * @param timestamp1
     *            User defined time stamps for the first input.
     * @param timestamp2
     *            User defined time stamps for the second input.
     * @param fieldIn1
     *            The field in the first stream to be matched.
     * @param fieldIn2
     *            The field in the second stream to be matched.
     * @return The transformed {@link DataStream}.
     */
    public <IN2> SingleOutputStreamOperator<Tuple2<OUT, IN2>, ?> windowJoin(
            DataStream<IN2> dataStreamToJoin, long windowSize, long slideInterval,
            TimeStamp<OUT> timestamp1, TimeStamp<IN2> timestamp2, int fieldIn1, int fieldIn2) {
        return this.connect(dataStreamToJoin).windowJoin(windowSize, slideInterval, timestamp1,
                timestamp2, fieldIn1, fieldIn2);
    }

    /**
     * Creates a join of a data stream based on the given field expressions.
     *
     * @param dataStreamToJoin
     *            {@link DataStream} to join with.
     * @param windowSize
     *            Size of the windows that will be aligned for both streams in
     *            milliseconds.
     * @param slideInterval
     *            After every function call the windows will be slid by this
     *            interval.
     * @param timestamp1
     *            User defined time stamps for the first input.
     * @param timestamp2
     *            User defined time stamps for the second input.
     * @param fieldIn1
     *            The field in the first stream to be matched.
     * @param fieldIn2
     *            The field in the second stream to be matched.
     * @return The transformed {@link DataStream}.
     */
    public <IN2> SingleOutputStreamOperator<Tuple2<OUT, IN2>, ?> windowJoin(
            DataStream<IN2> dataStreamToJoin, long windowSize, long slideInterval,
            TimeStamp<OUT> timestamp1, TimeStamp<IN2> timestamp2, String fieldIn1, String fieldIn2) {
        return this.connect(dataStreamToJoin).windowJoin(windowSize, slideInterval, timestamp1,
                timestamp2, fieldIn1, fieldIn2);
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output is
     * partitioned by the selected fields.
     *
     * @param fields
     *            The fields to partition by.
     * @return The DataStream with fields partitioning set.
     */
    public DataStream<OUT> partitionBy(int... fields) {
        return setConnectionType(new FieldsPartitioner<OUT>(FieldsKeySelector.getSelector(
                getOutputType(), fields)));
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output is
     * partitioned by the given field expressions.
     *
     * @param fields
     *            The field expressions to partition by.
     * @return The DataStream with fields partitioning set.
     */
    public DataStream<OUT> partitionBy(String... fields) {
        return setConnectionType(new FieldsPartitioner<OUT>(new PojoKeySelector<OUT>(
                getOutputType(), fields)));
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output is
     * partitioned using the given {@link KeySelector}.
     *
     * @param keySelector
     *            The {@link KeySelector} used to extract the partitioning key.
     * @return The DataStream with fields partitioning set.
     */
    public DataStream<OUT> partitionBy(KeySelector<OUT, ?> keySelector) {
        return setConnectionType(new FieldsPartitioner<OUT>(keySelector));
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output
     * tuples are broadcast to every parallel instance of the next component.
     *
     * @return The DataStream with broadcast partitioning set.
     */
    public DataStream<OUT> broadcast() {
        return setConnectionType(new BroadcastPartitioner<OUT>());
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output
     * tuples are shuffled to the next component.
     *
     * @return The DataStream with shuffle partitioning set.
     */
    public DataStream<OUT> shuffle() {
        return setConnectionType(new ShufflePartitioner<OUT>());
    }

    /**
     * Sets the partitioning of the {@link DataStream} so that the output
     * tuples are forwarded to the local subtask of the next component. This is
     * the default partitioner setting.
     *
     * @return The DataStream with forward partitioning set.
     */
    public DataStream<OUT> forward() {
        return setConnectionType(new ForwardPartitioner<OUT>());
    }
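    // The partition setters in this section only determine how output elements
    // are routed to the next operator; they do not transform elements. A short
    // sketch (hypothetical names):
    //
    //   stream.partitionBy(0).map(mapper);  // same key goes to the same subtask
    //   stream.broadcast().map(mapper);     // every subtask sees every element
    //   stream.shuffle().map(mapper);       // random redistribution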
    /**
     * Sets the partitioning of the {@link DataStream} so that the output
     * tuples are distributed evenly to the next component.
     *
     * @return The DataStream with distribute partitioning set.
     */
    public DataStream<OUT> distribute() {
        return setConnectionType(new DistributePartitioner<OUT>());
    }

    /**
     * Applies a Map transformation on a {@link DataStream}. The transformation
     * calls a {@link MapFunction} for each element of the DataStream. Each
     * MapFunction call returns exactly one element. The user can also extend
     * {@link RichMapFunction} to gain access to other features provided by the
     * {@link org.apache.flink.api.common.functions.RichFunction} interface.
     *
     * @param mapper
     *            The MapFunction that is called for each element of the
     *            DataStream.
     * @param <R>
     *            output type
     * @return The transformed {@link DataStream}.
     */
    public <R> SingleOutputStreamOperator<R, ?> map(MapFunction<OUT, R> mapper) {
        FunctionTypeWrapper<OUT> inTypeWrapper = new FunctionTypeWrapper<OUT>(mapper,
                MapFunction.class, 0);
        FunctionTypeWrapper<R> outTypeWrapper = new FunctionTypeWrapper<R>(mapper,
                MapFunction.class, 1);

        return addFunction("map", mapper, inTypeWrapper, outTypeWrapper,
                new MapInvokable<OUT, R>(mapper));
    }

    /**
     * Applies a FlatMap transformation on a {@link DataStream}. The
     * transformation calls a {@link FlatMapFunction} for each element of the
     * DataStream. Each FlatMapFunction call can return any number of elements
     * including none. The user can also extend {@link RichFlatMapFunction} to
     * gain access to other features provided by the
     * {@link org.apache.flink.api.common.functions.RichFunction} interface.
     *
     * @param flatMapper
     *            The FlatMapFunction that is called for each element of the
     *            DataStream
     * @param <R>
     *            output type
     * @return The transformed {@link DataStream}.
     */
    public <R> SingleOutputStreamOperator<R, ?> flatMap(FlatMapFunction<OUT, R> flatMapper) {
        FunctionTypeWrapper<OUT> inTypeWrapper = new FunctionTypeWrapper<OUT>(flatMapper,
                FlatMapFunction.class, 0);
        FunctionTypeWrapper<R> outTypeWrapper = new FunctionTypeWrapper<R>(flatMapper,
                FlatMapFunction.class, 1);

        return addFunction("flatMap", flatMapper, inTypeWrapper, outTypeWrapper,
                new FlatMapInvokable<OUT, R>(flatMapper));
    }

    /**
     * Applies a reduce transformation on the data stream. The user can also
     * extend the {@link RichReduceFunction} to gain access to other features
     * provided by the
     * {@link org.apache.flink.api.common.functions.RichFunction} interface.
     *
     * @param reducer
     *            The {@link ReduceFunction} that will be called for every
     *            element of the input values.
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> reduce(ReduceFunction<OUT> reducer) {
        return addFunction("reduce", reducer, new FunctionTypeWrapper<OUT>(reducer,
                ReduceFunction.class, 0), new FunctionTypeWrapper<OUT>(reducer,
                ReduceFunction.class, 0), new StreamReduceInvokable<OUT>(reducer));
    }
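    // Usage sketch for map and flatMap (above) on a DataStream<String> named
    // lines (hypothetical name); Collector is org.apache.flink.util.Collector:
    //
    //   DataStream<Integer> lengths = lines.map(new MapFunction<String, Integer>() {
    //       @Override
    //       public Integer map(String line) {
    //           return line.length();
    //       }
    //   });
    //
    //   DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
    //       @Override
    //       public void flatMap(String line, Collector<String> out) {
    //           for (String word : line.split(" ")) {
    //               out.collect(word);
    //           }
    //       }
    //   });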
    /**
     * Initiates a Project transformation on a {@link Tuple} {@link DataStream}.<br/>
     * <b>Note: Only Tuple DataStreams can be projected.</b><br/>
     * The transformation projects each Tuple of the DataStream onto a (sub)set
     * of fields.<br/>
     * This method returns a {@link StreamProjection} on which
     * {@link StreamProjection#types(Class)} needs to be called to complete the
     * transformation.
     *
     * @param fieldIndexes
     *            The field indexes of the input tuples that are retained. The
     *            order of fields in the output tuple corresponds to the order
     *            of field indexes.
     * @return A StreamProjection that needs to be converted into a DataStream
     *         to complete the project transformation by calling
     *         {@link StreamProjection#types(Class)}.
     *
     * @see Tuple
     * @see DataStream
     */
    public StreamProjection<OUT> project(int... fieldIndexes) {
        return new StreamProjection<OUT>(this.copy(), fieldIndexes);
    }

    /**
     * Groups the elements of a {@link DataStream} by the given key positions
     * to be used with grouped operators like
     * {@link GroupedDataStream#reduce(ReduceFunction)}.
     *
     * @param fields
     *            The position of the fields on which the {@link DataStream}
     *            will be grouped.
     * @return The grouped {@link DataStream}
     */
    public GroupedDataStream<OUT> groupBy(int... fields) {
        return groupBy(FieldsKeySelector.getSelector(getOutputType(), fields));
    }

    /**
     * Groups a {@link DataStream} using field expressions. A field expression
     * is either the name of a public field or a getter method with parentheses
     * of the {@link DataStream}'s underlying type. A dot can be used to drill
     * down into objects, as in {@code "field1.getInnerField2()" }. This method
     * returns a {@link GroupedDataStream}.
     *
     * @param fields
     *            One or more field expressions on which the DataStream will be
     *            grouped.
     * @return The grouped {@link DataStream}
     */
    public GroupedDataStream<OUT> groupBy(String... fields) {
        return groupBy(new PojoKeySelector<OUT>(getOutputType(), fields));
    }

    /**
     * Groups the elements of a {@link DataStream} by the key extracted by the
     * {@link KeySelector} to be used with grouped operators like
     * {@link GroupedDataStream#reduce(ReduceFunction)}.
     *
     * @param keySelector
     *            The {@link KeySelector} that will be used to extract keys for
     *            the values
     * @return The grouped {@link DataStream}
     */
    public GroupedDataStream<OUT> groupBy(KeySelector<OUT, ?> keySelector) {
        return new GroupedDataStream<OUT>(this, keySelector);
    }

    /**
     * Collects the data stream elements into sliding batches creating a new
     * {@link BatchedDataStream}. The user can apply transformations like
     * {@link BatchedDataStream#reduce}, {@link BatchedDataStream#reduceGroup}
     * or aggregations on the {@link BatchedDataStream}.
     *
     * @param batchSize
     *            The number of elements in each batch at each operator
     * @param slideSize
     *            The number of elements with which the batches are slid by
     *            after each transformation.
     * @return The transformed {@link DataStream}
     */
    public BatchedDataStream<OUT> batch(long batchSize, long slideSize) {
        if (batchSize < 1) {
            throw new IllegalArgumentException("Batch size must be positive");
        }
        if (slideSize < 1) {
            throw new IllegalArgumentException("Slide size must be positive");
        }

        return new BatchedDataStream<OUT>(this, batchSize, slideSize);
    }

    /**
     * Collects the data stream elements into sliding batches creating a new
     * {@link BatchedDataStream}. The user can apply transformations like
     * {@link BatchedDataStream#reduce}, {@link BatchedDataStream#reduceGroup}
     * or aggregations on the {@link BatchedDataStream}.
     *
     * @param batchSize
     *            The number of elements in each batch at each operator
     * @return The transformed {@link DataStream}
     */
    public BatchedDataStream<OUT> batch(long batchSize) {
        return batch(batchSize, batchSize);
    }
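    // Usage sketch: a grouped, rolling sum per word, assuming a
    // DataStream<Tuple2<String, Integer>> named wordCounts (hypothetical name):
    //
    //   DataStream<Tuple2<String, Integer>> counts = wordCounts
    //           .groupBy(0)
    //           .reduce(new ReduceFunction<Tuple2<String, Integer>>() {
    //               @Override
    //               public Tuple2<String, Integer> reduce(Tuple2<String, Integer> a,
    //                       Tuple2<String, Integer> b) {
    //                   return new Tuple2<String, Integer>(a.f0, a.f1 + b.f1);
    //               }
    //           });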
    /**
     * Collects the data stream elements into sliding windows creating a new
     * {@link WindowDataStream}. The user can apply transformations like
     * {@link WindowDataStream#reduce}, {@link WindowDataStream#reduceGroup} or
     * aggregations on the {@link WindowDataStream}.
     *
     * @param windowSize
     *            The length of the window in milliseconds.
     * @param slideInterval
     *            The number of milliseconds with which the windows are slid by
     *            after each transformation.
     * @param timestamp
     *            User defined function for extracting time-stamps from each
     *            element
     * @return The transformed {@link DataStream}
     */
    public WindowDataStream<OUT> window(long windowSize, long slideInterval,
            TimeStamp<OUT> timestamp) {
        if (windowSize < 1) {
            throw new IllegalArgumentException("Window size must be positive");
        }
        if (slideInterval < 1) {
            throw new IllegalArgumentException("Slide interval must be positive");
        }

        return new WindowDataStream<OUT>(this, windowSize, slideInterval, timestamp);
    }

    /**
     * Collects the data stream elements into sliding windows creating a new
     * {@link WindowDataStream}. The user can apply transformations like
     * {@link WindowDataStream#reduce}, {@link WindowDataStream#reduceGroup} or
     * aggregations on the {@link WindowDataStream}.
     *
     * @param windowSize
     *            The length of the window in milliseconds.
     * @param slideInterval
     *            The number of milliseconds with which the windows are slid by
     *            after each transformation.
     * @return The transformed {@link DataStream}
     */
    public WindowDataStream<OUT> window(long windowSize, long slideInterval) {
        return window(windowSize, slideInterval, new DefaultTimeStamp<OUT>());
    }

    /**
     * Collects the data stream elements into sliding windows creating a new
     * {@link WindowDataStream}. The user can apply transformations like
     * {@link WindowDataStream#reduce}, {@link WindowDataStream#reduceGroup} or
     * aggregations on the {@link WindowDataStream}.
     *
     * @param windowSize
     *            The length of the window in milliseconds.
     * @return The transformed {@link DataStream}
     */
    public WindowDataStream<OUT> window(long windowSize) {
        return window(windowSize, windowSize);
    }

    /**
     * Applies an aggregation that sums the data stream at the given position.
     *
     * @param positionToSum
     *            The position in the data point to sum
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> sum(int positionToSum) {
        checkFieldRange(positionToSum);
        return aggregate((AggregationFunction<OUT>) SumAggregator.getSumFunction(positionToSum,
                getClassAtPos(positionToSum), getOutputType()));
    }

    /**
     * Applies an aggregation that gives the sum of the pojo data stream at the
     * given field expression. A field expression is either the name of a
     * public field or a getter method with parentheses of the
     * {@link DataStream}'s underlying type. A dot can be used to drill down
     * into objects, as in {@code "field1.getInnerField2()" }.
     *
     * @param field
     *            The field expression based on which the aggregation will be
     *            applied.
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> sum(String field) {
        return aggregate((AggregationFunction<OUT>) SumAggregator.getSumFunction(field,
                getOutputType()));
    }

    /**
     * Syntactic sugar for sum(0).
     *
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> sum() {
        return sum(0);
    }

    /**
     * Applies an aggregation that gives the minimum of the data stream at the
     * given position.
     *
     * @param positionToMin
     *            The position in the data point to minimize
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> min(int positionToMin) {
        checkFieldRange(positionToMin);
        return aggregate(ComparableAggregator.getAggregator(positionToMin, getOutputType(),
                AggregationType.MIN));
    }
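    // Usage sketch for the windowing and aggregation calls above, assuming a
    // DataStream<Tuple2<String, Integer>> named stream (hypothetical name):
    //
    //   stream.sum(1);              // rolling sum of the field at position 1
    //   stream.window(5000, 1000);  // 5000 ms windows slid by 1000 ms, to be
    //                               // combined with the WindowDataStream
    //                               // reductions or aggregations named above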
    /**
     * Applies an aggregation that gives the minimum of the pojo data stream at
     * the given field expression. A field expression is either the name of a
     * public field or a getter method with parentheses of the
     * {@link DataStream}'s underlying type. A dot can be used to drill down
     * into objects, as in {@code "field1.getInnerField2()" }.
     *
     * @param field
     *            The field expression based on which the aggregation will be
     *            applied.
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> min(String field) {
        return aggregate(ComparableAggregator.getAggregator(field, getOutputType(),
                AggregationType.MIN, false));
    }

    /**
     * Applies an aggregation that gives the maximum of the pojo data stream at
     * the given field expression. A field expression is either the name of a
     * public field or a getter method with parentheses of the
     * {@link DataStream}'s underlying type. A dot can be used to drill down
     * into objects, as in {@code "field1.getInnerField2()" }.
     *
     * @param field
     *            The field expression based on which the aggregation will be
     *            applied.
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> max(String field) {
        return aggregate(ComparableAggregator.getAggregator(field, getOutputType(),
                AggregationType.MAX, false));
    }

    /**
     * Applies an aggregation that gives the minimum element of the pojo data
     * stream by the given field expression. A field expression is either the
     * name of a public field or a getter method with parentheses of the
     * {@link DataStream}'s underlying type. A dot can be used to drill down
     * into objects, as in {@code "field1.getInnerField2()" }.
     *
     * @param field
     *            The field expression based on which the aggregation will be
     *            applied.
     * @param first
     *            If true, then in case of field equality the first object will
     *            be returned
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> minBy(String field, boolean first) {
        return aggregate(ComparableAggregator.getAggregator(field, getOutputType(),
                AggregationType.MINBY, first));
    }

    /**
     * Applies an aggregation that gives the maximum element of the pojo data
     * stream by the given field expression. A field expression is either the
     * name of a public field or a getter method with parentheses of the
     * {@link DataStream}'s underlying type. A dot can be used to drill down
     * into objects, as in {@code "field1.getInnerField2()" }.
     *
     * @param field
     *            The field expression based on which the aggregation will be
     *            applied.
     * @param first
     *            If true, then in case of field equality the first object will
     *            be returned
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> maxBy(String field, boolean first) {
        return aggregate(ComparableAggregator.getAggregator(field, getOutputType(),
                AggregationType.MAXBY, first));
    }

    /**
     * Applies an aggregation that gives the current element with the minimum
     * value at the given position. If more elements have the minimum value at
     * the given position, the operator returns the first one by default.
     *
     * @param positionToMinBy
     *            The position in the data point to minimize
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> minBy(int positionToMinBy) {
        return this.minBy(positionToMinBy, true);
    }
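    // Note on the tie-breaking flag used by the minBy/maxBy variants: for
    // elements with equal extremal values at the given position,
    // maxBy(0, true) emits the first such element seen, while
    // maxBy(0, false) emits the most recent one.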
    /**
     * Applies an aggregation that gives the current element with the minimum
     * value at the given position. If more elements have the minimum value at
     * the given position, the operator returns either the first or the last
     * one, depending on the parameter set.
     *
     * @param positionToMinBy
     *            The position in the data point to minimize
     * @param first
     *            If true, then the operator returns the first element with the
     *            minimal value, otherwise it returns the last
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> minBy(int positionToMinBy, boolean first) {
        checkFieldRange(positionToMinBy);
        return aggregate(ComparableAggregator.getAggregator(positionToMinBy, getOutputType(),
                AggregationType.MINBY, first));
    }

    /**
     * Syntactic sugar for min(0).
     *
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> min() {
        return min(0);
    }

    /**
     * Applies an aggregation that gives the maximum of the data stream at the
     * given position.
     *
     * @param positionToMax
     *            The position in the data point to maximize
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> max(int positionToMax) {
        checkFieldRange(positionToMax);
        return aggregate(ComparableAggregator.getAggregator(positionToMax, getOutputType(),
                AggregationType.MAX));
    }

    /**
     * Applies an aggregation that gives the current element with the maximum
     * value at the given position. If more elements have the maximum value at
     * the given position, the operator returns the first one by default.
     *
     * @param positionToMaxBy
     *            The position in the data point to maximize
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> maxBy(int positionToMaxBy) {
        return this.maxBy(positionToMaxBy, true);
    }

    /**
     * Applies an aggregation that gives the current element with the maximum
     * value at the given position. If more elements have the maximum value at
     * the given position, the operator returns either the first or the last
     * one, depending on the parameter set.
     *
     * @param positionToMaxBy
     *            The position in the data point to maximize.
     * @param first
     *            If true, then the operator returns the first element with the
     *            maximum value, otherwise it returns the last
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> maxBy(int positionToMaxBy, boolean first) {
        checkFieldRange(positionToMaxBy);
        return aggregate(ComparableAggregator.getAggregator(positionToMaxBy, getOutputType(),
                AggregationType.MAXBY, first));
    }

    /**
     * Syntactic sugar for max(0).
     *
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> max() {
        return max(0);
    }

    /**
     * Applies an aggregation that gives the count of the values.
     *
     * @return The transformed DataStream.
     */
    public SingleOutputStreamOperator<Long, ?> count() {
        TypeWrapper<OUT> inTypeWrapper = outTypeWrapper;
        TypeWrapper<Long> outTypeWrapper = new ObjectTypeWrapper<Long>(Long.valueOf(0));

        return addFunction("counter", null, inTypeWrapper, outTypeWrapper,
                new CounterInvokable<OUT>());
    }

    protected SingleOutputStreamOperator<OUT, ?> aggregate(AggregationFunction<OUT> aggregate) {
        StreamReduceInvokable<OUT> invokable = new StreamReduceInvokable<OUT>(aggregate);

        SingleOutputStreamOperator<OUT, ?> returnStream = addFunction("reduce", aggregate,
                outTypeWrapper, outTypeWrapper, invokable);

        return returnStream;
    }
    /**
     * Applies a Filter transformation on a {@link DataStream}. The
     * transformation calls a {@link FilterFunction} for each element of the
     * DataStream and retains only those elements for which the function
     * returns true. Elements for which the function returns false are filtered
     * out. The user can also extend {@link RichFilterFunction} to gain access
     * to other features provided by the
     * {@link org.apache.flink.api.common.functions.RichFunction} interface.
     *
     * @param filter
     *            The FilterFunction that is called for each element of the
     *            DataStream.
     * @return The filtered DataStream.
     */
    public SingleOutputStreamOperator<OUT, ?> filter(FilterFunction<OUT> filter) {
        FunctionTypeWrapper<OUT> typeWrapper = new FunctionTypeWrapper<OUT>(filter,
                FilterFunction.class, 0);

        return addFunction("filter", filter, typeWrapper, typeWrapper,
                new FilterInvokable<OUT>(filter));
    }

    /**
     * Writes a DataStream to the standard output stream (stdout). For each
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @return The closed DataStream.
     */
    public DataStreamSink<OUT> print() {
        DataStream<OUT> inputStream = this.copy();
        PrintSinkFunction<OUT> printFunction = new PrintSinkFunction<OUT>();
        DataStreamSink<OUT> returnStream = addSink(inputStream, printFunction, outTypeWrapper);

        return returnStream;
    }

    /**
     * Writes a DataStream to the file specified by path in text format. For
     * every element of the DataStream the result of {@link Object#toString()}
     * is written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsText(String path) {
        return writeAsText(this, path, new WriteFormatAsText<OUT>(), 1, null);
    }

    /**
     * Writes a DataStream to the file specified by path in text format. The
     * writing is performed periodically, every millis milliseconds. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param millis
     *            is the file update frequency
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsText(String path, long millis) {
        return writeAsText(this, path, new WriteFormatAsText<OUT>(), millis, null);
    }

    /**
     * Writes a DataStream to the file specified by path in text format. The
     * writing is performed periodically in equally sized batches. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param batchSize
     *            is the size of the batches, i.e. the number of tuples written
     *            to the file at a time
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsText(String path, int batchSize) {
        return writeAsText(this, path, new WriteFormatAsText<OUT>(), batchSize, null);
    }

    /**
     * Writes a DataStream to the file specified by path in text format. The
     * writing is performed periodically, every millis milliseconds. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param millis
     *            is the file update frequency
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            system time.
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsText(String path, long millis, OUT endTuple) {
        return writeAsText(this, path, new WriteFormatAsText<OUT>(), millis, endTuple);
    }
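    // Usage sketch combining filter (above) with print, assuming a
    // DataStream<String> named lines (hypothetical name):
    //
    //   lines.filter(new FilterFunction<String>() {
    //       @Override
    //       public boolean filter(String value) {
    //           return !value.isEmpty();
    //       }
    //   }).print();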
    /**
     * Writes a DataStream to the file specified by path in text format. The
     * writing is performed periodically in equally sized batches. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param batchSize
     *            is the size of the batches, i.e. the number of tuples written
     *            to the file at a time
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            batchSize.
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsText(String path, int batchSize, OUT endTuple) {
        return writeAsText(this, path, new WriteFormatAsText<OUT>(), batchSize, endTuple);
    }

    /**
     * Writes a DataStream to the file specified by path in text format. The
     * writing is performed periodically, every millis milliseconds. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param millis
     *            is the file update frequency
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            system time.
     *
     * @return the data stream constructed
     */
    private DataStreamSink<OUT> writeAsText(DataStream<OUT> inputStream, String path,
            WriteFormatAsText<OUT> format, long millis, OUT endTuple) {
        DataStreamSink<OUT> returnStream = addSink(inputStream,
                new WriteSinkFunctionByMillis<OUT>(path, format, millis, endTuple),
                inputStream.outTypeWrapper);
        jobGraphBuilder.setMutability(returnStream.getId(), false);
        return returnStream;
    }

    /**
     * Writes a DataStream to the file specified by path in text format. The
     * writing is performed periodically in equally sized batches. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param batchSize
     *            is the size of the batches, i.e. the number of tuples written
     *            to the file at a time
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            batchSize.
     *
     * @return the data stream constructed
     */
    private DataStreamSink<OUT> writeAsText(DataStream<OUT> inputStream, String path,
            WriteFormatAsText<OUT> format, int batchSize, OUT endTuple) {
        DataStreamSink<OUT> returnStream = addSink(inputStream,
                new WriteSinkFunctionByBatches<OUT>(path, format, batchSize, endTuple),
                inputStream.outTypeWrapper);
        jobGraphBuilder.setMutability(returnStream.getId(), false);
        return returnStream;
    }

    /**
     * Writes a DataStream to the file specified by path in csv format. For
     * every element of the DataStream the result of {@link Object#toString()}
     * is written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsCsv(String path) {
        return writeAsCsv(this, path, new WriteFormatAsCsv<OUT>(), 1, null);
    }

    /**
     * Writes a DataStream to the file specified by path in csv format. The
     * writing is performed periodically, every millis milliseconds. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param millis
     *            is the file update frequency
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsCsv(String path, long millis) {
        return writeAsCsv(this, path, new WriteFormatAsCsv<OUT>(), millis, null);
    }
    /**
     * Writes a DataStream to the file specified by path in csv format. The
     * writing is performed periodically in equally sized batches. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param batchSize
     *            is the size of the batches, i.e. the number of tuples written
     *            to the file at a time
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsCsv(String path, int batchSize) {
        return writeAsCsv(this, path, new WriteFormatAsCsv<OUT>(), batchSize, null);
    }

    /**
     * Writes a DataStream to the file specified by path in csv format. The
     * writing is performed periodically, every millis milliseconds. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param millis
     *            is the file update frequency
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            system time.
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsCsv(String path, long millis, OUT endTuple) {
        return writeAsCsv(this, path, new WriteFormatAsCsv<OUT>(), millis, endTuple);
    }

    /**
     * Writes a DataStream to the file specified by path in csv format. The
     * writing is performed periodically in equally sized batches. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param batchSize
     *            is the size of the batches, i.e. the number of tuples written
     *            to the file at a time
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            batchSize.
     *
     * @return The closed DataStream
     */
    public DataStreamSink<OUT> writeAsCsv(String path, int batchSize, OUT endTuple) {
        if (this instanceof SingleOutputStreamOperator) {
            ((SingleOutputStreamOperator<?, ?>) this).setMutability(false);
        }
        return writeAsCsv(this, path, new WriteFormatAsCsv<OUT>(), batchSize, endTuple);
    }

    /**
     * Writes a DataStream to the file specified by path in csv format. The
     * writing is performed periodically, every millis milliseconds. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param millis
     *            is the file update frequency
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            system time.
     *
     * @return the data stream constructed
     */
    private DataStreamSink<OUT> writeAsCsv(DataStream<OUT> inputStream, String path,
            WriteFormatAsCsv<OUT> format, long millis, OUT endTuple) {
        DataStreamSink<OUT> returnStream = addSink(inputStream,
                new WriteSinkFunctionByMillis<OUT>(path, format, millis, endTuple),
                inputStream.outTypeWrapper);
        jobGraphBuilder.setMutability(returnStream.getId(), false);
        return returnStream;
    }
    /**
     * Writes a DataStream to the file specified by path in csv format. The
     * writing is performed periodically in equally sized batches. For every
     * element of the DataStream the result of {@link Object#toString()} is
     * written.
     *
     * @param path
     *            is the path to the location where the tuples are written
     * @param batchSize
     *            is the size of the batches, i.e. the number of tuples written
     *            to the file at a time
     * @param endTuple
     *            is a special tuple indicating the end of the stream. If an
     *            endTuple is caught, the last pending batch of tuples will be
     *            immediately appended to the target file regardless of the
     *            batchSize.
     *
     * @return the data stream constructed
     */
    private DataStreamSink<OUT> writeAsCsv(DataStream<OUT> inputStream, String path,
            WriteFormatAsCsv<OUT> format, int batchSize, OUT endTuple) {
        DataStreamSink<OUT> returnStream = addSink(inputStream,
                new WriteSinkFunctionByBatches<OUT>(path, format, batchSize, endTuple),
                inputStream.outTypeWrapper);
        jobGraphBuilder.setMutability(returnStream.getId(), false);
        return returnStream;
    }

    /**
     * Initiates an iterative part of the program that executes multiple times
     * and feeds back data streams. The iterative part needs to be closed by
     * calling {@link IterativeDataStream#closeWith(DataStream)}. The
     * transformation of this IterativeDataStream will be the iteration head.
     * The data stream given to the {@code closeWith(DataStream)} method is the
     * data stream that will be fed back and used as the input for the
     * iteration head. Unlike in batch processing, by default the output of the
     * iteration stream is directed both to the iteration head and to the next
     * component. To direct tuples to the iteration head or the output
     * specifically one can use the {@code split(OutputSelector)} on the
     * iteration tail while referencing the iteration head as 'iterate'.
     * <p>
     * The iteration edge will be partitioned the same way as the first input
     * of the iteration head.
     * <p>
     * By default a DataStream with iteration will never terminate, but the
     * user can use the {@link IterativeDataStream#setMaxWaitTime} call to set
     * a max waiting time for the iteration.
     *
     * @return The iterative data stream created.
     */
    public IterativeDataStream<OUT> iterate() {
        return new IterativeDataStream<OUT>(this);
    }

    protected <R> DataStream<OUT> addIterationSource(String iterationID, long waitTime) {
        DataStream<R> returnStream = new DataStreamSource<R>(environment, "iterationSource", null);

        jobGraphBuilder.addIterationHead(returnStream.getId(), this.getId(), iterationID,
                degreeOfParallelism, waitTime);

        return this.copy();
    }
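    // Usage sketch for iterate (above), assuming a DataStream<Long> named
    // input (hypothetical names): repeatedly decrement values and feed the
    // still positive ones back to the iteration head.
    //
    //   IterativeDataStream<Long> iteration = input.iterate();
    //   DataStream<Long> minusOne = iteration.map(new MapFunction<Long, Long>() {
    //       @Override
    //       public Long map(Long value) {
    //           return value - 1;
    //       }
    //   });
    //   DataStream<Long> feedback = minusOne.filter(new FilterFunction<Long>() {
    //       @Override
    //       public boolean filter(Long value) {
    //           return value > 0;
    //       }
    //   });
    //   iteration.closeWith(feedback);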
    /**
     * Internal function for passing the user defined functions to the JobGraph
     * of the job.
     *
     * @param functionName
     *            name of the function
     * @param function
     *            the user defined function
     * @param inTypeWrapper
     *            type of the function input
     * @param outTypeWrapper
     *            type of the function output
     * @param functionInvokable
     *            the invokable instance that executes the function
     * @param <R>
     *            type of the return stream
     * @return the data stream constructed
     */
    protected <R> SingleOutputStreamOperator<R, ?> addFunction(String functionName,
            final Function function, TypeWrapper<OUT> inTypeWrapper,
            TypeWrapper<R> outTypeWrapper, StreamInvokable<OUT, R> functionInvokable) {
        DataStream<OUT> inputStream = this.copy();
        @SuppressWarnings({ "unchecked", "rawtypes" })
        SingleOutputStreamOperator<R, ?> returnStream = new SingleOutputStreamOperator(
                environment, functionName, outTypeWrapper);

        try {
            jobGraphBuilder.addStreamVertex(returnStream.getId(), functionInvokable,
                    inTypeWrapper, outTypeWrapper, functionName,
                    SerializationUtils.serialize((Serializable) function), degreeOfParallelism);
        } catch (SerializationException e) {
            throw new RuntimeException("Cannot serialize user defined function");
        }

        connectGraph(inputStream, returnStream.getId(), 0);

        if (inputStream instanceof IterativeDataStream) {
            IterativeDataStream<OUT> iterativeStream = (IterativeDataStream<OUT>) inputStream;
            returnStream.addIterationSource(iterativeStream.iterationID.toString(),
                    iterativeStream.waitTime);
        }

        return returnStream;
    }

    /**
     * Internal function for setting the partitioner for the DataStream.
     *
     * @param partitioner
     *            Partitioner to set.
     * @return The modified DataStream.
     */
    protected DataStream<OUT> setConnectionType(StreamPartitioner<OUT> partitioner) {
        DataStream<OUT> returnStream = this.copy();

        for (DataStream<OUT> stream : returnStream.mergedStreams) {
            stream.partitioner = partitioner;
        }

        return returnStream;
    }

    /**
     * Internal function for assembling the underlying
     * {@link org.apache.flink.runtime.jobgraph.JobGraph} of the job. Connects
     * the outputs of the given input stream to the specified output stream
     * given by the outputID.
     *
     * @param inputStream
     *            input data stream
     * @param outputID
     *            ID of the output
     * @param typeNumber
     *            Number of the type (used at co-functions)
     */
    protected <X> void connectGraph(DataStream<X> inputStream, String outputID, int typeNumber) {
        for (DataStream<X> stream : inputStream.mergedStreams) {
            jobGraphBuilder.setEdge(stream.getId(), outputID, stream.partitioner, typeNumber,
                    inputStream.userDefinedNames, inputStream.selectAll);
        }
    }
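    // Usage sketch for addSink (defined below), under the assumption that
    // SinkFunction in this version exposes a single invoke(IN) callback, as
    // suggested by the Javadoc of the sink methods above:
    //
    //   stream.addSink(new SinkFunction<String>() {
    //       @Override
    //       public void invoke(String value) {
    //           System.err.println(value);
    //       }
    //   });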
    /**
     * Adds the given sink to this DataStream. Only streams with sinks added
     * will be executed once the {@link StreamExecutionEnvironment#execute()}
     * method is called.
     *
     * @param sinkFunction
     *            The object containing the sink's invoke function.
     * @return The closed DataStream.
     */
    public DataStreamSink<OUT> addSink(SinkFunction<OUT> sinkFunction) {
        return addSink(this.copy(), sinkFunction);
    }

    private DataStreamSink<OUT> addSink(DataStream<OUT> inputStream,
            SinkFunction<OUT> sinkFunction) {
        return addSink(inputStream, sinkFunction, new FunctionTypeWrapper<OUT>(sinkFunction,
                SinkFunction.class, 0));
    }

    private DataStreamSink<OUT> addSink(DataStream<OUT> inputStream,
            SinkFunction<OUT> sinkFunction, TypeWrapper<OUT> inTypeWrapper) {
        DataStreamSink<OUT> returnStream = new DataStreamSink<OUT>(environment, "sink",
                outTypeWrapper);

        try {
            jobGraphBuilder.addStreamVertex(returnStream.getId(),
                    new SinkInvokable<OUT>(sinkFunction), inTypeWrapper, null, "sink",
                    SerializationUtils.serialize(sinkFunction), degreeOfParallelism);
        } catch (SerializationException e) {
            throw new RuntimeException("Cannot serialize SinkFunction");
        }

        inputStream.connectGraph(inputStream.copy(), returnStream.getId(), 0);

        return returnStream;
    }

    /**
     * Creates a copy of the {@link DataStream}.
     *
     * @return The copy
     */
    protected DataStream<OUT> copy() {
        return new DataStream<OUT>(this);
    }
}