Java tutorial: an Apache Flink Twitter streaming example (TwitterExample.java)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.google.common.collect.Lists;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.PatternSelectFunction;
import org.apache.flink.cep.PatternStream;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.IterativeCondition;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.QueryableStateStream;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.functions.TimestampAssigner;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.connectors.kafka.*;
import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkFixedPartitioner;
import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner;
import org.apache.flink.streaming.connectors.twitter.TwitterSource;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.util.serialization.SerializationSchema;
import org.apache.flink.streaming.util.serialization.SimpleStringSchema;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.flink.util.Collector;

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import javax.json.Json;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.*;

/**
 * Implements the "TwitterStream" program that computes the most frequently used
 * words over a stream of JSON tweet objects.
 *
 * <p>The input is a Tweet stream from a {@link TwitterSource}.</p>
 *
 * <p>Usage: <code>TwitterExample [--output <path>]
 * [--twitter-source.consumerKey <key> --twitter-source.consumerSecret <secret>
 * --twitter-source.token <token> --twitter-source.tokenSecret <tokenSecret>]</code></p>
 *
 * <p>If no parameters are provided, the program is run with default data from
 * {@link TwitterExampleData}.</p>
 *
 * <p>This example shows how to:
 * <ul>
 * <li>acquire external data,
 * <li>use in-line defined functions,
 * <li>handle flattened stream inputs.
 * </ul>
 */
public class TwitterExample {

    // *************************************************************************
    // PROGRAM
    // *************************************************************************

    /**
     * Reads a resource file line by line into a list of words.
     */
    public static Vector<String> initArrayList(String path, ClassLoader cl)
            throws FileNotFoundException, UnsupportedEncodingException {
        InputStream is = cl.getResourceAsStream(path);
        BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
        Vector<String> wordStops = new Vector<>();
        try {
            String line = br.readLine();
            while (line != null) {
                wordStops.add(line);
                line = br.readLine();
            }
            br.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return wordStops;
    }

    /**
     * Keeps only the letters of a sentence; every other character
     * (whitespace, digits, punctuation) is replaced by a single space.
     */
    public static String tokenize(String sentence) {
        StringBuilder s = new StringBuilder();
        for (int i = 0; i < sentence.length(); i++) {
            char c = sentence.charAt(i);
            if (Character.isLetter(c)) {
                s.append(c);
            } else {
                s.append(' ');
            }
        }
        return s.toString();
    }

    public static FlinkKafkaProducer010<String> initKafkaProducer(String host, String topic) {
        FlinkKafkaProducer010<String> myProducer = new FlinkKafkaProducer010<>(
                host,                      // broker list
                topic,                     // target topic
                new SimpleStringSchema()); // serialization schema

        // the following is necessary for at-least-once delivery guarantees
        myProducer.setLogFailuresOnly(false);  // "false" by default
        myProducer.setFlushOnCheckpoint(true); // "false" by default

        return myProducer;
    }

    private static Kafka010JSONTableSink makeTableSink(String theTopic, Properties myProperties) {
        FlinkKafkaPartitioner<Row> partitioner = new FlinkFixedPartitioner<>();
        return new Kafka010JSONTableSink(theTopic, myProperties, partitioner);
    }

    public static void main(String[] args) throws Exception {

        // use the class loader to locate the configuration resource
        ClassLoader classloader = Thread.currentThread().getContextClassLoader();

        // copy the config from the Java resource to a file on disk so that
        // ParameterTool can read it
        File configOnDisk = new File("myFile.properties");
        Files.copy(classloader.getResourceAsStream("myFile.properties"), configOnDisk.toPath(),
                StandardCopyOption.REPLACE_EXISTING);

        final ParameterTool params = ParameterTool.fromPropertiesFile("myFile.properties");
        System.out.println("Usage: TwitterExample [--output <path>] "
                + "[--twitter-source.consumerKey <key> --twitter-source.consumerSecret <secret> "
                + "--twitter-source.token <token> --twitter-source.tokenSecret <tokenSecret>]");

        // set up the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // make parameters available in the web interface
        env.getConfig().setGlobalJobParameters(params);
        env.setParallelism(params.getInt("parallelism", 1));
        //env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        //DataStream<String> streamSource = env.addSource(new TwitterSource("/myFile.properties"));

        System.out.println("These are the parameters: " + params.getProperties());

        // get input data
        DataStream<String> streamSource;
        if (params.has(TwitterSource.CONSUMER_KEY)
                && params.has(TwitterSource.CONSUMER_SECRET)
                && params.has(TwitterSource.TOKEN)
                && params.has(TwitterSource.TOKEN_SECRET)) {

            // find tweets about Trump and Clinton (the track terms are listed in words.txt)
            final Vector<String> theList = initArrayList("words.txt", classloader);

            // FilterEndpoint is defined outside this file; a sketch follows the class below
            TwitterSource twitterA = new TwitterSource(params.getProperties());
            TwitterSourceOpt.FilterEndpoint i = new TwitterSourceOpt.FilterEndpoint(theList);
            twitterA.setCustomEndpointInitializer(i);
            streamSource = env.addSource(twitterA);
        } else {
System.out.println("Executing TwitterStream example with default props."); System.out.println("Use --twitter-source.consumerKey <key> --twitter-source.consumerSecret <secret> " + "--twitter-source.token <token> --twitter-source.tokenSecret <tokenSecret> specify the authentication info."); // get default test text data streamSource = env.fromElements(TwitterExampleData.TEXTS); } final Vector<String> stopWords = initArrayList("stopwords.txt", classloader); DataStream<Tuple2<String, Integer>> tweets = streamSource // selecting English tweets and splitting to (word, 1) .flatMap(new SelectEnglishAndTokenizeFlatMap("text")); //Get locations DataStream<Tuple2<String, Integer>> locations = streamSource .flatMap(new SelectEnglishAndTokenizeFlatMap("location")).keyBy(0).sum(1); tweets.keyBy(0).asQueryableState("Twitter tweets by key"); //Filter out stop words tweets = tweets.filter(new FilterFunction<Tuple2<String, Integer>>() { public boolean filter(Tuple2<String, Integer> value) { String word = value.getField(0); return !stopWords.contains(word); } }); DataStream<Tuple2<String, Integer>> dataWindowKafka = tweets.keyBy(0).timeWindow(Time.seconds(10)).sum(1) .filter(new FilterFunction<Tuple2<String, Integer>>() { public boolean filter(Tuple2<String, Integer> value) { int s = value.getField(1); return s > 10; } }); dataWindowKafka.map(new JSONIZEString()); Pattern<Tuple2<String, Integer>, ?> pattern = Pattern.<Tuple2<String, Integer>>begin("first") .where(new SimpleCondition2(15)).followedBy("increasing").where(new SimpleCondition2(20)) .followedBy("End").where(new IterativeCondition<Tuple2<String, Integer>>() { @Override public boolean filter(Tuple2<String, Integer> stringIntegerTuple2, Context<Tuple2<String, Integer>> context) throws Exception { List<Tuple2<String, Integer>> s = Lists.newArrayList(context.getEventsForPattern("End")); int i = s.size(); int value = stringIntegerTuple2.getField(1); int prevValue = s.get(i - 1).getField(1); return value > prevValue; } }); PatternStream<Tuple2<String, Integer>> patternStream = CEP.pattern(dataWindowKafka.keyBy(0), pattern); DataStream<String> manyMentions = patternStream .select(new PatternSelectFunction<Tuple2<String, Integer>, String>() { @Override public String select(Map<String, List<Tuple2<String, Integer>>> map) throws Exception { System.out.println(map.toString()); return "the word " + map.toString(); } }); System.out.println(manyMentions.writeAsText("alert.txt")); //Temporarily disabled Kafka for testing purposes uncomment the following to re-enable //Initialize a Kafka producer that will be consumed by D3.js and (possibly the database). 
        //FlinkKafkaProducer010 myProducer = initKafkaProducer("localhost:9092", "test");
        //dataWindowKafka.map(new JSONIZEString()).addSink(myProducer);

        // transition to the Table API
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
        // tableEnv.registerDataStream("myTable2", dataWindowKafka, "word, count");
        Table table2 = tableEnv.fromDataStream(dataWindowKafka, "word, count");
        // Confusing
        //System.out.println("This is the table name " + table2.where("count>5"));

        // Using a CSV TableSink
        //TableSink sink = new CsvTableSink("path54.csv", ",");
        //table2.writeToSink(sink);

        Properties kafkaProperties = new Properties();
        kafkaProperties.setProperty("bootstrap.servers", "localhost:9092");
        kafkaProperties.setProperty("group.id", "test");
        kafkaProperties.setProperty("zookeeper.connect", "localhost:2181");

        Kafka010JSONTableSink plotSink = makeTableSink("twitter", kafkaProperties);
        //table2.writeToSink(plotSink);

        env.execute("Twitter Streaming Example");
    }

    // *************************************************************************
    // USER FUNCTIONS
    // *************************************************************************

    /**
     * Serializes a (word, count) pair into a JSON string with a timestamp.
     */
    public static class JSONIZEString implements MapFunction<Tuple2<String, Integer>, String> {

        public String map(Tuple2<String, Integer> in) {
            String jsonString = Json.createObjectBuilder()
                    .add("word", in.f0)
                    .add("count", in.f1)
                    .add("time", System.currentTimeMillis())
                    .build()
                    .toString();
            //System.out.println(jsonString);
            return jsonString;
        }
    }

    /**
     * Deserializes the JSON coming from the Twitter source, keeps only English
     * tweets, and splits the requested field into multiple pairs in the form of
     * "(word, 1)" ({@code Tuple2<String, Integer>}).
     */
    public static class SelectEnglishAndTokenizeFlatMap implements FlatMapFunction<String, Tuple2<String, Integer>> {
        private static final long serialVersionUID = 1L;

        private String fieldName;
        private transient ObjectMapper jsonParser;

        SelectEnglishAndTokenizeFlatMap(String fieldName1) {
            fieldName = fieldName1;
        }

        /**
         * Selects the configured field from the incoming JSON: the tweet text
         * itself, or a field of the "user" object (e.g. "location").
         */
        private StringTokenizer getField(JsonNode jsonNode) {
            if (fieldName.equals("text") && jsonNode.has(fieldName)) {
                return new StringTokenizer(jsonNode.get(fieldName).getValueAsText());
            }
            return new StringTokenizer(jsonNode.get("user").get(fieldName).getValueAsText());
        }

        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
            if (jsonParser == null) {
                jsonParser = new ObjectMapper();
            }
            JsonNode jsonNode = jsonParser.readValue(value, JsonNode.class);
            boolean isEnglish = jsonNode.has("user")
                    && jsonNode.get("user").has("lang")
                    && jsonNode.get("user").get("lang").getValueAsText().equals("en");

            if (isEnglish) {
                StringTokenizer tokenizer = getField(jsonNode);

                // split the message
                while (tokenizer.hasMoreTokens()) {
                    String result = tokenizer.nextToken().replaceAll("\\s*", "").toLowerCase();
                    result = tokenize(result);
                    if (!result.equals("")) {
                        out.collect(new Tuple2<>(result, 1));
                    }
                }
            }
        }
    }
}
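The job above relies on TwitterSourceOpt.FilterEndpoint, which is defined outside this file. The following is only a hypothetical sketch of what that helper might look like, assuming it wraps the hbc StatusesFilterEndpoint that the Flink Twitter connector builds on and that its constructor takes the list of track terms loaded from words.txt:

import com.twitter.hbc.core.endpoint.StatusesFilterEndpoint;
import com.twitter.hbc.core.endpoint.StreamingEndpoint;
import org.apache.flink.streaming.connectors.twitter.TwitterSource;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

public class TwitterSourceOpt {

    /**
     * Hypothetical sketch: restricts the Twitter stream to tweets that contain
     * any of the given track terms.
     */
    public static class FilterEndpoint implements TwitterSource.EndpointInitializer, Serializable {

        private final List<String> trackTerms;

        public FilterEndpoint(List<String> trackTerms) {
            // copy into a plain ArrayList so the initializer stays serializable
            this.trackTerms = new ArrayList<>(trackTerms);
        }

        @Override
        public StreamingEndpoint createEndpoint() {
            StatusesFilterEndpoint endpoint = new StatusesFilterEndpoint();
            endpoint.trackTerms(trackTerms);
            return endpoint;
        }
    }
}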
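Similarly, the SimpleCondition2 used in the CEP pattern is not part of this file. A minimal sketch, assuming it is a SimpleCondition whose constructor argument is the minimum count a (word, count) pair must exceed:

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;

/**
 * Hypothetical sketch: accepts a (word, count) pair once the count exceeds
 * the configured threshold.
 */
public class SimpleCondition2 extends SimpleCondition<Tuple2<String, Integer>> {

    private final int threshold;

    public SimpleCondition2(int threshold) {
        this.threshold = threshold;
    }

    @Override
    public boolean filter(Tuple2<String, Integer> value) throws Exception {
        int count = value.getField(1);
        return count > threshold;
    }
}

With this reading, the pattern first matches a word mentioned more than 15 times in a window, then more than 20 times, and finally keeps matching as long as the count of each new window stays above the previous one.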