List of usage examples for org.apache.spark.api.java.function.VoidFunction2
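VoidFunction2<T1, T2> is Spark's two-argument, no-result Java function interface; its single method is void call(T1 v1, T2 v2) throws Exception. In the examples below it is almost always passed to foreachRDD, which invokes it on the driver once per micro-batch with that batch's RDD and its Time. The following is a minimal, self-contained sketch of that pattern; the host "localhost", port 9999, and the 5 second batch interval are illustrative values and are not taken from any of the examples on this page:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.VoidFunction2;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class VoidFunction2Sketch {
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("VoidFunction2Sketch").setMaster("local[2]");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Read lines from a socket (e.g. started with 'nc -lk 9999')
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // VoidFunction2 receives each micro-batch RDD together with its batch time
        lines.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
            @Override
            public void call(JavaRDD<String> rdd, Time time) throws Exception {
                // Runs on the driver once per batch; collect() is only suitable for small demo batches
                System.out.println("Batch at " + time + ": " + rdd.collect());
            }
        });

        jssc.start();
        jssc.awaitTermination();
    }
}

On Java 8 and later the anonymous class can be replaced by a lambda, e.g. lines.foreachRDD((rdd, time) -> ...), since VoidFunction2 has a single abstract method.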
From source file: StreamMain.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 4) {
        System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }
    // StreamingExamples.setStreamingLogLevels();
    SparkConf sparkConf = new SparkConf().setAppName("StreamMain");

    // Create the streaming context with a 10 second batch interval
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(10000));

    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }

    JavaPairReceiverInputDStream<String, String> messages =
            KafkaUtils.createStream(jssc, args[0], args[1], topicMap);

    JavaDStream<String> lines = messages.map(Tuple2::_2);
    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((i1, i2) -> i1 + i2);

    wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
        @Override
        public void call(JavaPairRDD<String, Integer> stringIntegerJavaPairRDD, Time time) throws Exception {
            List<Tuple2<String, Integer>> output = stringIntegerJavaPairRDD.collect();
            System.out.println("\n>>>RESULTS START\n");
            System.out.println("Output Size : " + output.size());
            output.forEach(t -> System.out.println(t._1() + ":" + t._2()));
            System.out.println("\n>>>RESULTS END\n");
        }
    });
    // wordCounts.foreachRDD();

    jssc.start();
    jssc.awaitTermination();
}
From source file: CollectAndPredict.java
public static void main(String[] args) {
    // StreamingExamples.setStreamingLogLevels();
    // Set logging level if log4j is not configured (override by adding log4j.properties to the classpath)
    if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
        Logger.getRootLogger().setLevel(Level.WARN);
    }

    String OAuthConsumerKey = "QxeynbXmN93DuNiZKkYfZcS2o";
    String OAuthConsumerSecret = "2rAzjerHeW6sIgeDim0A77iGaRn9O683m0DrTbBhaoIuRRq7oU";
    String OAuthAccessToken = "825094416935297025-jCegwA25yj3QxF2rHeJ5hRoVu86AfaY";
    String OAuthAccessTokenSecret = "CwfNmGcWHoL8qvr5dWDdknYM4k4KvAZc7XlGZuYl2DcR8";
    String[] filters = Arrays.copyOfRange(args, 0, args.length);

    // Set the system properties so that the Twitter4j library used by the Twitter stream
    // can use them to generate OAuth credentials
    System.setProperty("twitter4j.oauth.consumerKey", OAuthConsumerKey);
    System.setProperty("twitter4j.oauth.consumerSecret", OAuthConsumerSecret);
    System.setProperty("twitter4j.oauth.accessToken", OAuthAccessToken);
    System.setProperty("twitter4j.oauth.accessTokenSecret", OAuthAccessTokenSecret);

    SparkConf sparkConf = new SparkConf().setAppName("JavaTwitterHashTagJoinSentiments");
    // Check the Spark configuration for a master URL; set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
        sparkConf.setMaster("local[2]");
    }
    SparkSession spark = SparkSession.builder().appName("teste2").config(sparkConf).getOrCreate();
    JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(spark.sparkContext()),
            Seconds.apply(30));

    TokenizerFactory tokFactory = TwitterTokenizerFactory.getTokFactory();
    NaiveBayesModel model = NaiveBayesModel.load(spark.sparkContext(), "Docker/myNaiveBayesModel");
    HashingTF hashingTF = new HashingTF(1000);

    JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(jssc, filters);
    JavaDStream<Tweet> statuses = stream.map((Status status) -> new Tweet()
            .addUser(new TwitterUser().addID(status.getUser().getId()).addName(status.getUser().getName())
                    .addLocation(status.getUser().getLocation()).addDateSignin(status.getUser().getCreatedAt())
                    .addCountTweets(status.getUser().getStatusesCount())
                    .addCountFavorites(status.getUser().getFavouritesCount())
                    .addCountFriends(status.getUser().getFriendsCount())
                    .addCountFollowers(status.getUser().getFollowersCount()))
            .addText(status.getText()).addID(status.getId()).addDate(status.getCreatedAt())
            .addLatitude(status.getGeoLocation() != null ? status.getGeoLocation().getLatitude()
                    : Double.MAX_VALUE)
            .addLongitude(status.getGeoLocation() != null ? status.getGeoLocation().getLongitude()
                    : Double.MAX_VALUE));

    statuses.foreachRDD(new VoidFunction2<JavaRDD<Tweet>, Time>() {
        long numTweetsCollected = 0;
        long numTweetsToCollect = 200;

        @Override
        public void call(JavaRDD<Tweet> t1, Time t2) throws Exception {
            List<Tweet> collect = t1.collect();
            long count = collect.size();
            if (count > 0) {
                for (Tweet tweet : collect) {
                    String textoSemUrl = URLRemove.remove(tweet.getText());
                    Vector v = hashingTF.transform(Arrays.asList(tokFactory
                            .tokenizer(textoSemUrl.toCharArray(), 0, textoSemUrl.length()).tokenize()));
                    double predict = model.predict(v);
                    if (predict == 1) {
                        tweet.setClassifier("POSITIVE");
                    } else {
                        tweet.setClassifier("NEGATIVE");
                    }
                }
                ObjectWriter ow = new ObjectMapper().writer().withDefaultPrettyPrinter();
                try {
                    ow.writeValue(new FileOutputStream(
                            new File("Docker/Twitter" + t2.milliseconds() + ".json")), collect);
                } catch (Exception ex) {
                    spark.log().error(ex.getMessage(), ex);
                }
                numTweetsCollected += count;
                spark.log().info("collected: " + numTweetsCollected + " tweets");
                if (numTweetsCollected > numTweetsToCollect) {
                    System.exit(0);
                }
            }
        }
    });

    // statuses.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
    //     long numTweetsCollected = 0;
    //     long numTweetsToCollect = 200;
    //
    //     @Override
    //     public void call(JavaRDD<String> rdd, Time time) throws Exception {
    //         long count = rdd.count();
    //         if (count > 0) {
    //             JavaRDD<String> outputRDD = rdd.repartition(10);
    //             outputRDD.saveAsTextFile("/Docker/tweets_" + time.milliseconds());
    //             numTweetsCollected += count;
    //             if (numTweetsCollected > numTweetsToCollect) {
    //                 System.exit(0);
    //             }
    //         }
    //     }
    // });

    // JavaDStream<String> words = stream.flatMap(new FlatMapFunction<Status, String>() {
    //     @Override
    //     public Iterable<String> call(Status t) throws Exception {
    //         return Arrays.asList(t.getText().split(" "));
    //     }
    // });
    //
    // JavaDStream<String> hashTags = words.filter(new Function<String, Boolean>() {
    //     @Override
    //     public Boolean call(String word) {
    //         return word.startsWith("#");
    //     }
    // });
    //
    // // Read in the word-sentiment list and create a static RDD from it
    // String wordSentimentFilePath = "streaming-twitter/examples/data/AFINN-111.txt";
    // final JavaPairRDD<String, Double> wordSentiments = jssc.sparkContext()
    //         .textFile(wordSentimentFilePath)
    //         .mapToPair(new PairFunction<String, String, Double>() {
    //             @Override
    //             public Tuple2<String, Double> call(String line) {
    //                 String[] columns = line.split("\t");
    //                 return new Tuple2<>(columns[0], Double.parseDouble(columns[1]));
    //             }
    //         });
    //
    // JavaPairDStream<String, Integer> hashTagCount = hashTags.mapToPair(
    //         new PairFunction<String, String, Integer>() {
    //             @Override
    //             public Tuple2<String, Integer> call(String s) {
    //                 // leave out the # character
    //                 return new Tuple2<>(s.substring(1), 1);
    //             }
    //         });
    //
    // JavaPairDStream<String, Integer> hashTagTotals = hashTagCount.reduceByKeyAndWindow(
    //         new Function2<Integer, Integer, Integer>() {
    //             @Override
    //             public Integer call(Integer a, Integer b) {
    //                 return a + b;
    //             }
    //         }, new Duration(10000));
    //
    // // Determine the hash tags with the highest sentiment values by joining the streaming RDD
    // // with the static RDD inside the transform() method and then multiplying
    // // the frequency of the hash tag by its sentiment value
    // JavaPairDStream<String, Tuple2<Double, Integer>> joinedTuples = hashTagTotals.transformToPair(
    //         new Function<JavaPairRDD<String, Integer>, JavaPairRDD<String, Tuple2<Double, Integer>>>() {
    //             @Override
    //             public JavaPairRDD<String, Tuple2<Double, Integer>> call(
    //                     JavaPairRDD<String, Integer> topicCount) {
    //                 return wordSentiments.join(topicCount);
    //             }
    //         });
    //
    // JavaPairDStream<String, Double> topicHappiness = joinedTuples.mapToPair(
    //         new PairFunction<Tuple2<String, Tuple2<Double, Integer>>, String, Double>() {
    //             @Override
    //             public Tuple2<String, Double> call(Tuple2<String, Tuple2<Double, Integer>> topicAndTuplePair) {
    //                 Tuple2<Double, Integer> happinessAndCount = topicAndTuplePair._2();
    //                 return new Tuple2<>(topicAndTuplePair._1(),
    //                         happinessAndCount._1() * happinessAndCount._2());
    //             }
    //         });
    //
    // JavaPairDStream<Double, String> happinessTopicPairs = topicHappiness.mapToPair(
    //         new PairFunction<Tuple2<String, Double>, Double, String>() {
    //             @Override
    //             public Tuple2<Double, String> call(Tuple2<String, Double> topicHappiness) {
    //                 return new Tuple2<>(topicHappiness._2(), topicHappiness._1());
    //             }
    //         });
    //
    // JavaPairDStream<Double, String> happiest10 = happinessTopicPairs.transformToPair(
    //         new Function<JavaPairRDD<Double, String>, JavaPairRDD<Double, String>>() {
    //             @Override
    //             public JavaPairRDD<Double, String> call(
    //                     JavaPairRDD<Double, String> happinessAndTopics) {
    //                 return happinessAndTopics.sortByKey(false);
    //             }
    //         });
    //
    // // Print hash tags with the most positive sentiment values
    // happiest10.foreachRDD(new VoidFunction<JavaPairRDD<Double, String>>() {
    //     @Override
    //     public void call(JavaPairRDD<Double, String> happinessTopicPairs) {
    //         List<Tuple2<Double, String>> topList = happinessTopicPairs.take(10);
    //         System.out.println(
    //                 String.format("\nHappiest topics in last 10 seconds (%s total):",
    //                         happinessTopicPairs.count()));
    //         for (Tuple2<Double, String> pair : topList) {
    //             System.out.println(
    //                     String.format("%s (%s happiness)", pair._2(), pair._1()));
    //         }
    //     }
    // });

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
From source file: cn.com.warlock.streaming.JavaRecoverableNetworkWordCount.java
License: Apache License
private static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {
    // If this is printed, a fresh StreamingContext is being created instead of being
    // recovered from the checkpoint directory
    System.out.println("Creating new context");
    SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    ssc.checkpoint(checkpointDirectory);

    // Create a socket stream on ip:port
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
        @Override
        public void call(JavaPairRDD<String, Integer> rdd, Time time) throws IOException {
            List<Tuple2<String, Integer>> list = rdd.collect();
            Jedis jedis = null;
            try {
                jedis = JedisPoolHolder.getInstance().getResource();
                for (Tuple2<String, Integer> tuple2 : list) {
                    // Increment each word count in Redis, prefixing the key with "word_"
                    jedis.incrBy("word_" + tuple2._1(), tuple2._2());
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (jedis != null) {
                    jedis.close();
                }
            }
        }
    });
    return ssc;
}
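The JedisPoolHolder used above is a project-specific helper that is not shown in this listing. The following is a minimal sketch of what such a holder might look like, assuming it simply wraps a shared JedisPool; the class shape and the Redis host and port are placeholder assumptions, not the project's actual code:

import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

// Hypothetical sketch of the JedisPoolHolder helper referenced above; the real class may differ.
// getInstance() returns a pool whose getResource() hands out Jedis connections.
public final class JedisPoolHolder {
    // Placeholder Redis endpoint; adjust to the actual deployment
    private static final JedisPool POOL = new JedisPool(new JedisPoolConfig(), "localhost", 6379);

    private JedisPoolHolder() {
    }

    public static JedisPool getInstance() {
        return POOL;
    }
}

With this shape, the finally block in the example returns the borrowed connection to the pool via jedis.close().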
From source file: cn.com.warlock.streaming.JavaSqlNetworkWordCount.java
License: Apache License
public static void main(String[] args) {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }
    String hostname = args[0];
    Integer port = Integer.valueOf(args[1]);

    SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    // Create a socket stream on hostname:port
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(hostname, port,
            StorageLevels.MEMORY_AND_DISK_SER);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
        }
    });

    // Convert RDDs of the words DStream to DataFrames and run a SQL query
    words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
        @Override
        public void call(JavaRDD<String> rdd, Time time) throws Exception {
            // Get the singleton SQLContext instance
            SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context());

            // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
            JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
                @Override
                public JavaRecord call(String word) {
                    JavaRecord record = new JavaRecord();
                    record.setWord(word);
                    return record;
                }
            });
            DataFrame wordsDataFrame = sqlContext.createDataFrame(rowRDD, JavaRecord.class);

            // Register the DataFrame as a temporary table
            wordsDataFrame.registerTempTable("words");

            // Count the words with SQL and print the result
            DataFrame wordCountsDataFrame = sqlContext
                    .sql("select word, count(*) as total from words group by word");
            System.out.println("========= " + time + "=========");
            wordCountsDataFrame.show();
        }
    });

    ssc.start();
    ssc.awaitTermination();
}
From source file: com.andado.spark.examples.streaming.JavaWordBlacklist.java
License: Apache License
private static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory,
        String outputPath) {
    // If you do not see this printed, that means the StreamingContext has been loaded
    // from the checkpoint
    System.out.println("Creating new context");
    final File outputFile = new File(outputPath);
    if (outputFile.exists()) {
        outputFile.delete();
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");

    // Create the context with a 1 second batch size
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
    ssc.checkpoint(checkpointDirectory);

    // Create a socket stream on target ip:port and count the
    // words in the input stream of \n delimited text (e.g. generated by 'nc')
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(ip, port);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
        @Override
        public void call(JavaPairRDD<String, Integer> rdd, Time time) throws IOException {
            // Get or register the blacklist Broadcast
            final Broadcast<List<String>> blacklist = JavaWordBlacklist
                    .getInstance(new JavaSparkContext(rdd.context()));
            // Get or register the droppedWordsCounter Accumulator
            final LongAccumulator droppedWordsCounter = JavaDroppedWordsCounter
                    .getInstance(new JavaSparkContext(rdd.context()));
            // Use the blacklist to drop words and the droppedWordsCounter to count them
            String counts = rdd.filter(new Function<Tuple2<String, Integer>, Boolean>() {
                @Override
                public Boolean call(Tuple2<String, Integer> wordCount) {
                    if (blacklist.value().contains(wordCount._1())) {
                        droppedWordsCounter.add(wordCount._2());
                        return false;
                    } else {
                        return true;
                    }
                }
            }).collect().toString();
            String output = "Counts at time " + time + " " + counts;
            System.out.println(output);
            System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally");
            System.out.println("Appending to " + outputFile.getAbsolutePath());
            Files.append(output + "\n", outputFile, Charset.defaultCharset());
        }
    });
    return ssc;
}
From source file: com.andado.spark.examples.streaming.JavaSqlNetworkWordCount.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }
    // StreamingExamples.setStreamingLogLevels();

    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    // Create a JavaReceiverInputDStream on target ip:port and count the
    // words in the input stream of \n delimited text (e.g. generated by 'nc').
    // Note that no duplication in storage level is only suitable for running locally;
    // replication is necessary in a distributed scenario for fault tolerance.
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(args[0], Integer.parseInt(args[1]),
            StorageLevels.MEMORY_AND_DISK_SER);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });

    // Convert RDDs of the words DStream to DataFrames and run a SQL query
    words.foreachRDD(new VoidFunction2<JavaRDD<String>, Time>() {
        @Override
        public void call(JavaRDD<String> rdd, Time time) {
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

            // Convert JavaRDD[String] to JavaRDD[bean class] to DataFrame
            JavaRDD<JavaRecord> rowRDD = rdd.map(new Function<String, JavaRecord>() {
                @Override
                public JavaRecord call(String word) {
                    JavaRecord record = new JavaRecord();
                    record.setWord(word);
                    return record;
                }
            });
            Dataset<Row> wordsDataFrame = spark.createDataFrame(rowRDD, JavaRecord.class);

            // Create a temporary view using the DataFrame
            wordsDataFrame.createOrReplaceTempView("words");

            // Do a word count on the table using SQL and print it
            Dataset<Row> wordCountsDataFrame = spark
                    .sql("select word, count(*) as total from words group by word");
            System.out.println("========= " + time + "=========");
            wordCountsDataFrame.show();
        }
    });

    ssc.start();
    ssc.awaitTermination();
}
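JavaSparkSessionSingleton is not part of Spark's public API; it is a small helper defined alongside the upstream example. The following is a sketch of a lazily initialized singleton matching the getInstance(SparkConf) call used above; treat it as an illustration rather than the file's exact code:

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

// Sketch of the lazily instantiated SparkSession singleton assumed by the example above
class JavaSparkSessionSingleton {
    private static SparkSession instance = null;

    public static SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession.builder().config(sparkConf).getOrCreate();
        }
        return instance;
    }
}

Caching the SparkSession this way avoids rebuilding it for every micro-batch handled inside foreachRDD.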
From source file: io.hops.examples.spark.kafka.StreamingExample.java
License: Apache License
public static void main(final String[] args) throws Exception {
    if (args.length < 1) {
        LOG.log(Level.SEVERE,
                "Usage: StreamingExample <type> <sink> <topics> \n"
                        + " <type> type of kafka process (producer|consumer).\n"
                        + " <sink> location in hdfs to append streaming output.\n\n");
        System.exit(1);
    }
    final String type = args[0];

    // Create the context with a 2 second batch interval
    Set<String> topicsSet = new HashSet<>(Hops.getTopics());
    SparkConf sparkConf = new SparkConf().setAppName("StreamingExample");
    final List<HopsProducer> sparkProducers = new ArrayList<>();

    if (!Strings.isNullOrEmpty(type) && type.equalsIgnoreCase("producer")) {
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        // Create a producer for each topic
        for (final String topic : topicsSet) {
            new Thread() {
                @Override
                public void run() {
                    try {
                        SparkProducer sparkProducer = Hops.getSparkProducer(topic);
                        sparkProducers.add(sparkProducer);
                        Map<String, String> message = new HashMap<>();
                        int i = 0;
                        // Produce Kafka messages to the topic
                        while (true) {
                            message.put("platform", "HopsWorks");
                            message.put("program", "SparkKafka-" + topic + "-" + i);
                            sparkProducer.produce(message);
                            Thread.sleep(1000);
                            i++;
                        }
                    } catch (SchemaNotFoundException | CredentialsNotFoundException | InterruptedException ex) {
                        Logger.getLogger(StreamingExample.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }.start();
        }
        // Keep the application running
        Hops.shutdownGracefully(jsc);
    } else {
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
        // Use the applicationId for the sink folder
        final String appId = jssc.sparkContext().getConf().getAppId();
        // Get consumer groups
        List<String> consumerGroups = Hops.getConsumerGroups();
        SparkConsumer consumer = Hops.getSparkConsumer(jssc, topicsSet);

        // Create a direct Kafka stream with the topics
        JavaInputDStream<ConsumerRecord<String, byte[]>> messages = consumer.createDirectStream();

        // Get the schema for which to consume messages
        final StringBuilder line = new StringBuilder();

        // Get the lines, split them into words, count the words and print
        JavaDStream<String> lines = messages.map(new Function<ConsumerRecord<String, byte[]>, String>() {
            @Override
            public String call(ConsumerRecord<String, byte[]> record) throws SchemaNotFoundException {
                line.setLength(0);
                // Parse the schema and generate the Avro record.
                // This example uses a single schema, so take the first record of the
                // recordInjections map; otherwise use recordInjections.get("topic")
                GenericRecord genericRecord = recordInjections.entrySet().iterator().next().getValue()
                        .invert(record.value()).get();
                line.append(((Utf8) genericRecord.get("platform")).toString()).append(" ")
                        .append(((Utf8) genericRecord.get("program")).toString());
                return line.toString();
            }
        });
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String x) {
                return Arrays.asList(SPACE.split(x)).iterator();
            }
        });
        JavaPairDStream<String, Integer> wordCounts = words
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<>(s, 1);
                    }
                }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) {
                        return i1 + i2;
                    }
                });
        wordCounts.print();

        /*
         * Based on Spark design patterns:
         * http://spark.apache.org/docs/latest/streaming-programming-guide.html#output-operations-on-dstreams
         */
        wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
            @Override
            public void call(JavaPairRDD<String, Integer> rdd, Time time) throws Exception {
                // Keep the latest microbatch output in the file
                rdd.repartition(1).saveAsHadoopFile(args[1] + "-" + appId, String.class, String.class,
                        TextOutputFormat.class);
            }
        });

        /*
         * Enable this to get all the streaming outputs. It creates a folder for
         * every microbatch slot.
         * wordCounts.saveAsHadoopFiles(args[1], "txt", String.class,
         *     String.class, (Class) TextOutputFormat.class);
         */

        // Start the computation
        jssc.start();
        Hops.shutdownGracefully(jssc);
    }
    for (HopsProducer hopsProducer : sparkProducers) {
        hopsProducer.close();
    }
}
From source file: io.hops.examples.spark.kafka.StreamingKafkaElastic.java
License: Apache License
public static void main(final String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(10));
    // Use the applicationId for the sink folder
    final String appId = jssc.sparkContext().getConf().getAppId();
    SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();

    // Get consumer groups
    Properties props = new Properties();
    props.put("value.deserializer", StringDeserializer.class.getName());
    props.put("client.id", Hops.getJobName());
    SparkConsumer consumer = Hops.getSparkConsumer(jssc, props);

    // Store processed offsets
    // Create a direct Kafka stream with the topics
    JavaInputDStream<ConsumerRecord<String, String>> messages = consumer.createDirectStream();

    // Convert each line to JSON and then to a LogEntryFilebeat bean
    JavaDStream<LogEntryFilebeat> logEntries = messages
            .map(new Function<ConsumerRecord<String, String>, JSONObject>() {
                @Override
                public JSONObject call(ConsumerRecord<String, String> record) throws Exception {
                    return parser(args[0], record.value(), appId);
                }
            }).map(new Function<JSONObject, LogEntryFilebeat>() {
                @Override
                public LogEntryFilebeat call(JSONObject json) throws Exception {
                    LogEntryFilebeat logEntry = new LogEntryFilebeat(
                            json.getString("message").replace("\n\t", "\n").replace("\n", "---"),
                            json.getString("priority"), json.getString("logger_name"),
                            json.getString("thread"), json.getString("timestamp"), json.getString("file"));
                    return logEntry;
                }
            });

    logEntries.repartition(1).foreachRDD(new VoidFunction2<JavaRDD<LogEntryFilebeat>, Time>() {
        @Override
        public void call(JavaRDD<LogEntryFilebeat> rdd, Time time) throws Exception {
            Dataset<Row> row = sparkSession.createDataFrame(rdd, LogEntryFilebeat.class);
            String dataset = "Resources";
            if (!rdd.isEmpty()) {
                LOG.log(Level.INFO, "hops rdd:{0}", rdd.first().getFile());
                if (rdd.first().getFile().contains("fiona")) {
                    dataset = "Fiona";
                } else if (rdd.first().getFile().contains("shrek")) {
                    dataset = "Shrek";
                }
                DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyyMMdd");
                LocalDate localDate = LocalDate.now();
                row.write().mode(SaveMode.Append).parquet("/Projects/" + Hops.getProjectName() + "/" + dataset
                        + "/Logs-" + dtf.format(localDate));
            }
        }
    });

    /*
     * Enable this to get all the streaming outputs. It creates a folder for
     * every microbatch slot.
     * wordCounts.saveAsHadoopFiles(args[1], "txt", String.class,
     *     String.class, (Class) TextOutputFormat.class);
     */

    // Start the computation
    jssc.start();
    Hops.shutdownGracefully(jssc);
}
From source file: io.hops.examples.spark.kafka.StreamingLogs.java
License: Apache License
public static void main(final String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
    // Use the applicationId for the sink folder
    final String appId = jssc.sparkContext().getConf().getAppId();
    SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();

    // Get consumer groups
    Properties props = new Properties();
    props.put("value.deserializer", StringDeserializer.class.getName());
    props.put("client.id", Hops.getJobName());
    SparkConsumer consumer = Hops.getSparkConsumer(jssc, props);

    // Store processed offsets
    // Create a direct Kafka stream with the topics
    JavaInputDStream<ConsumerRecord<String, String>> messages = consumer.createDirectStream();

    // Convert each line to JSON and then to a NamenodeLogEntry bean
    JavaDStream<NamenodeLogEntry> logEntries = messages
            .map(new Function<ConsumerRecord<String, String>, JSONObject>() {
                @Override
                public JSONObject call(ConsumerRecord<String, String> record)
                        throws SchemaNotFoundException, MalformedURLException, ProtocolException {
                    LOG.log(Level.INFO, "record:{0}", record);
                    return parser(record.value(), appId);
                }
            }).map(new Function<JSONObject, NamenodeLogEntry>() {
                @Override
                public NamenodeLogEntry call(JSONObject json)
                        throws SchemaNotFoundException, MalformedURLException, ProtocolException, IOException {
                    NamenodeLogEntry logEntry = new NamenodeLogEntry(
                            json.getString("message").replace("\n\t", "\n").replace("\n", "---"),
                            json.getString("priority"), json.getString("logger_name"),
                            json.getString("timestamp"), json.getString("file"));
                    LOG.log(Level.INFO, "NamenodeLogEntry:{0}", logEntry);
                    return logEntry;
                }
            });

    // logEntries.print();
    logEntries.foreachRDD(new VoidFunction2<JavaRDD<NamenodeLogEntry>, Time>() {
        @Override
        public void call(JavaRDD<NamenodeLogEntry> rdd, Time time) throws Exception {
            Dataset<Row> row = sparkSession.createDataFrame(rdd, NamenodeLogEntry.class);
            if (!rdd.isEmpty()) {
                row.write().mode(SaveMode.Append)
                        .parquet("/Projects/" + Hops.getProjectName() + "/Resources/LogAnalysis");
            }
        }
    });

    /*
     * Enable this to get all the streaming outputs. It creates a folder for
     * every microbatch slot.
     * wordCounts.saveAsHadoopFiles(args[1], "txt", String.class,
     *     String.class, (Class) TextOutputFormat.class);
     */

    // Start the computation
    jssc.start();
    Hops.shutdownGracefully(jssc);
}
From source file: org.uma.jmetalsp.application.biobjectivetsp.streamingDataSource.StreamingTwitterTSP.java
License: Open Source License
/**
 * Create a MultiobjectiveTSPUpdateData from data that has been generated by the Kafka server,
 * add it into a Map, and update the problem for each element in the Map.
 * @param context
 */
@Override
public void start(JavaStreamingContext context) {
    JavaDStream<Status> tweets = TwitterUtils.createStream(context, twitterAuth, filters);
    tweets.foreachRDD(new VoidFunction2<JavaRDD<Status>, Time>() {
        @Override
        public void call(JavaRDD<Status> statusJavaRDD, Time time) throws Exception {
            statusJavaRDD.foreach(new VoidFunction<Status>() {
                @Override
                public void call(Status status) throws Exception {
                    int type = nextInteger(0, 1);
                    int x = nextInteger(0, 93);
                    int y = nextInteger(0, 93);
                    int value = nextInteger(1, 600);
                    MultiobjectiveTSPUpdateData data = new MultiobjectiveTSPUpdateData(type, x, y, value);
                    problem.update(data);
                }
            });
        }
    });
}