Example usage for org.apache.spark.api.java.function VoidFunction2 VoidFunction2

List of usage examples for org.apache.spark.api.java.function VoidFunction2 VoidFunction2

Introduction

In this page you can find the example usage for org.apache.spark.api.java.function VoidFunction2 VoidFunction2.

Prototype

VoidFunction2

Source Link

Usage

From source file:StreamMain.java

License:Apache License

/**
 * Entry point: consumes the given Kafka topics through a receiver-based
 * stream and prints the per-batch word counts to stdout.
 *
 * Expected arguments: &lt;zkQuorum&gt; &lt;group&gt; &lt;topics&gt; &lt;numThreads&gt;
 */
public static void main(String[] args) throws Exception {
    if (args.length < 4) {
        System.err.println("Usage: JavaKafkaWordCount <zkQuorum> <group> <topics> <numThreads>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("StreamMain");
    // Micro-batch interval of 10 seconds (10000 ms).
    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(10000));

    // Every requested topic gets the same number of receiver threads.
    int threadsPerTopic = Integer.parseInt(args[3]);
    Map<String, Integer> topicThreadMap = new HashMap<>();
    for (String topic : args[2].split(",")) {
        topicThreadMap.put(topic, threadsPerTopic);
    }

    JavaPairReceiverInputDStream<String, String> kafkaStream = KafkaUtils.createStream(streamingContext,
            args[0], args[1], topicThreadMap);

    // Keep only the message payloads, tokenize them, and count words per batch.
    JavaDStream<String> messageBodies = kafkaStream.map(Tuple2::_2);
    JavaDStream<String> tokens = messageBodies.flatMap(line -> Arrays.asList(SPACE.split(line)).iterator());
    JavaPairDStream<String, Integer> counts = tokens.mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey(Integer::sum);

    // Collect each micro-batch to the driver and print it.
    counts.foreachRDD((batch, batchTime) -> {
        List<Tuple2<String, Integer>> collected = batch.collect();

        System.out.println("\n>>>RESULTS START\n");
        System.out.println("Output Size : " + collected.size());
        collected.forEach(pair -> System.out.println(pair._1() + ":" + pair._2()));
        System.out.println("\n>>>RESULTS END\n");
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}

From source file:CollectAndPredict.java

/**
 * Collects tweets matching the given filter terms from the Twitter streaming
 * API, classifies each tweet's sentiment (POSITIVE/NEGATIVE) with a
 * pre-trained Naive Bayes model, and writes every 30-second micro-batch to a
 * pretty-printed JSON file under Docker/. The JVM exits once more than 200
 * tweets have been collected.
 *
 * @param args filter terms forwarded to the Twitter stream
 */
public static void main(String[] args) {

    // Set logging level if log4j not configured (override by adding log4j.properties to classpath)
    if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
        Logger.getRootLogger().setLevel(Level.WARN);
    }

    // SECURITY NOTE(review): these OAuth credentials were committed to source
    // control and must be treated as compromised — rotate them with Twitter.
    // Environment variables now take precedence; the literals remain only as a
    // backward-compatible fallback.
    String OAuthConsumerKey = envOrDefault("TWITTER_CONSUMER_KEY", "QxeynbXmN93DuNiZKkYfZcS2o");
    String OAuthConsumerSecret = envOrDefault("TWITTER_CONSUMER_SECRET",
            "2rAzjerHeW6sIgeDim0A77iGaRn9O683m0DrTbBhaoIuRRq7oU");
    String OAuthAccessToken = envOrDefault("TWITTER_ACCESS_TOKEN",
            "825094416935297025-jCegwA25yj3QxF2rHeJ5hRoVu86AfaY");
    String OAuthAccessTokenSecret = envOrDefault("TWITTER_ACCESS_TOKEN_SECRET",
            "CwfNmGcWHoL8qvr5dWDdknYM4k4KvAZc7XlGZuYl2DcR8");
    // Arrays.copyOfRange(args, 0, args.length) was an identity copy; clone() is clearer.
    String[] filters = args.clone();

    // Set the system properties so that the Twitter4j library used by the
    // Twitter stream can use them to generate OAuth credentials.
    System.setProperty("twitter4j.oauth.consumerKey", OAuthConsumerKey);
    System.setProperty("twitter4j.oauth.consumerSecret", OAuthConsumerSecret);
    System.setProperty("twitter4j.oauth.accessToken", OAuthAccessToken);
    System.setProperty("twitter4j.oauth.accessTokenSecret", OAuthAccessTokenSecret);

    SparkConf sparkConf = new SparkConf().setAppName("JavaTwitterHashTagJoinSentiments");

    // Default to a local master when none is configured (e.g. running from an IDE).
    if (!sparkConf.contains("spark.master")) {
        sparkConf.setMaster("local[2]");
    }
    SparkSession spark = SparkSession.builder().appName("teste2").config(sparkConf).getOrCreate();
    // 30-second micro-batches.
    JavaStreamingContext jssc = new JavaStreamingContext(new JavaSparkContext(spark.sparkContext()),
            Seconds.apply(30));

    TokenizerFactory tokFactory = TwitterTokenizerFactory.getTokFactory();

    NaiveBayesModel model = NaiveBayesModel.load(spark.sparkContext(), "Docker/myNaiveBayesModel");
    HashingTF hashingTF = new HashingTF(1000); // 1000-dimensional term-frequency features
    JavaReceiverInputDStream<Status> stream = TwitterUtils.createStream(jssc, filters);
    // Map each raw Status into the serializable Tweet bean; Double.MAX_VALUE
    // marks a missing geo-location.
    JavaDStream<Tweet> statuses = stream.map((Status status) -> new Tweet()
            .addUser(new TwitterUser().addID(status.getUser().getId()).addName(status.getUser().getName())
                    .addLocation(status.getUser().getLocation()).addDateSignin(status.getUser().getCreatedAt())
                    .addCountTweets(status.getUser().getStatusesCount())
                    .addCountFavorites(status.getUser().getFavouritesCount())
                    .addCountFriends(status.getUser().getFriendsCount())
                    .addCountFollowers(status.getUser().getFollowersCount()))
            .addText(status.getText()).addID(status.getId()).addDate(status.getCreatedAt())
            .addLatitude(
                    status.getGeoLocation() != null ? status.getGeoLocation().getLatitude() : Double.MAX_VALUE)
            .addLongitude(status.getGeoLocation() != null ? status.getGeoLocation().getLongitude()
                    : Double.MAX_VALUE));

    // Anonymous class (not a lambda) because it carries mutable counter state,
    // which is safe here: foreachRDD runs on the driver.
    statuses.foreachRDD(new VoidFunction2<JavaRDD<Tweet>, Time>() {
        long numTweetsCollected = 0;
        long numTweetsToCollect = 200;

        @Override
        public void call(JavaRDD<Tweet> tweets, Time time) throws Exception {
            List<Tweet> collect = tweets.collect();

            long count = collect.size();
            if (count > 0) {
                // Classify each tweet: prediction 1 -> POSITIVE, otherwise NEGATIVE.
                for (Tweet tweet : collect) {
                    String textoSemUrl = URLRemove.remove(tweet.getText());
                    Vector v = hashingTF.transform(Arrays.asList(tokFactory
                            .tokenizer(textoSemUrl.toCharArray(), 0, textoSemUrl.length()).tokenize()));
                    double predict = model.predict(v);
                    if (predict == 1) {
                        tweet.setClassifier("POSITIVE");
                    } else {
                        tweet.setClassifier("NEGATIVE");
                    }
                }
                // Persist the classified batch as pretty-printed JSON.
                // try-with-resources fixes the previously-unclosed stream.
                ObjectWriter ow = new ObjectMapper().writer().withDefaultPrettyPrinter();
                try (FileOutputStream out = new FileOutputStream(
                        new File("Docker/Twitter" + time.milliseconds() + ".json"))) {
                    ow.writeValue(out, collect);
                } catch (Exception ex) {
                    spark.log().error(ex.getMessage(), ex);
                }
                numTweetsCollected += count;
                spark.log().info("coletou :" + numTweetsCollected + " tweets");
                // Stop the whole JVM once enough tweets were gathered.
                if (numTweetsCollected > numTweetsToCollect) {
                    System.exit(0);
                }
            }
        }
    });

    jssc.start();

    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // Restore the interrupt flag instead of swallowing the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}

/**
 * Returns the named environment variable's value when it is set and non-empty,
 * otherwise the supplied fallback.
 */
private static String envOrDefault(String name, String fallback) {
    String value = System.getenv(name);
    return (value == null || value.isEmpty()) ? fallback : value;
}

From source file:cn.com.warlock.streaming.JavaRecoverableNetworkWordCount.java

License:Apache License

/**
 * Builds a fresh JavaStreamingContext (used when no checkpoint can be
 * restored): reads lines from a socket, counts words in 10-second batches,
 * and accumulates the totals in Redis under keys prefixed with "word_".
 */
private static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory) {

    // Only printed when a brand-new context is created, i.e. not restored
    // from the checkpoint directory.
    System.out.println("Creating new context");

    SparkConf conf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
    JavaStreamingContext context = new JavaStreamingContext(conf, Durations.seconds(10));
    context.checkpoint(checkpointDirectory);

    // Socket text stream -> words -> (word, 1) -> summed counts per batch.
    JavaReceiverInputDStream<String> lines = context.socketTextStream(ip, port);
    JavaDStream<String> tokens = lines.flatMap(line -> Lists.newArrayList(SPACE.split(line)));
    JavaPairDStream<String, Integer> wordCounts = tokens.mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey((a, b) -> a + b);

    wordCounts.foreachRDD((rdd, batchTime) -> {
        List<Tuple2<String, Integer>> batch = rdd.collect();
        Jedis connection = null;
        try {
            connection = JedisPoolHolder.getInstance().getResource();
            // Increment each word's running total in Redis under a "word_" prefix.
            for (Tuple2<String, Integer> entry : batch) {
                connection.incrBy("word_" + entry._1(), entry._2());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.close();
            }
        }
    });

    return context;
}

From source file:cn.com.warlock.streaming.JavaSqlNetworkWordCount.java

License:Apache License

/**
 * Streams newline-delimited text from a socket, splits it into words, and for
 * every 1-second batch runs a SQL word count over a temporary "words" table,
 * printing the resulting DataFrame.
 *
 * Expected arguments: &lt;hostname&gt; &lt;port&gt;
 */
public static void main(String[] args) {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }

    String hostname = args[0];
    Integer port = Integer.valueOf(args[1]);

    // One-second batch interval.
    SparkConf conf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(1));

    JavaReceiverInputDStream<String> lines = streamingContext.socketTextStream(hostname, port,
            StorageLevels.MEMORY_AND_DISK_SER);
    // Split each incoming line on whitespace.
    JavaDStream<String> words = lines.flatMap(line -> Lists.newArrayList(SPACE.split(line)));

    // For every micro-batch: wrap the words in JavaRecord beans, register them
    // as a temp table, and print the SQL word count.
    words.foreachRDD((rdd, batchTime) -> {
        // Singleton, so the SQLContext is created at most once per JVM.
        SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context());

        JavaRDD<JavaRecord> records = rdd.map(word -> {
            JavaRecord record = new JavaRecord();
            record.setWord(word);
            return record;
        });
        DataFrame wordsDataFrame = sqlContext.createDataFrame(records, JavaRecord.class);

        wordsDataFrame.registerTempTable("words");

        DataFrame wordCountsDataFrame = sqlContext
                .sql("select word, count(*) as total from words group by word");
        System.out.println("========= " + batchTime + "=========");
        wordCountsDataFrame.show();
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}

From source file:com.andado.spark.examples.streaming.JavaWordBlacklist.java

License:Apache License

/**
 * Builds a fresh streaming context: counts words from a socket text stream in
 * 1-second batches, drops words found in a broadcast blacklist (tallying the
 * drops in a shared accumulator), then prints each batch's counts and appends
 * them to the file at outputPath.
 *
 * If the "Creating new context" line is not printed, the StreamingContext was
 * restored from an existing checkpoint instead.
 */
private static JavaStreamingContext createContext(String ip, int port, String checkpointDirectory,
        String outputPath) {

    System.out.println("Creating new context");
    // Start from a clean output file on every fresh run.
    final File outputFile = new File(outputPath);
    if (outputFile.exists()) {
        outputFile.delete();
    }
    SparkConf conf = new SparkConf().setAppName("JavaRecoverableNetworkWordCount");
    // One-second batch interval.
    JavaStreamingContext context = new JavaStreamingContext(conf, Durations.seconds(1));
    context.checkpoint(checkpointDirectory);

    // Socket stream of \n-delimited text (e.g. produced by 'nc') -> words
    // -> (word, 1) -> per-batch counts.
    JavaReceiverInputDStream<String> lines = context.socketTextStream(ip, port);
    JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(SPACE.split(line)).iterator());
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(word -> new Tuple2<>(word, 1))
            .reduceByKey(Integer::sum);

    wordCounts.foreachRDD((rdd, time) -> {
        // Get or register the blacklist Broadcast and the dropped-words Accumulator.
        final Broadcast<List<String>> blacklist = JavaWordBlacklist
                .getInstance(new JavaSparkContext(rdd.context()));
        final LongAccumulator droppedWordsCounter = JavaDroppedWordsCounter
                .getInstance(new JavaSparkContext(rdd.context()));
        // Remove blacklisted words, counting how many occurrences were dropped.
        String counts = rdd.filter(wordCount -> {
            if (blacklist.value().contains(wordCount._1())) {
                droppedWordsCounter.add(wordCount._2());
                return false;
            }
            return true;
        }).collect().toString();
        String output = "Counts at time " + time + " " + counts;
        System.out.println(output);
        System.out.println("Dropped " + droppedWordsCounter.value() + " word(s) totally");
        System.out.println("Appending to " + outputFile.getAbsolutePath());
        Files.append(output + "\n", outputFile, Charset.defaultCharset());
    });

    return context;
}

From source file:com.andado.spark.examples.streaming.JavaSqlNetworkWordCount.java

License:Apache License

/**
 * Word count over a socket text stream using Spark SQL: every 1-second batch
 * is converted into a DataFrame of JavaRecord beans, exposed as a temporary
 * view named "words", and aggregated with a SQL GROUP BY before printing.
 *
 * Expected arguments: &lt;hostname&gt; &lt;port&gt;
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }

    // One-second batch interval.
    SparkConf conf = new SparkConf().setAppName("JavaSqlNetworkWordCount");
    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(1));

    // Receive \n-delimited text (e.g. generated by 'nc') on target ip:port.
    // Unreplicated storage is only acceptable when running locally; replication
    // is needed on a cluster for fault tolerance.
    JavaReceiverInputDStream<String> lines = streamingContext.socketTextStream(args[0],
            Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER);
    JavaDStream<String> words = lines.flatMap(line -> Arrays.asList(SPACE.split(line)).iterator());

    words.foreachRDD((rdd, batchTime) -> {
        // Lazily-created singleton SparkSession shared across batches.
        SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

        // Wrap each word in a JavaRecord bean so it maps onto a DataFrame column.
        JavaRDD<JavaRecord> records = rdd.map(word -> {
            JavaRecord record = new JavaRecord();
            record.setWord(word);
            return record;
        });
        Dataset<Row> wordsDataFrame = spark.createDataFrame(records, JavaRecord.class);

        // Temporary view scoped to the session.
        wordsDataFrame.createOrReplaceTempView("words");

        Dataset<Row> wordCountsDataFrame = spark
                .sql("select word, count(*) as total from words group by word");
        System.out.println("========= " + batchTime + "=========");
        wordCountsDataFrame.show();
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}

From source file:io.hops.examples.spark.kafka.StreamingExample.java

License:Apache License

/**
 * Kafka producer/consumer example for HopsWorks.
 *
 * In "producer" mode, one daemon-less thread per project topic publishes a
 * simple two-field message every second, indefinitely. In any other mode, a
 * streaming consumer reads Avro-encoded records from all project topics,
 * decodes the "platform" and "program" fields into a text line, word-counts
 * each 2-second batch, prints it, and writes the latest batch to HDFS at
 * args[1]-&lt;appId&gt;.
 *
 * Expected arguments: &lt;type&gt; (producer|consumer) and, for consumers,
 * &lt;sink&gt; (HDFS path prefix for the streaming output).
 */
public static void main(final String[] args) throws Exception {
    if (args.length < 1) {
        LOG.log(Level.SEVERE,
                "Usage: StreamingExample <type> <sink> <topics> \n"
                        + "  <type> type of kafka process (producer|consumer).\n"
                        + "  <sink> location in hdfs to append streaming output.\n\n");
        System.exit(1);
    }

    final String type = args[0];
    // Topics are provisioned by HopsWorks, not taken from the command line.
    Set<String> topicsSet = new HashSet<>(Hops.getTopics());
    SparkConf sparkConf = new SparkConf().setAppName("StreamingExample");
    final List<HopsProducer> sparkProducers = new ArrayList<>();

    if (!Strings.isNullOrEmpty(type) && type.equalsIgnoreCase("producer")) {
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        // Create a producer per topic; each thread publishes one message per
        // second in an infinite loop.
        for (final String topic : topicsSet) {
            new Thread() {
                @Override
                public void run() {
                    try {
                        SparkProducer sparkProducer = Hops.getSparkProducer(topic);
                        // NOTE(review): plain ArrayList mutated from multiple
                        // threads — not thread-safe; confirm this is acceptable.
                        sparkProducers.add(sparkProducer);
                        Map<String, String> message = new HashMap<>();
                        int i = 0;
                        // Produce Kafka messages to the topic, one per second.
                        while (true) {
                            message.put("platform", "HopsWorks");
                            message.put("program", "SparkKafka-" + topic + "-" + i);
                            sparkProducer.produce(message);
                            Thread.sleep(1000);
                            i++;
                        }
                    } catch (SchemaNotFoundException | CredentialsNotFoundException | InterruptedException ex) {
                        Logger.getLogger(StreamingExample.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }.start();
        } // Keep the application running until HopsWorks shuts it down.
        Hops.shutdownGracefully(jsc);
    } else {
        // Consumer: create the streaming context with a 2-second batch interval.
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));
        // Use the application id to distinguish this run's sink folder.
        final String appId = jssc.sparkContext().getConf().getAppId();

        // Get consumer groups.
        // NOTE(review): consumerGroups is never used below — dead local?
        List<String> consumerGroups = Hops.getConsumerGroups();
        SparkConsumer consumer = Hops.getSparkConsumer(jssc, topicsSet);
        // Create the direct Kafka stream over the project topics.
        JavaInputDStream<ConsumerRecord<String, byte[]>> messages = consumer.createDirectStream();

        // Shared buffer reused across records by the map function below.
        // NOTE(review): each executor deserializes its own copy of this
        // StringBuilder, but reusing a captured mutable buffer in a distributed
        // map is fragile — confirm it is intentional.
        final StringBuilder line = new StringBuilder();

        // Decode each Avro record into "platform program" text lines.
        JavaDStream<String> lines = messages.map(new Function<ConsumerRecord<String, byte[]>, String>() {
            @Override
            public String call(ConsumerRecord<String, byte[]> record) throws SchemaNotFoundException {
                line.setLength(0);
                // Parse schema and generate Avro record.
                // For this example, we use a single schema so we get the first record
                // of the recordInjections map. Otherwise do
                // recordInjections.get("topic");
                GenericRecord genericRecord = recordInjections.entrySet().iterator().next().getValue()
                        .invert(record.value()).get();
                line.append(((Utf8) genericRecord.get("platform")).toString()).append(" ")
                        .append(((Utf8) genericRecord.get("program")).toString());
                return line.toString();
            }
        });

        // Split each decoded line into words.
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterator<String> call(String x) {
                return Arrays.asList(SPACE.split(x)).iterator();
            }
        });

        // (word, 1) pairs summed per batch.
        JavaPairDStream<String, Integer> wordCounts = words
                .mapToPair(new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<>(s, 1);
                    }
                }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) {
                        return i1 + i2;
                    }
                });

        wordCounts.print();

        /*
         * Based on Spark Design patterns
         * http://spark.apache.org/docs/latest/streaming-programming-guide.html#output-operations-on-dstreams
         */
        wordCounts.foreachRDD(new VoidFunction2<JavaPairRDD<String, Integer>, Time>() {
            @Override
            public void call(JavaPairRDD<String, Integer> rdd, Time time) throws Exception {
                // Keep only the latest micro-batch output in the sink file
                // (each batch overwrites the same path).
                rdd.repartition(1).saveAsHadoopFile(args[1] + "-" + appId, String.class, String.class,
                        TextOutputFormat.class);
            }

        });

        /*
         * Enable this to get all the streaming outputs. It creates a folder for
         * every microbatch slot.
         * ///////////////////////////////////////////////////////////////////////
         * wordCounts.saveAsHadoopFiles(args[1], "txt", String.class,
         * String.class, (Class) TextOutputFormat.class);
         * ///////////////////////////////////////////////////////////////////////
         */
        // Start the computation and block until HopsWorks shuts the job down.
        jssc.start();
        Hops.shutdownGracefully(jssc);
    }
    // Close any producers created above.
    // NOTE(review): producer threads loop forever, so this is only reached
    // after shutdownGracefully returns — verify the intended shutdown order.
    for (HopsProducer hopsProducer : sparkProducers) {
        hopsProducer.close();
    }
}

From source file:io.hops.examples.spark.kafka.StreamingKafkaElastic.java

License:Apache License

/**
 * Consumes Filebeat log lines from Kafka, parses each record into a
 * LogEntryFilebeat bean, and appends every 10-second micro-batch as Parquet
 * under a project dataset chosen from the originating log file name.
 */
public static void main(final String[] args) throws Exception {

    SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
    // 10-second micro-batches.
    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(10));

    // The application id tags records produced by this run.
    final String appId = streamingContext.sparkContext().getConf().getAppId();
    SparkSession session = SparkSession.builder().config(sparkConf).getOrCreate();

    // Consumer configuration handed to the HopsWorks Kafka helper.
    Properties consumerProps = new Properties();
    consumerProps.put("value.deserializer", StringDeserializer.class.getName());
    consumerProps.put("client.id", Hops.getJobName());
    SparkConsumer consumer = Hops.getSparkConsumer(streamingContext, consumerProps);

    // Direct Kafka stream over the configured topics.
    JavaInputDStream<ConsumerRecord<String, String>> messages = consumer.createDirectStream();

    // Kafka record -> parsed JSON -> LogEntryFilebeat bean.
    JavaDStream<LogEntryFilebeat> logEntries = messages
            .map(record -> parser(args[0], record.value(), appId))
            .map(json -> new LogEntryFilebeat(
                    json.getString("message").replace("\n\t", "\n").replace("\n", "---"),
                    json.getString("priority"), json.getString("logger_name"), json.getString("thread"),
                    json.getString("timestamp"), json.getString("file")));

    // Single partition per batch so each batch yields one Parquet file.
    logEntries.repartition(1).foreachRDD((rdd, batchTime) -> {
        Dataset<Row> frame = session.createDataFrame(rdd, LogEntryFilebeat.class);
        String dataset = "Resources";
        if (!rdd.isEmpty()) {
            LOG.log(Level.INFO, "hops rdd:{0}", rdd.first().getFile());
            // Route the batch to a dataset based on the originating log file.
            if (rdd.first().getFile().contains("fiona")) {
                dataset = "Fiona";
            } else if (rdd.first().getFile().contains("shrek")) {
                dataset = "Shrek";
            }

            // Append under a per-day folder, e.g. .../Logs-20240131.
            DateTimeFormatter dateStamp = DateTimeFormatter.ofPattern("yyyyMMdd");
            LocalDate today = LocalDate.now();
            frame.write().mode(SaveMode.Append).parquet("/Projects/" + Hops.getProjectName() + "/" + dataset
                    + "/Logs-" + dateStamp.format(today));
        }
    });

    // Start the computation and block until HopsWorks shuts the job down.
    streamingContext.start();
    Hops.shutdownGracefully(streamingContext);
}

From source file:io.hops.examples.spark.kafka.StreamingLogs.java

License:Apache License

public static void main(final String[] args) throws Exception {

    SparkConf sparkConf = new SparkConf().setAppName(Hops.getJobName());
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    // The application id distinguishes this job's output from other runs.
    final String appId = jssc.sparkContext().getConf().getAppId();
    final SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();

    // Consumer configuration: Kafka record values are deserialized as plain strings.
    Properties props = new Properties();
    props.put("value.deserializer", StringDeserializer.class.getName());
    props.put("client.id", Hops.getJobName());
    SparkConsumer consumer = Hops.getSparkConsumer(jssc, props);

    // Create the direct Kafka stream for the configured topics.
    JavaInputDStream<ConsumerRecord<String, String>> messages = consumer.createDirectStream();

    // Parse each raw record into JSON and then into a typed NamenodeLogEntry.
    // A single map keeps the pipeline to one pass per record; the lambda may
    // throw the same checked exceptions the original anonymous classes declared.
    JavaDStream<NamenodeLogEntry> logEntries = messages.map(record -> {
        LOG.log(Level.INFO, "record:{0}", record);
        JSONObject json = parser(record.value(), appId);
        // Newlines are flattened to "---" so each entry stays a single line
        // when inspected downstream.
        NamenodeLogEntry logEntry = new NamenodeLogEntry(
                json.getString("message").replace("\n\t", "\n").replace("\n", "---"),
                json.getString("priority"), json.getString("logger_name"),
                json.getString("timestamp"), json.getString("file"));
        LOG.log(Level.INFO, "NamenodeLogEntry:{0}", logEntry);
        return logEntry;
    });

    // Persist every non-empty micro-batch as Parquet under the project's
    // Resources/LogAnalysis dataset. The DataFrame is only built when the
    // batch actually contains data (avoids needless work on empty batches).
    logEntries.foreachRDD((rdd, time) -> {
        if (!rdd.isEmpty()) {
            Dataset<Row> row = sparkSession.createDataFrame(rdd, NamenodeLogEntry.class);
            row.write().mode(SaveMode.Append)
                    .parquet("/Projects/" + Hops.getProjectName() + "/Resources/LogAnalysis");
        }
    });

    // Start the computation and block until a graceful shutdown is requested.
    jssc.start();
    Hops.shutdownGracefully(jssc);
}

From source file:org.uma.jmetalsp.application.biobjectivetsp.streamingDataSource.StreamingTwitterTSP.java

License:Open Source License

/**
 * Consumes data generated by the Kafka server, builds a
 * MultiobjectiveTSPUpdateData instance for each element, and uses it to
 * update the problem.
 *
 * @param context the Spark streaming context to attach the stream to
 */
@Override
public void start(JavaStreamingContext context) {
    // Stream of live tweets matching the configured filters.
    JavaDStream<Status> tweets = TwitterUtils.createStream(context, twitterAuth, filters);
    // Every incoming tweet (regardless of content) triggers one randomly
    // generated TSP update that is pushed into the shared problem instance.
    tweets.foreachRDD((statusRdd, batchTime) -> statusRdd.foreach(status -> {
        int updateType = nextInteger(0, 1);
        int cityX = nextInteger(0, 93);
        int cityY = nextInteger(0, 93);
        int cost = nextInteger(1, 600);
        problem.update(new MultiobjectiveTSPUpdateData(updateType, cityX, cityY, cost));
    }));
}