List of usage examples for org.apache.spark.api.java.function.FlatMapFunction
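FlatMapFunction<T, R> is the Java-friendly function interface accepted by JavaRDD.flatMap and JavaDStream.flatMap: each input element maps to zero or more output elements. Note that the signature changed across Spark versions: in Spark 1.x, call(T t) returns Iterable<R>; from Spark 2.0 on, it returns Iterator<R>. Most examples below use the 1.x form; the NetworkWordCount example further down uses the 2.x form. A minimal sketch of both styles (the input RDD "lines" is an assumption, as are the usual java.util imports):

// Spark 1.x: call returns an Iterable<R>
JavaRDD<String> words1x = lines.flatMap(new FlatMapFunction<String, String>() {
    @Override
    public Iterable<String> call(String s) {
        return Arrays.asList(s.split(" "));
    }
});

// Spark 2.x and later: call returns an Iterator<R>, so a lambda works too
JavaRDD<String> words2x = lines.flatMap(s -> Arrays.asList(s.split(" ")).iterator());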
From source file:CacheWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: CacheWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("CacheWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 4);
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });
    // words.cache();
    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    counts.cache();
    // Count twice: the first count() materializes and caches the RDD,
    // the second is served from the cache.
    long num = counts.count();
    num = counts.count();
    // JavaPairRDD<String, Integer> sorted = counts.sortByKey();
    // List<Tuple2<String, Integer>> output = sorted.collect();
    // for (Tuple2<?, ?> tuple : output) {
    //     System.out.println(tuple._1() + ": " + tuple._2());
    // }
    ctx.stop();
}
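Usage note: cache() alone is lazy, so nothing is stored until the first action runs. Without counts.cache(), the second count() would recompute the whole lineage from the text file; with it, the second pass reads the already-materialized partitions.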
From source file:JavaKafkaWordCount_old.java
License:Apache License
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
    sparkConf.setMaster("local[2]");

    // Create the context with a 2 second batch interval
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

    int numThreads = 1;
    String zkQuorum = "localhost:5181";
    String group = "test-consumer-group";
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    topicMap.put("test", numThreads);

    JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(jssc, zkQuorum, group,
            topicMap);

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
}
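Note that KafkaUtils.createStream is the older receiver-based Kafka integration (spark-streaming-kafka 0.8): it tracks offsets in ZooKeeper, which is why a zkQuorum is passed instead of broker addresses. It was later superseded by the receiver-less direct approach (KafkaUtils.createDirectStream), which reads offsets from Kafka itself and gives stronger delivery guarantees without a write-ahead log.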
From source file:SparkJavaWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });
    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
From source file:JavaNetworkWordCount.java
License:Apache License
public static void main(String[] args) {
    // The original listing's usage message ("<app name> <code> ...") contradicted the
    // socketTextStream call below, which parses args[0] as a hostname and args[1] as a
    // port; the argument handling here is reconciled so the example actually runs.
    if (args.length < 3) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port> <refresh rate in seconds>");
        System.exit(1);
    }
    String hostname = args[0];
    int port = Integer.parseInt(args[1]);
    int refreshRateSeconds = Integer.parseInt(args[2]);
    StreamingExamples.setStreamingLogLevels();

    // Create the context with a <refreshRateSeconds> second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(refreshRateSeconds));

    // Create a JavaReceiverInputDStream on the target ip:port and count the words in the
    // input stream of \n-delimited text (e.g. generated by 'nc').
    // Note: a storage level without replication is suitable only when running locally;
    // replication is necessary in a distributed scenario for fault tolerance.
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(hostname, port,
            StorageLevels.MEMORY_AND_DISK_SER);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(DELIMITER.split(x));
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
From source file:JavaCustomReceiver.java
License:Apache License
public static void main(String[] args) throws Exception {
    args = new String[2];
    args[0] = "localhost";
    args[1] = "12344";
    // StreamingExamples.setStreamingLogLevels();
    logger.error("hi error");
    System.out.println("logging started at least");

    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver").setMaster(SPARK_MASTER)
            .setSparkHome("local").setJars(new String[] { "target/SparkProject.0.0.1-SNAPSHOT.jar" });
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));
    JavaReceiverInputDStream<String> lines = ssc.receiverStream(new JavaCustomReceiver("localhost", 12344));
    System.out.println("Received Lines: " + lines.toString());

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        public Iterable<String> call(String x) {
            System.out.println("Flat map processing " + x);
            return Arrays.asList(x.split(" "));
        }
    });
    JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    wordCounts.print(1000);
    ssc.start();
    ssc.awaitTermination(1000 * 10);
}
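The JavaCustomReceiver class itself is not shown in this listing. For context, here is a minimal sketch of what such a receiver presumably looks like, built on Spark Streaming's org.apache.spark.streaming.receiver.Receiver API; the socket-reading details are an assumption modeled on Spark's own custom-receiver example:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.Socket;
import java.nio.charset.StandardCharsets;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.receiver.Receiver;

public class JavaCustomReceiver extends Receiver<String> {
    private final String host;
    private final int port;

    public JavaCustomReceiver(String host, int port) {
        super(StorageLevel.MEMORY_AND_DISK_2());
        this.host = host;
        this.port = port;
    }

    @Override
    public void onStart() {
        // Read lines on a separate thread so onStart() returns immediately.
        new Thread(this::receive).start();
    }

    @Override
    public void onStop() {
        // Nothing to do: the receive thread exits when the socket closes or isStopped() flips.
    }

    private void receive() {
        try (Socket socket = new Socket(host, port);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while (!isStopped() && (line = reader.readLine()) != null) {
                store(line); // hand each line to Spark
            }
            restart("Trying to connect again");
        } catch (Exception e) {
            restart("Error receiving data", e);
        }
    }
}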
From source file:SparkHome.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 2);

    // Alternative: split on commas instead of whitespace
    // JavaRDD<String> counts = lines.flatMap(new FlatMapFunction<String, String>() {
    //     private static final long serialVersionUID = 1L;
    //
    //     public Iterable<String> call(String s) {
    //         return Arrays.asList(s.split("\\s*,\\s*"));
    //     }
    // });

    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });
    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    // Note: counts is built but never used below; only words is written out.
    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    words.saveAsTextFile("C:\\Users\\i308124\\Desktop\\DB_Comparision_Output\\output_new1.txt");

    // List<Tuple2<String, Integer>> output = counts.collect();
    // for (Tuple2<?, ?> tuple : output) {
    //     System.out.println(tuple._1() + ": " + tuple._2());
    // }
    ctx.stop();
}
From source file:SparkHome.java
License:Apache License
public static void mySparkCode() {
    // "localhost" is not a valid Spark master URL; "local[*]" runs in-process.
    JavaSparkContext spark = new JavaSparkContext("local[*]", "sparkwordcount");

    // ====== Using flatMap (RDD of words) ======
    JavaRDD<String> csvData = spark.textFile("C:\\Users\\i308124\\Desktop\\DB_Comparision_Output\\input.txt");
    JavaRDD<String> counts = csvData.flatMap(new FlatMapFunction<String, String>() {
        private static final long serialVersionUID = 1L;

        public Iterable<String> call(String s) {
            return Arrays.asList(s.split("\\s*,\\s*"));
        }
    });
    counts.saveAsTextFile("C:\\Users\\i308124\\Desktop\\DB_Comparision_Output\\output.txt");
}
From source file:SparkClosestPair.java
License:Apache License
public static void closestPairFactory(JavaSparkContext ctx) {
    JavaRDD<String> lines = ctx.textFile("hdfs://master:54310/user/closestpair/cl_1.csv");

    // First pass: find each partition's closest pair, then keep only the points
    // near enough to the partition boundary to matter globally.
    JavaRDD<ClosestPairPoint> points = lines
            .mapPartitions(new FlatMapFunction<Iterator<String>, ClosestPairPoint>() {
                public Iterable<ClosestPairPoint> call(Iterator<String> stringIterator) throws Exception {
                    ArrayList<ClosestPairPoint> pointList = new ArrayList<ClosestPairPoint>();
                    while (stringIterator.hasNext()) {
                        String nextString = stringIterator.next();
                        String[] coords = nextString.split(",");
                        ClosestPairPoint p = new ClosestPairPoint(Double.parseDouble(coords[0]),
                                Double.parseDouble(coords[1]));
                        pointList.add(p);
                    }
                    System.out.println(pointList.size());
                    ClosestPair cP = ClosestPair.findClosestPair(pointList);
                    ClosestPairPoint leftPoint = cP.getLeftPoint();
                    ClosestPairPoint rightPoint = cP.getRightPoint();
                    System.out.println("Closest Pair = (" + leftPoint.getxCoord() + "," + leftPoint.getyCoord()
                            + ") (" + rightPoint.getxCoord() + "," + rightPoint.getyCoord() + ")");
                    double xDiff = leftPoint.getxCoord() - rightPoint.getxCoord();
                    double yDiff = leftPoint.getyCoord() - rightPoint.getyCoord();
                    double delta = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));
                    System.out.println(delta);
                    return ConvexHullBuffer.findBoundaryPoints(pointList, delta, cP);
                }
            });

    List<ClosestPairPoint> output = points.collect();
    System.out.println(output.size());

    // Second pass: solve the reduced problem in a single partition.
    JavaRDD<ClosestPairPoint> finalPoints = ctx.parallelize(output).repartition(1);
    JavaRDD<ClosestPairPoint> closestPair = finalPoints
            .mapPartitions(new FlatMapFunction<Iterator<ClosestPairPoint>, ClosestPairPoint>() {
                public Iterable<ClosestPairPoint> call(Iterator<ClosestPairPoint> pointIterator)
                        throws Exception {
                    ArrayList<ClosestPairPoint> pointList = new ArrayList<ClosestPairPoint>();
                    while (pointIterator.hasNext()) {
                        pointList.add(pointIterator.next());
                    }
                    ClosestPair cP = ClosestPair.findClosestPair(pointList);
                    ArrayList<ClosestPairPoint> pair = new ArrayList<ClosestPairPoint>();
                    pair.add(cP.getLeftPoint());
                    pair.add(cP.getRightPoint());
                    return pair;
                }
            });

    List<ClosestPairPoint> closestPairPoints = closestPair.collect();
    closestPair.saveAsTextFile("hdfs://master:54310/user/output/closestpair");
    for (ClosestPairPoint p : closestPairPoints) {
        System.out.println(p.getxCoord() + "," + p.getyCoord());
    }
    ctx.stop();
}
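This example shows the other place FlatMapFunction appears in the Java RDD API: JavaRDD.mapPartitions takes a FlatMapFunction over the partition's Iterator, so one call sees an entire partition rather than one element. A minimal sketch of the same pattern, assuming a JavaSparkContext ctx and the usual java.util imports (Spark 1.x Iterable signature, as above):

JavaRDD<Integer> nums = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5), 2);
JavaRDD<Integer> partitionSums = nums.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer>() {
    @Override
    public Iterable<Integer> call(Iterator<Integer> it) {
        // One call per partition: fold the whole partition into a single value.
        int sum = 0;
        while (it.hasNext()) {
            sum += it.next();
        }
        return Collections.singletonList(sum);
    }
});
// partitionSums has one element per partition, e.g. [3, 12] here.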
From source file:NetworkWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(3));
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(args[0], Integer.parseInt(args[1]),
            StorageLevels.MEMORY_AND_DISK_SER);

    // Spark 2.x form of FlatMapFunction: call() returns an Iterator, not an Iterable
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
From source file:SparkKMer.java
License:Apache License
public static void main(String[] args) throws Exception {
    // Setup
    SparkConf sparkConf = new SparkConf().setAppName("SparkKMer");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // Argument parsing
    if (args.length < 2) {
        System.err.println("Usage: SparkKMer <accession> <kmer-length>");
        System.exit(1);
    }
    final String acc = args[0];
    final int KMER_LENGTH = Integer.parseInt(args[1]);

    // Check the accession and split the work
    ReadCollection run = gov.nih.nlm.ncbi.ngs.NGS.openReadCollection(acc);
    long numreads = run.getReadCount();

    // Slice the job
    int chunk = 20000; // amount of reads per map operation
    int slices = (int) (numreads / chunk);
    if (slices == 0)
        slices = 1;
    List<LongRange> sub = new ArrayList<LongRange>();
    for (long first = 1; first <= numreads;) {
        long last = first + chunk - 1;
        if (last > numreads)
            last = numreads;
        sub.add(new LongRange(first, last));
        first = last + 1;
    }
    System.err.println("Prepared ranges: \n" + sub);
    JavaRDD<LongRange> jobs = jsc.parallelize(sub, slices);

    // Map each read range to its k-mers
    JavaRDD<String> kmers = jobs.flatMap(new FlatMapFunction<LongRange, String>() {
        ReadCollection run = null; // opened lazily on the worker, not shipped in the closure

        @Override
        public Iterable<String> call(LongRange s) {
            // Executes on task nodes
            List<String> ret = new ArrayList<String>();
            try {
                long first = s.getMinimumLong();
                long last = s.getMaximumLong();
                if (run == null) {
                    run = gov.nih.nlm.ncbi.ngs.NGS.openReadCollection(acc);
                }
                ReadIterator it = run.getReadRange(first, last - first + 1, Read.all);
                while (it.nextRead()) {
                    // iterate through fragments
                    while (it.nextFragment()) {
                        String bases = it.getFragmentBases();
                        // iterate through k-mers; <= keeps the final k-mer
                        // (the original < bound dropped the last one)
                        for (int i = 0; i <= bases.length() - KMER_LENGTH; i++) {
                            ret.add(bases.substring(i, i + KMER_LENGTH));
                        }
                    }
                }
            } catch (ErrorMsg x) {
                System.err.println(x.toString());
                x.printStackTrace();
            }
            return ret;
        }
    });

    // Initiate k-mer counting
    JavaPairRDD<String, Integer> kmer_ones = kmers.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });

    // Reduce counts
    JavaPairRDD<String, Integer> counts = kmer_ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    // Collect the output
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<String, Integer> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    jsc.stop();
}
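Design note: the null-check-then-open pattern inside call() exists presumably because ReadCollection wraps native NGS resources and cannot be serialized into the task closure; each task deserializes the field as null and opens its own connection on the worker the first time call() runs, while acc and KMER_LENGTH are plain final values that serialize cleanly.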