List of usage examples for org.apache.spark.api.java.function.PairFunction
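All of the examples below follow the same pattern: a PairFunction<T, K, V> passed to JavaRDD.mapToPair (or JavaDStream.mapToPair) turns each element of type T into a Tuple2<K, V>, producing a JavaPairRDD (or JavaPairDStream) that supports key-based operations such as reduceByKey. As a warm-up, a minimal self-contained sketch (the class name and input data are illustrative, not taken from the examples below):

    import java.util.Arrays;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.api.java.function.PairFunction;
    import scala.Tuple2;

    public class PairFunctionMinimal {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("PairFunctionMinimal").setMaster("local[2]");
            JavaSparkContext sc = new JavaSparkContext(conf);
            // PairFunction<T, K, V>: maps an input of type T to a key/value Tuple2<K, V>.
            JavaPairRDD<String, Integer> lengths = sc.parallelize(Arrays.asList("a", "bb", "ccc"))
                    .mapToPair(new PairFunction<String, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(String s) {
                            return new Tuple2<String, Integer>(s, s.length());
                        }
                    });
            System.out.println(lengths.collect()); // [(a,1), (bb,2), (ccc,3)]
            sc.stop();
        }
    }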
From source file:CacheWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: CacheWordCount <file>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("CacheWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 4);
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });
    // words.cache();
    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    counts.cache();
    long num = counts.count();
    num = counts.count();
    // JavaPairRDD<String, Integer> sorted = counts.sortByKey();
    /*
    List<Tuple2<String, Integer>> output = sorted.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    */
    ctx.stop();
}
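The two consecutive count() calls exercise the cache: the first materializes counts and persists its partitions, the second reads them back from storage instead of recomputing. The chosen storage level can be inspected from the driver; a hedged one-line aside (this call is not in the original example):

    // After counts.cache(), the storage level is visible from the driver:
    System.out.println(counts.getStorageLevel().description()); // e.g. "Memory Deserialized 1x Replicated"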
From source file:ExampleDecisionTreeClassification.java
License:Apache License
public static void main(String[] args) {
    if (args.length != 1) {
        System.err.println("Usage: JavaDecisionTree <libsvm format data file>");
        System.exit(1);
    }
    String datapath = args[0];
    SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Load and parse the data file.
    // Cache the data since we will use it again to compute training error.
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().cache();
    // Set parameters.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 2;
    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 100;
    // Train a DecisionTree model for classification.
    final DecisionTreeModel model = DecisionTree.trainClassifier(data, numClasses, categoricalFeaturesInfo,
            impurity, maxDepth, maxBins);
    // Evaluate model on training instances and compute training error.
    JavaPairRDD<Double, Double> predictionAndLabel = data
            .mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                public Tuple2<Double, Double> call(LabeledPoint p) throws Exception {
                    return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
                }
            });
    Double trainErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        public Boolean call(Tuple2<Double, Double> pl) throws Exception {
            return !pl._1().equals(pl._2());
        }
    }).count() / data.count();
    System.out.println("Training error: " + trainErr);
    System.out.println("Learned classification tree model:\n" + model);
}
From source file:JavaKafkaWordCount_old.java
License:Apache License
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
    sparkConf.setMaster("local[2]");
    // Create the context with a 2 second batch size
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
    int numThreads = 1;
    String zkQuorum = "localhost:5181";
    String group = "test-consumer-group";
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    topicMap.put("test", numThreads);
    JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(jssc, zkQuorum, group,
            topicMap);
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
}
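KafkaUtils.createStream is the older receiver-based (ZooKeeper) consumer. The 0.8 integration also offers a receiver-less direct stream; a hedged sketch assuming the spark-streaming-kafka-0-8 artifact and a broker at localhost:9092 (both assumptions, not part of the example above):

    // Hedged sketch: direct (receiver-less) Kafka stream, spark-streaming-kafka-0-8 API.
    // Needs java.util.Collections/Set and org.apache.spark.streaming.api.java.JavaPairInputDStream.
    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", "localhost:9092"); // assumed broker address
    Set<String> topics = Collections.singleton("test");
    JavaPairInputDStream<String, String> direct = KafkaUtils.createDirectStream(jssc, String.class,
            String.class, kafka.serializer.StringDecoder.class, kafka.serializer.StringDecoder.class,
            kafkaParams, topics);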
From source file:SparkJavaWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });
    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
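On Java 8+ the anonymous classes collapse to lambdas, since PairFunction, Function2, and FlatMapFunction are single-method interfaces. The core of the pipeline above then reads (a sketch with the same semantics, assuming the Spark 1.x FlatMapFunction signature used above, whose call returns an Iterable):

    JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)));
    JavaPairRDD<String, Integer> counts = words
            .mapToPair(s -> new Tuple2<>(s, 1))      // PairFunction as a lambda
            .reduceByKey((i1, i2) -> i1 + i2);       // Function2 as a lambda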
From source file:JavaNetworkWordCount.java
License:Apache License
public static void main(String[] args) {
    if (args.length < 3) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port> <refresh rate in seconds>");
        System.exit(1);
    }
    // Note: the original usage string advertised <app name> <code> as the first two
    // arguments, but the code below uses them as the socket hostname and port.
    String hostname = args[0];
    int port = Integer.parseInt(args[1]);
    int refreshRateSeconds = Integer.parseInt(args[2]);
    StreamingExamples.setStreamingLogLevels();
    // Create the context with a <refreshRateSeconds> second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(refreshRateSeconds));
    // Create a JavaReceiverInputDStream on the target hostname:port and count the
    // words in the input stream of \n-delimited text (e.g. generated by 'nc').
    // Note that a storage level without replication is only suitable when running
    // locally; replication is necessary in a distributed scenario for fault tolerance.
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(hostname, port,
            StorageLevels.MEMORY_AND_DISK_SER);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(DELIMITER.split(x));
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
From source file:JavaCustomReceiver.java
License:Apache License
public static void main(String[] args) throws Exception {
    args = new String[2];
    args[0] = "localhost";
    args[1] = "12344";
    // StreamingExamples.setStreamingLogLevels();
    logger.error("hi error");
    System.out.println("logging started at least");
    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver").setMaster(SPARK_MASTER)
            .setSparkHome("local").setJars(new String[] { "target/SparkProject.0.0.1-SNAPSHOT.jar" });
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));
    JavaReceiverInputDStream<String> lines = ssc.receiverStream(new JavaCustomReceiver("localhost", 12344));
    System.out.println("Received Lines: " + lines.toString());
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        public Iterable<String> call(String x) {
            System.out.println("Flat map processing " + x);
            return Arrays.asList(x.split(" "));
        }
    });
    JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    wordCounts.print(1000);
    ssc.start();
    ssc.awaitTermination(1000 * 10);
}
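The JavaCustomReceiver class referenced above is not part of this listing. A minimal skeleton in the spirit of Spark's documented custom-receiver example might look as follows (a sketch under that assumption, not the original class):

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.Socket;
    import org.apache.spark.storage.StorageLevel;
    import org.apache.spark.streaming.receiver.Receiver;

    public class JavaCustomReceiver extends Receiver<String> {
        private final String host;
        private final int port;

        public JavaCustomReceiver(String host, int port) {
            super(StorageLevel.MEMORY_AND_DISK_2());
            this.host = host;
            this.port = port;
        }

        @Override
        public void onStart() {
            // Start a thread that connects to the socket and pushes lines into Spark via store().
            new Thread(new Runnable() {
                public void run() {
                    receive();
                }
            }).start();
        }

        @Override
        public void onStop() {
            // The receive() loop checks isStopped(), so there is nothing to do here.
        }

        private void receive() {
            try {
                Socket socket = new Socket(host, port);
                BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream()));
                String line;
                while (!isStopped() && (line = reader.readLine()) != null) {
                    store(line);
                }
                reader.close();
                socket.close();
                restart("Trying to connect again");
            } catch (Exception e) {
                restart("Error receiving data", e);
            }
        }
    }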
From source file:SparkHome.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 2);
    // JavaRDD<String> counts = lines.flatMap(new FlatMapFunction<String, String>() {
    //     private static final long serialVersionUID = 1L;
    //
    //     public Iterable<String> call(String s) {
    //         return Arrays.asList(s.split("\\s*,\\s*"));
    //     }
    // });
    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });
    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    words.saveAsTextFile("C:\\Users\\i308124\\Desktop\\DB_Comparision_Output\\output_new1.txt");
    // List<Tuple2<String, Integer>> output = counts.collect();
    // for (Tuple2<?, ?> tuple : output) {
    //     System.out.println(tuple._1() + ": " + tuple._2());
    // }
    //
    // ctx.stop();
}
From source file:Training.java
public static void main(String[] args) {
    // StreamingExamples.setStreamingLogLevels();
    // Set the logging level if log4j is not configured (override by adding log4j.properties to the classpath)
    String arq = args[0];
    if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
        Logger.getRootLogger().setLevel(Level.WARN);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaTwitterHashTagJoinSentiments");
    // Check the Spark configuration for a master URL; set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
        sparkConf.setMaster("local[2]");
    }
    SparkSession spark = SparkSession.builder().appName("teste2").config(sparkConf).getOrCreate();
    Dataset<Row> df = spark.read().json(arq);
    df.createOrReplaceTempView("Tweet");
    TokenizerFactory tokFactory = TwitterTokenizerFactory.getTokFactory();
    Dataset<Row> sqlDF = spark.sql("SELECT classifier,text FROM Tweet");
    // ML-pipeline implementation whose results always stay inside the Dataset:
    // Tokenizer tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words");
    // Dataset<Row> wordsData = tokenizer.transform(sqlDF);
    //
    // int numFeatures = 20;
    // HashingTF hashingTF = new HashingTF()
    //         .setInputCol("words")
    //         .setOutputCol("rawFeatures")
    //         .setNumFeatures(numFeatures);
    //
    // Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    //
    // // alternatively, CountVectorizer can also be used to get term frequency vectors
    // IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    // IDFModel idfModel = idf.fit(featurizedData);
    // Dataset<Row> rows = idfModel.transform(featurizedData);
    //
    // rows.show();
    //
    // JavaRDD<LabeledPoint> data = rows.toJavaRDD().map(f -> new LabeledPoint(
    //         f.getString(0).equals("POSITIVE") ? 1 : 0, SparseVector.fromML(f.getAs(f.size() - 1))));
    HashingTF hashingTF = new HashingTF(1000);
    // Using IDF (I think this also works):
    // JavaRDD<Vector> vetores = sqlDF.toJavaRDD().map(f -> hashingTF.transform(Arrays.asList(f.getString(1).split(" "))));
    // IDFModel idf = new IDF().fit(vetores);
    // JavaRDD<LabeledPoint> data = sqlDF.toJavaRDD().map(f -> new LabeledPoint(
    //         f.getAs(0).toString().equals("POSITIVE") ? 1 : 0,
    //         idf.transform(hashingTF.transform(Arrays.asList(f.getString(1).split(" "))))));
    // Using only HashingTF:
    JavaRDD<LabeledPoint> data = sqlDF.toJavaRDD().map(new Function<Row, LabeledPoint>() {
        @Override
        public LabeledPoint call(Row f) throws Exception {
            String classifier = f.getString(0);
            String text = f.getString(1);
            text = URLRemove.remove(text);
            double cl = classifier.equals("POSITIVE") ? 1 : 0;
            return new LabeledPoint(cl, hashingTF.transform(
                    Arrays.asList(tokFactory.tokenizer(text.toCharArray(), 0, text.length()).tokenize())
                    // text.split(" ")
            ));
        }
    });
    JavaRDD<LabeledPoint>[] tmp = data.randomSplit(new double[] { 0.6, 0.4 });
    JavaRDD<LabeledPoint> training = tmp[0]; // training set
    JavaRDD<LabeledPoint> test = tmp[1]; // test set
    final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
    JavaPairRDD<Double, Double> predictionAndLabel = test
            .mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                @Override
                public Tuple2<Double, Double> call(LabeledPoint p) {
                    return new Tuple2<>(model.predict(p.features()), p.label());
                }
            });
    double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
            return pl._1().equals(pl._2());
        }
    }).count() / (double) test.count();
    spark.log().info("accuracy:" + accuracy);
    // Save and load the model
    model.save(spark.sparkContext(), "Docker/myNaiveBayesModel");
    NaiveBayesModel sameModel = NaiveBayesModel.load(spark.sparkContext(), "Docker/myNaiveBayesModel");
}
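As a follow-up, the reloaded model can score new text, provided it is featurized exactly as during training, i.e. with the same HashingTF(1000). A hedged sketch (the sample tokens are illustrative):

    // Hedged sketch: classify a new, already-tokenized tweet with the reloaded model.
    double prediction = sameModel.predict(hashingTF.transform(Arrays.asList("what", "a", "great", "day")));
    System.out.println(prediction == 1.0 ? "POSITIVE" : "NEGATIVE");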
From source file:NetworkWordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(3));
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(args[0], Integer.parseInt(args[1]),
            StorageLevels.MEMORY_AND_DISK_SER);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
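Unlike the earlier streaming examples, this one targets the Spark 2.x API, where FlatMapFunction.call returns an Iterator rather than an Iterable. With lambdas the same chain shortens to (a sketch with the same semantics):

    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(SPACE.split(x)).iterator());
    JavaPairDStream<String, Integer> wordCounts = words
            .mapToPair(s -> new Tuple2<>(s, 1))
            .reduceByKey((i1, i2) -> i1 + i2);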
From source file:SparkKMer.java
License:Apache License
public static void main(String[] args) throws Exception {
    // Setup
    SparkConf sparkConf = new SparkConf().setAppName("SparkKMer");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // Argument parsing
    if (args.length < 2) {
        System.err.println("Usage: SparkKMer <accession> <kmer-length>");
        System.exit(1);
    }
    final String acc = args[0];
    final int KMER_LENGTH = Integer.parseInt(args[1]);
    // Check accession and split
    ReadCollection run = gov.nih.nlm.ncbi.ngs.NGS.openReadCollection(acc);
    long numreads = run.getReadCount();
    // Slice the job
    int chunk = 20000; /* amount of reads per 1 map operation */
    int slices = (int) (numreads / chunk / 1);
    if (slices == 0)
        slices = 1;
    List<LongRange> sub = new ArrayList<LongRange>();
    for (long first = 1; first <= numreads;) {
        long last = first + chunk - 1;
        if (last > numreads)
            last = numreads;
        sub.add(new LongRange(first, last));
        first = last + 1;
    }
    System.err.println("Prepared ranges: \n" + sub);
    JavaRDD<LongRange> jobs = jsc.parallelize(sub, slices);
    // Map
    JavaRDD<String> kmers = jobs.flatMap(new FlatMapFunction<LongRange, String>() {
        ReadCollection run = null;

        @Override
        public Iterable<String> call(LongRange s) {
            // Executes on task nodes
            List<String> ret = new ArrayList<String>();
            try {
                long first = s.getMinimumLong();
                long last = s.getMaximumLong();
                if (run == null) {
                    run = gov.nih.nlm.ncbi.ngs.NGS.openReadCollection(acc);
                }
                ReadIterator it = run.getReadRange(first, last - first + 1, Read.all);
                while (it.nextRead()) {
                    // Iterate through fragments
                    while (it.nextFragment()) {
                        String bases = it.getFragmentBases();
                        // Iterate through k-mers
                        for (int i = 0; i < bases.length() - KMER_LENGTH; i++) {
                            ret.add(bases.substring(i, i + KMER_LENGTH));
                        }
                    }
                }
            } catch (ErrorMsg x) {
                System.err.println(x.toString());
                x.printStackTrace();
            }
            return ret;
        }
    });
    // Initiate k-mer counting
    JavaPairRDD<String, Integer> kmer_ones = kmers.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    // Reduce counts
    JavaPairRDD<String, Integer> counts = kmer_ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    // Collect the output
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<String, Integer> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    jsc.stop();
}
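A note on the design: the flatMap closure keeps its own ReadCollection field and lazily re-opens the accession on the worker (the run == null check) instead of reusing the driver-side connection, presumably because the NGS ReadCollection handle cannot be serialized and shipped with the closure; each task therefore opens its own connection to the SRA data on the node where it runs.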