Example usage for org.apache.spark.api.java.function PairFunction PairFunction

Introduction

On this page you can find example usage for org.apache.spark.api.java.function.PairFunction, the Spark Java API interface that maps an input record to a key/value Tuple2, typically passed to JavaRDD.mapToPair or JavaDStream.mapToPair.

Prototype

public interface PairFunction<T, K, V> extends Serializable {
    Tuple2<K, V> call(T t) throws Exception;
}
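
For orientation, here is a minimal sketch of the two equivalent ways to pass a PairFunction to mapToPair. It assumes an existing JavaRDD<String> named words; the lambda form additionally assumes Java 8 or later.

    // Anonymous-class form, as used throughout the examples below
    JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<>(s, 1); // key = the word, value = an initial count of 1
        }
    });

    // Equivalent Java 8 lambda form
    JavaPairRDD<String, Integer> pairsLambda = words.mapToPair(s -> new Tuple2<>(s, 1));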

Usage

From source file:CacheWordCount.java

License:Apache License

public static void main(String[] args) throws Exception {

    if (args.length < 1) {
        System.err.println("Usage: CacheWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("CacheWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 4);

    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });
    //words.cache();

    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });

    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    counts.cache();

    long num = counts.count(); // first action: runs the whole pipeline and fills the cache
    num = counts.count(); // second action: served from the cached partitions

    //JavaPairRDD<String, Integer> sorted = counts.sortByKey();

    /*
    List<Tuple2<String, Integer>> output = sorted.collect();
    for (Tuple2<?,?> tuple : output) {
      System.out.println(tuple._1() + ": " + tuple._2());
    }
    */
    ctx.stop();
}
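
The point of this example is the cache() call: the first count() materializes the counts RDD and keeps its partitions in memory, so the second count() does not recompute the pipeline. As a hedged aside, cache() is shorthand for persisting with the memory-only storage level, and the cache can be released explicitly; a minimal sketch against the same counts RDD (StorageLevel is org.apache.spark.storage.StorageLevel):

    counts.persist(StorageLevel.MEMORY_ONLY()); // equivalent to counts.cache()
    long first = counts.count();                // computes the pipeline and fills the cache
    long second = counts.count();               // answered from the cached partitions
    counts.unpersist();                         // free the cached partitions when done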

From source file:ExampleDecisionTreeClassification.java

License:Apache License

public static void main(String[] args) {
    if (args.length != 1) {
        System.err.println("Usage: JavaDecisionTree <libsvm format data file>");
        System.exit(1);
    }
    String datapath = args[0];
    SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Load and parse the data file.
    // Cache the data since we will use it again to compute training error.
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().cache();

    // Set parameters.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 2;
    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 100;

    // Train a DecisionTree model for classification.
    final DecisionTreeModel model = DecisionTree.trainClassifier(data, numClasses, categoricalFeaturesInfo,
            impurity, maxDepth, maxBins);

    // Evaluate model on training instances and compute training error
    JavaPairRDD<Double, Double> predictionAndLabel = data
            .mapToPair(new PairFunction<LabeledPoint, Double, Double>() {

                public Tuple2<Double, Double> call(LabeledPoint p) throws Exception {
                    return new Tuple2<Double, Double>(model.predict(p.features()), p.label());
                }

            });
    Double trainErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {

        public Boolean call(Tuple2<Double, Double> pl) throws Exception {
            return !pl._1().equals(pl._2());
        }

    }).count() / data.count();
    System.out.println("Training error: " + trainErr);
    System.out.println("Learned classification tree model:\n" + model);
}
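
With Java 8 lambdas, the PairFunction and Function above collapse to one-liners. A sketch, assuming the same model and data as in the example:

    JavaPairRDD<Double, Double> predictionAndLabel =
            data.mapToPair(p -> new Tuple2<>(model.predict(p.features()), p.label()));
    double trainErr = (double) predictionAndLabel.filter(pl -> !pl._1().equals(pl._2())).count()
            / data.count();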

From source file:JavaKafkaWordCount_old.java

License:Apache License

public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaKafkaWordCount");
    sparkConf.setMaster("local[2]");
    // Create the context with a 2 second batch interval
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));

    int numThreads = 1;
    String zkQuorum = "localhost:5181";
    String group = "test-consumer-group";
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    topicMap.put("test", numThreads);

    JavaPairReceiverInputDStream<String, String> messages = KafkaUtils.createStream(jssc, zkQuorum, group,
            topicMap);

    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
        @Override
        public String call(Tuple2<String, String> tuple2) {
            return tuple2._2();
        }
    });

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(SPACE.split(x));
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    jssc.start();
    jssc.awaitTermination();
}
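
Note that KafkaUtils.createStream above is the receiver-based API from the spark-streaming-kafka 0.8 integration. The same artifact also offers a direct (receiver-less) stream; a hedged sketch against the same jssc, where the broker address is an assumption (StringDecoder is kafka.serializer.StringDecoder from the Kafka 0.8 client):

    Map<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put("metadata.broker.list", "localhost:9092"); // assumed broker address
    Set<String> topics = Collections.singleton("test");

    JavaPairInputDStream<String, String> directMessages = KafkaUtils.createDirectStream(
            jssc, String.class, String.class, StringDecoder.class, StringDecoder.class,
            kafkaParams, topics);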

From source file:SparkJavaWordCount.java

License:Apache License

public static void main(String[] args) throws Exception {

    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });

    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });

    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
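
The collected output above is in no particular order. To list words by descending count, a common idiom is to swap each pair and sort by key; a sketch against the same counts RDD, in Java 8 lambda syntax:

    List<Tuple2<Integer, String>> byCount = counts
            .mapToPair(t -> t.swap()) // (word, n) -> (n, word)
            .sortByKey(false)         // false = descending
            .collect();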

From source file:JavaNetworkWordCount.java

License:Apache License

public static void main(String[] args) {
    if (args.length < 3) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port> <refresh rate in seconds>");
        System.exit(1);
    }

    String hostname = args[0];
    int port = Integer.parseInt(args[1]);
    int refreshRateSeconds = Integer.parseInt(args[2]);

    StreamingExamples.setStreamingLogLevels();

    // Create the context with a batch interval of refreshRateSeconds seconds
    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(refreshRateSeconds));

    // Create a JavaReceiverInputDStream on the target hostname:port and count the
    // words in the input stream of \n-delimited text (e.g. generated by 'nc').
    // Note: the unreplicated storage level is fine when running locally;
    // in a distributed deployment, replication is needed for fault tolerance.
    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(hostname, port,
            StorageLevels.MEMORY_AND_DISK_SER);
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String x) {
            return Lists.newArrayList(DELIMITER.split(x));
        }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}
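
To exercise this example locally, feed the socket with netcat: run "nc -lk 9999" in one terminal, then start the application with localhost and 9999 as the hostname and port arguments.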

From source file:JavaCustomReceiver.java

License:Apache License

public static void main(String[] args) throws Exception {
    // Hard-coded host/port for local testing (overrides any command-line arguments)
    args = new String[2];
    args[0] = "localhost";
    args[1] = "12344";
    // StreamingExamples.setStreamingLogLevels();
    logger.error("hi error");
    System.out.println("logging started at least");
    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver").setMaster(SPARK_MASTER)
            .setSparkHome("local").setJars(new String[] { "target/SparkProject.0.0.1-SNAPSHOT.jar" });
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(1000));

    JavaReceiverInputDStream<String> lines = ssc.receiverStream(new JavaCustomReceiver("localhost", 12344));
    System.out.println("Received Lines: " + lines.toString());

    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        public Iterable<String> call(String x) {
            System.out.println("Flat map processing " + x);
            return Arrays.asList(x.split(" "));
        }
    });
    JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });

    JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print(1000);
    ssc.start();
    ssc.awaitTermination(1000 * 10);

}
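
The JavaCustomReceiver class used above is not shown in this snippet. For orientation, here is a minimal sketch of such a receiver, modeled on Spark's custom receiver API (org.apache.spark.streaming.receiver.Receiver); the socket-reading details are assumptions, not the original class:

    public class JavaCustomReceiver extends Receiver<String> {
        private final String host;
        private final int port;

        public JavaCustomReceiver(String host, int port) {
            super(StorageLevel.MEMORY_AND_DISK_2());
            this.host = host;
            this.port = port;
        }

        @Override
        public void onStart() {
            // Receive on a background thread so onStart() returns quickly
            new Thread(this::receive).start();
        }

        @Override
        public void onStop() {
            // Nothing to do: the receive thread checks isStopped() and exits on its own
        }

        private void receive() {
            try (Socket socket = new Socket(host, port);
                    BufferedReader reader = new BufferedReader(
                            new InputStreamReader(socket.getInputStream()))) {
                String line;
                while (!isStopped() && (line = reader.readLine()) != null) {
                    store(line); // hand each received line to Spark Streaming
                }
            } catch (IOException e) {
                restart("Error receiving data", e); // ask Spark to restart this receiver
            }
        }
    }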

From source file:SparkHome.java

License:Apache License

public static void main(String[] args) throws Exception {

    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 2);

    //    JavaRDD<String> counts=lines.flatMap(new FlatMapFunction<String,String>() {
    //         /**
    //        * 
    //        */
    //       private static final long serialVersionUID = 1L;
    //
    //       public Iterable<String> call(String s){
    //            return Arrays.asList(s.split("\\s*,\\s*"));
    //         }
    //
    //    }); 

    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterable<String> call(String s) {
            return Arrays.asList(SPACE.split(s));
        }
    });

    JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });

    JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    // Note: saveAsTextFile writes a directory of part files at the given path, and here it
    // saves the raw words; use counts.saveAsTextFile(...) to persist the word counts instead.
    words.saveAsTextFile("C:\\Users\\i308124\\Desktop\\DB_Comparision_Output\\output_new1.txt");

    //    List<Tuple2<String, Integer>> output = counts.collect();
    //    for (Tuple2<?,?> tuple : output) {
    //      System.out.println(tuple._1() + ": " + tuple._2());
    //    }
    //    
    //    
    ctx.stop();
}
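
As written, the counts RDD above is computed but never saved. saveAsTextFile produces one part file per partition; for small results a single file can be produced by coalescing first, as in this sketch (the output path is hypothetical):

    counts.coalesce(1).saveAsTextFile("word_counts_output"); // one partition -> a single part file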

From source file:Training.java

public static void main(String[] args) {
    // StreamingExamples.setStreamingLogLevels();
    String arq = args[0];
    // Set the logging level if log4j is not configured (override by adding log4j.properties to the classpath)
    if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
        Logger.getRootLogger().setLevel(Level.WARN);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaTwitterHashTagJoinSentiments");

    // check Spark configuration for master URL, set it to local if not configured
    if (!sparkConf.contains("spark.master")) {
        sparkConf.setMaster("local[2]");
    }
    SparkSession spark = SparkSession.builder().appName("teste2").config(sparkConf).getOrCreate();

    Dataset<Row> df = spark.read().json(arq);
    df.createOrReplaceTempView("Tweet");

    TokenizerFactory tokFactory = TwitterTokenizerFactory.getTokFactory();

    Dataset<Row> sqlDF = spark.sql("SELECT classifier,text FROM Tweet");
    // ML-based implementation, where the results always stay inside the Dataset
    //        Tokenizer tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words");
    //        Dataset<Row> wordsData = tokenizer.transform(sqlDF);
    //
    //        int numFeatures = 20;
    //        HashingTF hashingTF = new HashingTF()
    //                .setInputCol("words")
    //                .setOutputCol("rawFeatures")
    //                .setNumFeatures(numFeatures);
    //
    //        Dataset<Row> featurizedData = hashingTF.transform(wordsData);
    //
    //// alternatively, CountVectorizer can also be used to get term frequency vectors
    //        IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
    //        IDFModel idfModel = idf.fit(featurizedData);
    //        Dataset<Row> rows = idfModel.transform(featurizedData);
    //    
    //        rows.show();
    //        JavaRDD<LabeledPoint> data = rows.toJavaRDD().map(f -> new LabeledPoint(f.getString(0).equals("POSITIVE")?1:0,SparseVector.fromML( f.getAs(f.size() - 1))));
    HashingTF hashingTF = new HashingTF(1000);
    // using IDF, which I think also works:
    //        JavaRDD<Vector> vetores = sqlDF.toJavaRDD().map(f -> hashingTF.transform(Arrays.asList(f.getString(1).split(" "))));
    //        IDFModel idf = new IDF().fit(vetores);
    //        JavaRDD<LabeledPoint> data = sqlDF.toJavaRDD().map(f -> new LabeledPoint(f.getAs(0).toString().equals("POSITIVE")?1:0, idf.transform(hashingTF.transform(Arrays.asList(f.getString(1).split(" "))))));
    // using only HashingTF
    JavaRDD<LabeledPoint> data = sqlDF.toJavaRDD().map(new Function<Row, LabeledPoint>() {
        @Override
        public LabeledPoint call(Row f) throws Exception {
            String classifier = f.getString(0);
            String text = f.getString(1);
            text = URLRemove.remove(text);
            double cl = classifier.equals("POSITIVE") ? 1 : 0;
            return new LabeledPoint(cl, hashingTF.transform(
                    Arrays.asList(tokFactory.tokenizer(text.toCharArray(), 0, text.length()).tokenize()
            //                                text.split(" ");
            )));
        }
    });

    JavaRDD<LabeledPoint>[] tmp = data.randomSplit(new double[] { 0.6, 0.4 });
    JavaRDD<LabeledPoint> training = tmp[0]; // training set
    JavaRDD<LabeledPoint> test = tmp[1]; // test set
    final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
    JavaPairRDD<Double, Double> predictionAndLabel = test
            .mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
                @Override
                public Tuple2<Double, Double> call(LabeledPoint p) {
                    return new Tuple2<>(model.predict(p.features()), p.label());
                }
            });
    double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
            return pl._1().equals(pl._2());
        }
    }).count() / (double) test.count();
    spark.log().info("accuracy: " + accuracy);
    // Save and load model
    model.save(spark.sparkContext(), "Docker/myNaiveBayesModel");
    NaiveBayesModel sameModel = NaiveBayesModel.load(spark.sparkContext(), "Docker/myNaiveBayesModel");

}
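
As a follow-up, a hedged sketch of scoring a new piece of text with the reloaded model, reusing the same tokFactory and hashingTF as in training (the input string is made up):

    String text = URLRemove.remove("what a great day");
    double predicted = sameModel.predict(hashingTF.transform(
            Arrays.asList(tokFactory.tokenizer(text.toCharArray(), 0, text.length()).tokenize())));
    System.out.println(predicted == 1.0 ? "POSITIVE" : "NEGATIVE");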

From source file:NetworkWordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaNetworkWordCount <hostname> <port>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaNetworkWordCount");
    JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(3));

    JavaReceiverInputDStream<String> lines = ssc.socketTextStream(args[0], Integer.parseInt(args[1]),
            StorageLevels.MEMORY_AND_DISK_SER);

    // Note: call() returns an Iterator here, matching the Spark 2.x Java API
    // (the Spark 1.x FlatMapFunction, used in the earlier examples, returned an Iterable)
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String x) {
            return Arrays.asList(SPACE.split(x)).iterator();
        }
    });

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<>(s, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });

    wordCounts.print();
    ssc.start();
    ssc.awaitTermination();
}

From source file:SparkKMer.java

License:Apache License

public static void main(String[] args) throws Exception {
    // Setup
    SparkConf sparkConf = new SparkConf().setAppName("SparkKMer");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    // Argument parsing
    if (args.length < 2) {
        System.err.println("Usage: SparkKMer <accession> <kmer-length>");
        System.exit(1);
    }
    final String acc = args[0];
    final int KMER_LENGTH = Integer.parseInt(args[1]);

    //Check accession and split
    ReadCollection run = gov.nih.nlm.ncbi.ngs.NGS.openReadCollection(acc);
    long numreads = run.getReadCount();

    // Slice the job
    int chunk = 20000; // number of reads handled by one map operation
    int slices = (int) (numreads / chunk);
    if (slices == 0)
        slices = 1;
    List<LongRange> sub = new ArrayList<LongRange>();
    for (long first = 1; first <= numreads;) {
        long last = first + chunk - 1;
        if (last > numreads)
            last = numreads;
        sub.add(new LongRange(first, last));
        first = last + 1;
    }
    System.err.println("Prepared ranges: \n" + sub);

    JavaRDD<LongRange> jobs = jsc.parallelize(sub, slices);
    // Map: expand each read range into its constituent k-mers
    JavaRDD<String> kmers = jobs.flatMap(new FlatMapFunction<LongRange, String>() {
        ReadCollection run = null;

        @Override
        public Iterable<String> call(LongRange s) {
            //Executes on task nodes
            List<String> ret = new ArrayList<String>();
            try {
                long first = s.getMinimumLong();
                long last = s.getMaximumLong();
                if (run == null) {
                    run = gov.nih.nlm.ncbi.ngs.NGS.openReadCollection(acc);
                }
                ReadIterator it = run.getReadRange(first, last - first + 1, Read.all);
                while (it.nextRead()) {
                    //iterate through fragments
                    while (it.nextFragment()) {
                        String bases = it.getFragmentBases();
                        // iterate through all k-mers, including the final window
                        // ending at the last base
                        for (int i = 0; i + KMER_LENGTH <= bases.length(); i++) {
                            ret.add(bases.substring(i, i + KMER_LENGTH));
                        }
                    }
                }
            } catch (ErrorMsg x) {
                System.err.println(x.toString());
                x.printStackTrace();
            }
            return ret;
        }
    });
    // Initiate k-mer counting
    JavaPairRDD<String, Integer> kmer_ones = kmers.mapToPair(new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
        }
    });
    //Reduce counts
    JavaPairRDD<String, Integer> counts = kmer_ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
        }
    });
    // Collect the output to the driver (suitable only for small runs; see the sketch below)
    List<Tuple2<String, Integer>> output = counts.collect();
    for (Tuple2<String, Integer> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    jsc.stop();
}
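
For a real accession the full k-mer table can be very large, so collecting it to the driver is only suitable for demos. A sketch of two safer alternatives against the same counts RDD (the output directory name is hypothetical):

    // Drop singleton k-mers before any further processing
    JavaPairRDD<String, Integer> frequent = counts.filter(t -> t._2() > 1);

    // Or write the counts out instead of collecting them
    counts.saveAsTextFile("kmer_counts");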