Example usage for org.apache.spark.api.java.function.PairFlatMapFunction

List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction

Introduction

On this page you can find example usages of the org.apache.spark.api.java.function.PairFlatMapFunction interface.

Prototype

PairFlatMapFunction
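
The interface declares a single call method. Its return type changed between Spark versions (Iterable in Spark 1.x, Iterator from Spark 2.0 onwards), which is why the examples below differ in what they return:

public interface PairFlatMapFunction<T, K, V> extends Serializable {
    // Spark 1.x signature:
    Iterable<Tuple2<K, V>> call(T t) throws Exception;

    // Spark 2.x and later:
    // Iterator<Tuple2<K, V>> call(T t) throws Exception;
}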

Usage

From source file:org.springframework.data.hadoop.batch.spark.test.SparkHashtags.java

License:Apache License

public static void main(String[] args) {
    String fileName = "";
    if (args.length > 0) {
        fileName = args[0];
    }

    SparkConf conf = new SparkConf().setAppName("spark-hashtags");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> tweetData = sc.textFile(fileName).cache();

    // jsonMapper is assumed to be a Jackson ObjectMapper held in a static field of the class (not shown in this excerpt)
    JavaRDD<Map<String, Object>> tweets = tweetData.map(new Function<String, Map<String, Object>>() {
        public Map<String, Object> call(String s) throws Exception {
            return jsonMapper.readValue(s, new TypeReference<HashMap<String, Object>>() {
            });
        }
    });

    JavaPairRDD<String, Integer> hashTags = tweets
            .flatMapToPair(new PairFlatMapFunction<Map<String, Object>, String, Integer>() {
                public Iterable<Tuple2<String, Integer>> call(Map<String, Object> tweet) throws Exception {

                    Map<String, Object> entities = (Map<String, Object>) tweet.get("entities");
                    List<Map<String, Object>> hashTagEntries = null;
                    if (entities != null) {
                        hashTagEntries = (List<Map<String, Object>>) entities.get("hashtags");
                    }
                    List<Tuple2<String, Integer>> hashTags = new ArrayList<Tuple2<String, Integer>>();
                    if (hashTagEntries != null && hashTagEntries.size() > 0) {
                        for (Map<String, Object> hashTagEntry : hashTagEntries) {
                            String hashTag = hashTagEntry.get("text").toString();
                            hashTags.add(new Tuple2<String, Integer>(hashTag, 1));
                        }
                    }
                    return hashTags;
                }
            });

    JavaPairRDD<String, Integer> hashTagCounts = hashTags
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer int1, Integer int2) throws Exception {
                    return int1 + int2;
                }
            });

    JavaPairRDD<String, Integer> hashTagCountsSorted = hashTagCounts
            .mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                public Tuple2<Integer, String> call(Tuple2<String, Integer> in) throws Exception {
                    return new Tuple2<Integer, String>(in._2(), in._1());
                }
            }).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                public Tuple2<String, Integer> call(Tuple2<Integer, String> in) throws Exception {
                    return new Tuple2<String, Integer>(in._2(), in._1());
                }
            });

    List<Tuple2<String, Integer>> top10 = hashTagCountsSorted.take(10);
    System.out.println("HashTags: " + top10);

    JavaPairRDD<String, Integer> top10Hashtags = sc.parallelizePairs(top10);
    top10Hashtags.saveAsTextFile("hdfs:///test/spark/output");

    sc.stop();
}
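
On Spark 1.x (where PairFlatMapFunction.call returns an Iterable, as above) the same flatMapToPair stage can be written more compactly as a lambda; on Spark 2.x the lambda must return an Iterator instead. A minimal sketch:

    // Lambda sketch of the flatMapToPair stage above (Spark 1.x signature)
    JavaPairRDD<String, Integer> hashTags = tweets.flatMapToPair(tweet -> {
        List<Tuple2<String, Integer>> out = new ArrayList<Tuple2<String, Integer>>();
        Map<String, Object> entities = (Map<String, Object>) tweet.get("entities");
        if (entities != null) {
            List<Map<String, Object>> entries = (List<Map<String, Object>>) entities.get("hashtags");
            if (entries != null) {
                for (Map<String, Object> entry : entries) {
                    out.add(new Tuple2<String, Integer>(entry.get("text").toString(), 1));
                }
            }
        }
        return out; // Spark 2.x: return out.iterator();
    });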

From source file:PolygonMatching.MatchingGeoPolygon.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local");
    SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    String dataSource1 = args[0];
    String dataSource2 = args[1];
    final double thresholdLinguistic = Double.parseDouble(args[2]);
    final double thresholdPolygon = Double.parseDouble(args[3]);
    String outputPath = args[4];
    Integer amountPartition = Integer.parseInt(args[5]);
    String sourceType = args[6];

    DataSource dataSourcePref = null;
    DataSource dataSourceOSM = null;
    if (sourceType.equals("CSV")) {
        dataSourcePref = AbstractExec.getDataCSV(dataSource1, ';');
        dataSourceOSM = AbstractExec.getDataCSV(dataSource2, ';');
    } else { //is postgis
        dataSourcePref = AbstractExec.getDataPostGres(dataSource1);
        dataSourceOSM = AbstractExec.getDataPostGres(dataSource2);
    }

    //      DataSource dataSourcePref = AbstractExec.getDataPostGres(dataSource1); //squaresOfCuritiba Pref
    //      DataSource dataSourceOSM = AbstractExec.getDataPostGres(dataSource2); //squaresOfCuritiba OSM

    //      DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/squares_pref_curitiba.txt"); //squaresOfCuritiba Pref
    //      DataSource dataSourceOSM = AbstractExec.getDataPostGres("queries/osm_curitiba.txt"); //squaresOfCuritiba OSM

    //      DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/parks_pref_ny.txt"); //parksOfNY Pref
    //      DataSource dataSourceOSM = AbstractExec.getDataPostGres("queries/osm_ny.txt"); //parksOfNY OSM

    StorageManager storagePref = new StorageManager();
    StorageManager storageOSM = new StorageManager();

    // enables in-memory execution for faster processing
    // this can be done since the whole data fits into memory
    storagePref.enableInMemoryProcessing();
    storageOSM.enableInMemoryProcessing();

    // adds the "data" to the algorithm
    storagePref.addDataSource(dataSourcePref);
    storageOSM.addDataSource(dataSourceOSM);

    if (!storagePref.isDataExtracted()) {
        storagePref.extractData();
    }
    if (!storageOSM.isDataExtracted()) {
        storageOSM.extractData();
    }

    List<GeoPolygon> geoentitiesPref = new ArrayList<GeoPolygon>();
    List<GeoPolygon> geoentitiesOSM = new ArrayList<GeoPolygon>();

    // the algorithm returns each generated pair step-by-step
    int indexOfPref = 0;
    for (GenericObject genericObj : storagePref.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) { // for Curitiba use attribute "nome", for New York "signname"
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString()); // for Curitiba use attribute "gid", for New York "id"
            geoentitiesPref.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.GOV_POLYGON, indexOfPref, id));
            indexOfPref++;
        }

    }

    int indexOfOSM = 0;
    for (GenericObject genericObj : storageOSM.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) {
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString());
            geoentitiesOSM.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.OSM_POLYGON, indexOfOSM, id));
            indexOfOSM++;
        }

    }

    JavaRDD<GeoPolygon> polygonsOSM = ctx.parallelize(geoentitiesOSM);
    JavaRDD<GeoPolygon> polygonsPref = ctx.parallelize(geoentitiesPref);

    JavaRDD<GeoPolygon> polygons = polygonsPref.union(polygonsOSM);

    final Broadcast<Integer> numReplication = ctx.broadcast(amountPartition);
    JavaRDD<Tuple2<Integer, GeoPolygon>> polygonLabed = polygons
            .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<Integer, GeoPolygon>>() {

                public Iterator<Tuple2<Integer, GeoPolygon>> call(GeoPolygon s) throws Exception {
                    List<Tuple2<Integer, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<Integer, GeoPolygon>>();
                    if (s.getType().equals(InputTypes.OSM_POLYGON)) {
                        listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(
                                s.getIdGeometry() % numReplication.getValue(), s));
                        return listOfPolygonTuple.iterator();
                    } else { //equals to InputTypes.GOV_POLYGON
                        for (int i = 0; i < numReplication.value(); i++) {
                            listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(i, s));
                        }
                        return listOfPolygonTuple.iterator();
                    }
                }

            });

    JavaPairRDD<Integer, GeoPolygon> polygonsPaired = polygonLabed
            .mapToPair(new PairFunction<Tuple2<Integer, GeoPolygon>, Integer, GeoPolygon>() {

                public Tuple2<Integer, GeoPolygon> call(Tuple2<Integer, GeoPolygon> tuple) throws Exception {
                    return new Tuple2<Integer, GeoPolygon>(tuple._1(), tuple._2());
                }
            });

    JavaPairRDD<Integer, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition); // number of partitions

    JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair(
            new PairFlatMapFunction<Tuple2<Integer, Iterable<GeoPolygon>>, Integer, PolygonPair>() {

                public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<Integer, Iterable<GeoPolygon>> tuple)
                        throws Exception {
                    List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator());
                    List<GeoPolygon> polygonsSource = new ArrayList<GeoPolygon>();
                    List<GeoPolygon> polygonsTarget = new ArrayList<GeoPolygon>();
                    for (GeoPolygon entity : polygonsPerKey) {
                        if (entity.getType() == InputTypes.OSM_POLYGON) {
                            polygonsSource.add(entity);
                        } else {
                            polygonsTarget.add(entity);
                        }
                    }

                    List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>();
                    JaccardSimilarity jaccard = new JaccardSimilarity();
                    for (GeoPolygon entSource : polygonsSource) {
                        for (GeoPolygon entTarget : polygonsTarget) {
                            double linguisticSimilarity = 0.0;
                            //calculate the linguistic similarity
                            if (!entTarget.getGeoName().isEmpty()) {
                                linguisticSimilarity = jaccard.getSimilarity(
                                        entTarget.getGeoName().toLowerCase(),
                                        entSource.getGeoName().toLowerCase());
                            }

                            //calculate the polygon similarity
                            double polygonSimilarity = entSource.getPolygonSimilarity(entTarget);

                            //classification of pairs
                            PolygonPair pair;
                            if (linguisticSimilarity > thresholdLinguistic
                                    && polygonSimilarity > thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.MATCH);
                            } else if (linguisticSimilarity < thresholdLinguistic
                                    && polygonSimilarity < thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.NON_MATCH);
                            } else {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM);
                            }

                            // for use case 04: keep only MATCH and POSSIBLE_PROBLEM pairs
                            if (pair.getPolygonClassification().equals(PolygonClassification.POSSIBLE_PROBLEM)
                                    || pair.getPolygonClassification().equals(PolygonClassification.MATCH)) {
                                int index = entityMatches.size();
                                entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair));
                            }

                        }
                    }
                    return entityMatches.iterator();
                }
            });

    matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() {

        public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception {
            ArrayList<String> listOutput = new ArrayList<String>();
            listOutput.add(t._2().toStringCSV());
            return listOutput.iterator();
        }

    }).saveAsTextFile(outputPath);

    ctx.stop();
    ctx.close();
}
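
The flatMap stage above implements a fragment-and-replicate join: each OSM polygon is routed to exactly one bucket via getIdGeometry() % numReplication, while every government polygon is copied into all buckets, so each group produced by groupByKey contains every cross-source candidate pair for its bucket. A stripped-down sketch of that keying logic (the helper name is illustrative, not from the source):

    // Illustrative helper mirroring the keying in the flatMap above:
    // OSM polygons are fragmented, government polygons are replicated.
    static List<Tuple2<Integer, GeoPolygon>> keyForJoin(GeoPolygon p, int numBuckets) {
        List<Tuple2<Integer, GeoPolygon>> out = new ArrayList<Tuple2<Integer, GeoPolygon>>();
        if (p.getType().equals(InputTypes.OSM_POLYGON)) {
            out.add(new Tuple2<Integer, GeoPolygon>(p.getIdGeometry() % numBuckets, p)); // one bucket
        } else {
            for (int i = 0; i < numBuckets; i++) {
                out.add(new Tuple2<Integer, GeoPolygon>(i, p)); // all buckets
            }
        }
        return out;
    }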

From source file:SingleMatchingGeoPolygon.SingleMatchingGeoPolygon.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local");
    SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    String dataSource = args[0];
    final double thresholdLinguistic = Double.parseDouble(args[1]);
    final double thresholdPolygon = Double.parseDouble(args[2]);
    String outputPath = args[3];
    Integer amountPartition = Integer.parseInt(args[4]);
    String sourceType = args[5];

    DataSource source1 = null;
    if (sourceType.equals("CSV")) {
        source1 = AbstractExec.getDataCSV(dataSource, ';');
    } else { //is postgis
        source1 = AbstractExec.getDataPostGres(dataSource);
    }

    ReadAbstractSource reader = new ReadAbstractSource();
    StorageManager storagePolygon = reader.readFile(source1);

    List<GeoPolygon> geoentities = new ArrayList<GeoPolygon>();

    int index = 0;
    for (GenericObject genericObj : storagePolygon.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) {
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString());
            geoentities.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.OSM_POLYGON, index, id));
            index++;
        }
    }

    JavaRDD<GeoPolygon> polygons = ctx.parallelize(geoentities);

    final Broadcast<Integer> numReplication = ctx.broadcast(amountPartition);
    JavaRDD<Tuple2<Integer, GeoPolygon>> polygonLabed = polygons
            .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<Integer, GeoPolygon>>() {

                public Iterator<Tuple2<Integer, GeoPolygon>> call(GeoPolygon s) throws Exception {
                    List<Tuple2<Integer, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<Integer, GeoPolygon>>();
                    GeoPolygon tocompare = s.getGeoPolygon();
                    tocompare.setDuplicated(false);
                    listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(
                            tocompare.getIdGeometry() % numReplication.getValue(), tocompare)); // entity that is not replicated

                    GeoPolygon duplicated = s.getGeoPolygon();
                    duplicated.setDuplicated(true);
                    for (int i = 0; i < numReplication.value(); i++) { // entities that will be replicated
                        listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(i, duplicated));
                    }
                    return listOfPolygonTuple.iterator();
                }

            });

    JavaPairRDD<Integer, GeoPolygon> polygonsPaired = polygonLabed
            .mapToPair(new PairFunction<Tuple2<Integer, GeoPolygon>, Integer, GeoPolygon>() {

                public Tuple2<Integer, GeoPolygon> call(Tuple2<Integer, GeoPolygon> tuple) throws Exception {
                    return new Tuple2<Integer, GeoPolygon>(tuple._1(), tuple._2());
                }
            });

    JavaPairRDD<Integer, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition); // number of partitions

    JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair(
            new PairFlatMapFunction<Tuple2<Integer, Iterable<GeoPolygon>>, Integer, PolygonPair>() {

                public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<Integer, Iterable<GeoPolygon>> tuple)
                        throws Exception {
                    List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator());
                    List<GeoPolygon> polygonsToCompare = new ArrayList<GeoPolygon>();
                    List<GeoPolygon> polygonsDuplicated = new ArrayList<GeoPolygon>();
                    for (GeoPolygon entity : polygonsPerKey) {
                        if (entity.isDuplicated()) {
                            polygonsDuplicated.add(entity);
                        } else {
                            polygonsToCompare.add(entity);
                        }
                    }

                    List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>();
                    JaccardSimilarity jaccard = new JaccardSimilarity();
                    for (GeoPolygon entSource : polygonsToCompare) {
                        for (GeoPolygon entTarget : polygonsDuplicated) {
                            double linguisticSimilarity = 0.0;
                            //calculate the linguistic similarity
                            if (!entTarget.getGeoName().isEmpty()) {
                                linguisticSimilarity = jaccard.getSimilarity(
                                        entTarget.getGeoName().toLowerCase(),
                                        entSource.getGeoName().toLowerCase());
                            }

                            //calculate the polygon similarity
                            double polygonSimilarity = entSource.getPolygonSimilarity(entTarget);

                            //classification of pairs
                            PolygonPair pair;
                            if (linguisticSimilarity > thresholdLinguistic
                                    && polygonSimilarity > thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.MATCH);
                            } else if (linguisticSimilarity < thresholdLinguistic
                                    && polygonSimilarity < thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.NON_MATCH);
                            } else {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM);
                            }

                            // for use case 04: keep only MATCH pairs between two distinct polygons
                            if (pair.getPolygonClassification().equals(PolygonClassification.MATCH) && (pair
                                    .getSource().getIdInDataset() != pair.getTarget().getIdInDataset())) {
                                int index = entityMatches.size();
                                entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair));
                            }

                        }
                    }
                    return entityMatches.iterator();
                }
            });

    matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() {

        public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception {
            ArrayList<String> listOutput = new ArrayList<String>();
            listOutput.add(t._2().toStringCSV());
            return listOutput.iterator();
        }

    }).saveAsTextFile(outputPath);

    ctx.stop();
    ctx.close();
}

From source file:SingleMatchingGeoPolygon.SingleMatchingGeoPolygonBlocked.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local");
    SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    String dataSource = args[0];
    final double thresholdLinguistic = Double.parseDouble(args[1]);
    final double thresholdPolygon = Double.parseDouble(args[2]);
    String outputPath = args[3];
    Integer amountPartition = Integer.parseInt(args[4]);
    String sourceType = args[5];

    DataSource source1 = null;
    if (sourceType.equals("CSV")) {
        source1 = AbstractExec.getDataCSV(dataSource, ';');
    } else { //is postgis
        source1 = AbstractExec.getDataPostGres(dataSource);
    }

    ReadAbstractSource reader = new ReadAbstractSource();
    StorageManager storagePolygon = reader.readFile(source1);
    //      StorageManager storagePolygon = reader.readFile(AbstractExec.getDataPostGres("queries/osm_curitiba.txt"));
    //      StorageManager storagePolygon = reader.readFile(AbstractExec.getDataPostGres("queries/squares_pref_curitiba.txt"));

    List<GeoPolygon> geoentities = new ArrayList<GeoPolygon>();

    int index = 0;
    for (GenericObject genericObj : storagePolygon.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) {
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString());
            geoentities.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.OSM_POLYGON, index, id));
            index++;
        }
    }

    JavaRDD<GeoPolygon> polygons = ctx.parallelize(geoentities);

    Broadcast<Integer> numReplication = ctx.broadcast(amountPartition);
    JavaRDD<Tuple2<String, GeoPolygon>> polygonLabed = polygons
            .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<String, GeoPolygon>>() {

                public Iterator<Tuple2<String, GeoPolygon>> call(GeoPolygon s) throws Exception {
                    List<Tuple2<String, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<String, GeoPolygon>>();
                    GeoPolygon tocompare = s.getGeoPolygon();
                    tocompare.setDuplicated(false);
                    if (tocompare.getGeoName().length() < 3) {
                        listOfPolygonTuple
                                .add(new Tuple2<String, GeoPolygon>(tocompare.getGeoName(), tocompare)); // entity that is not replicated
                    } else {
                        listOfPolygonTuple.add(new Tuple2<String, GeoPolygon>(
                                tocompare.getGeoName().substring(0, 3), tocompare)); // entity that is not replicated
                    }

                    GeoPolygon duplicated = s.getGeoPolygon();
                    duplicated.setDuplicated(true);
                    if (duplicated.getGeoName().length() < 3) {
                        listOfPolygonTuple
                                .add(new Tuple2<String, GeoPolygon>(duplicated.getGeoName(), duplicated));
                    } else {
                        listOfPolygonTuple.add(new Tuple2<String, GeoPolygon>(
                                duplicated.getGeoName().substring(0, 3), duplicated));
                    }

                    return listOfPolygonTuple.iterator();
                }

            });

    JavaPairRDD<String, GeoPolygon> polygonsPaired = polygonLabed
            .mapToPair(new PairFunction<Tuple2<String, GeoPolygon>, String, GeoPolygon>() {

                public Tuple2<String, GeoPolygon> call(Tuple2<String, GeoPolygon> tuple) throws Exception {
                    return new Tuple2<String, GeoPolygon>(tuple._1(), tuple._2());
                }
            });

    JavaPairRDD<String, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition); // number of partitions

    JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair(
            new PairFlatMapFunction<Tuple2<String, Iterable<GeoPolygon>>, Integer, PolygonPair>() {

                public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<String, Iterable<GeoPolygon>> tuple)
                        throws Exception {
                    List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator());
                    List<GeoPolygon> polygonsToCompare = new ArrayList<GeoPolygon>();
                    List<GeoPolygon> polygonsDuplicated = new ArrayList<GeoPolygon>();
                    for (GeoPolygon entity : polygonsPerKey) {
                        if (entity.isDuplicated()) {
                            polygonsDuplicated.add(entity);
                        } else {
                            polygonsToCompare.add(entity);
                        }
                    }

                    List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>();
                    JaccardSimilarity jaccard = new JaccardSimilarity();
                    for (GeoPolygon entSource : polygonsToCompare) {
                        for (GeoPolygon entTarget : polygonsDuplicated) {
                            double linguisticSimilarity = 0.0;
                            //calculate the linguistic similarity
                            if (!entTarget.getGeoName().isEmpty()) {
                                linguisticSimilarity = jaccard.getSimilarity(
                                        entTarget.getGeoName().toLowerCase(),
                                        entSource.getGeoName().toLowerCase());
                            }

                            //calculate the polygon similarity
                            double polygonSimilarity = entSource.getPolygonSimilarity(entTarget);

                            //classification of pairs
                            PolygonPair pair;
                            if (linguisticSimilarity > thresholdLinguistic
                                    && polygonSimilarity > thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.MATCH);
                            } else if (linguisticSimilarity < thresholdLinguistic
                                    && polygonSimilarity < thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.NON_MATCH);
                            } else {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM);
                            }

                            // for use case 04: keep only MATCH pairs between two distinct polygons
                            if (pair.getPolygonClassification().equals(PolygonClassification.MATCH) && (pair
                                    .getSource().getIdInDataset() != pair.getTarget().getIdInDataset())) {
                                int index = entityMatches.size();
                                entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair));
                            }

                        }
                    }
                    return entityMatches.iterator();
                }
            });

    matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() {

        public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception {
            ArrayList<String> listOutput = new ArrayList<String>();
            listOutput.add(t._2().toStringCSV());
            return listOutput.iterator();
        }

    }).saveAsTextFile(outputPath);

    ctx.stop();
    ctx.close();
}
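
Unlike the modulo-replication variants above, this job blocks on the first three characters of the polygon name, so only polygons sharing a name prefix are ever compared. The blocking key reduces to a one-liner (illustrative helper, not from the source):

    // Illustrative blocking key: the first three characters of the name,
    // or the whole name when it is shorter than three characters.
    static String blockingKey(String geoName) {
        return geoName.length() < 3 ? geoName : geoName.substring(0, 3);
    }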

From source file:SparkExamples.SparkPR.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 4) { // args[3] is read below as the partition count
        System.err.println("Usage: SparkPR <file> <number_of_iterations> <...> <number_of_partitions>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("SparkPR");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    long start = System.currentTimeMillis();

    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    int partition = Integer.parseInt(args[3]);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            parts[0] = parts[0].toString()
                    .replaceAll("AAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZ", "");
            parts[1] = parts[1].toString()
                    .replaceAll("AAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZ", "");
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).groupByKey(partition).persist(StorageLevel.MEMORY_AND_DISK());

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    ranks.foreach(p -> System.out.println(p));

    long end = System.currentTimeMillis();
    System.out.println("running time " + (end - start) / 1000 + "s");

    String results = "running time " + (end - start) / 1000 + "s, input file: " + args[0]
            + ", iterations: " + args[1];
    System.out.println(results);
    String outputurl = "./results.txt";
    BufferedWriter writer = new BufferedWriter(new FileWriter(outputurl, true));
    writer.write(results);
    writer.newLine();
    writer.close();

    ctx.stop();
}
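
Both this example and the univ.bigdata.course example below reduce rank contributions with new Sum(), a class that is not part of the excerpt. In Spark's stock JavaPageRank example it is a plain pairwise adder, so a reasonable reconstruction is:

    // Presumed definition of Sum (matches the standard Spark JavaPageRank example):
    private static class Sum implements Function2<Double, Double, Double> {
        @Override
        public Double call(Double a, Double b) {
            return a + b;
        }
    }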

From source file:streaming.NginxlogSorter.java

License:Apache License

public static void main(String[] args) {
    JavaSparkContext sc = null;
    try {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("NginxlogSorter");
        //   conf.set("hadoop.home.dir", "/usr/local/hadoop/hadoop-2.6.0");

        sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("src/test/resources/nginx_report.txt");

        // NOTE: the three transformations below are lazy and never materialized in
        // this example, because no action (count, collect, ...) is invoked on them.
        lines.map(new Function<String, String>() {

            @Override
            public String call(String s) throws Exception {
                log.info(s);
                return s;
            }

        });

        JavaPairRDD<String, Integer> items = lines.mapToPair(new PairFunction<String, String, Integer>() {

            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                log.info(s);
                return new Tuple2<String, Integer>(s, 1);
            }

        });

        lines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {

            @Override
            public Iterable<Tuple2<String, Integer>> call(String t) throws Exception {

                log.info(">>>: {}", t);
                // return an empty list rather than null; a null Iterable would fail if this RDD were ever computed
                return Collections.<Tuple2<String, Integer>>emptyList();
            }

        });
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (sc != null) { // sc may still be null if the context failed to initialize
            sc.close();
        }
    }
}

From source file:univ.bigdata.course.pagerank.JavaPageRank.java

License:Apache License

public static void calculatePageRank(JavaRDD<String> rdd, int iteration_number) throws Exception {
    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = rdd;

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < iteration_number; current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<>();
                        for (String n : s._1) {
                            results.add(new Tuple2<>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    JavaPairRDD<Double, String> sortedRanks = ranks
            .mapToPair(new PairFunction<Tuple2<String, Double>, Double, String>() {

                @Override
                public Tuple2<Double, String> call(Tuple2<String, Double> t) {
                    return new Tuple2<Double, String>(t._2, t._1);
                }
            }).sortByKey(false);

    // Collects all URL ranks and dump them to console.
    List<Tuple2<Double, String>> output = sortedRanks.takeOrdered(100, PageRankComperator.VALUE_COMP);
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._2() + " has rank: " + tuple._1() + ".");
    }
}

From source file:weka.distributed.spark.CorrelationMatrixSparkJob.java

License:Open Source License

/**
 * Build the correlation matrix and write it to the output destination
 *
 * @param dataset the input RDD dataset to use
 * @param headerWithSummary the header of the data (with summary attributes)
 * @param outputPath the path to write results to
 * @throws Exception if a problem occurs
 */
protected void buildMatrix(JavaRDD<Instance> dataset, final Instances headerWithSummary, String outputPath)
        throws Exception {

    String matrixMapOpts = getCorrelationMapTaskOptions();
    String[] mapOpts = null;
    if (!DistributedJobConfig.isEmpty(matrixMapOpts)) {
        mapOpts = Utils.splitOptions(environmentSubstitute(matrixMapOpts));
    }
    final String[] fMapOpts = mapOpts;

    // construct a temporary map task in order to determine how
    // many rows there will be in the matrix (after deleting any
    // nominal atts and potentially the class att)
    CorrelationMatrixMapTask tempTask = new CorrelationMatrixMapTask();
    if (fMapOpts != null) {
        tempTask.setOptions(fMapOpts.clone());
    }
    final boolean missingReplacedWithMean = !tempTask.getIgnoreMissingValues();
    final boolean covarianceInsteadOfCorrelation = tempTask.getCovariance();
    final boolean deleteClassIfSet = !tempTask.getKeepClassAttributeIfSet();

    tempTask.setup(headerWithSummary);
    final int numRowsInMatrix = tempTask.getMatrix().length;

    JavaPairRDD<Integer, MatrixRowHolder> mapToPartialRows = dataset
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, MatrixRowHolder>() {

                /** For serialization */
                private static final long serialVersionUID = -3024936415666668127L;

                protected List<Tuple2<Integer, MatrixRowHolder>> m_partialRows = new ArrayList<Tuple2<Integer, MatrixRowHolder>>();

                @Override
                public Iterable<Tuple2<Integer, MatrixRowHolder>> call(Iterator<Instance> split)
                        throws DistributedWekaException {

                    CorrelationMatrixMapTask task = new CorrelationMatrixMapTask();
                    try {
                        if (fMapOpts != null) {
                            task.setOptions(fMapOpts);
                        }
                        task.setup(headerWithSummary);

                        while (split.hasNext()) {
                            task.processInstance(split.next());
                        }

                        // output all the rows in this partial matrix
                        double[][] partialMatrix = task.getMatrix();
                        int[][] coOcc = task.getCoOccurrenceCounts();
                        for (int i = 0; i < partialMatrix.length; i++) {
                            double[] row = partialMatrix[i];
                            int[] co = null;
                            if (coOcc != null) {
                                co = coOcc[i];
                            }
                            MatrixRowHolder rh = new MatrixRowHolder(i, row, co);
                            m_partialRows.add(new Tuple2<Integer, MatrixRowHolder>(i, rh));
                        }
                    } catch (Exception ex) {
                        throw new DistributedWekaException(ex);
                    }

                    return m_partialRows;
                }
            }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRowsInMatrix))
            .persist(getCachingStrategy().getStorageLevel());

    JavaPairRDD<Integer, double[]> aggregatedRows = mapToPartialRows.mapPartitionsToPair(
            new PairFlatMapFunction<Iterator<Tuple2<Integer, MatrixRowHolder>>, Integer, double[]>() {

                /** For serialization */
                private static final long serialVersionUID = -1290972198473290092L;
                protected List<Tuple2<Integer, double[]>> result = new ArrayList<Tuple2<Integer, double[]>>();

                @Override
                public Iterable<Tuple2<Integer, double[]>> call(
                        Iterator<Tuple2<Integer, MatrixRowHolder>> split) throws DistributedWekaException {

                    List<double[]> partials = new ArrayList<double[]>();
                    List<int[]> partialCoOcc = new ArrayList<int[]>();
                    int rowNum = -1;
                    while (split.hasNext()) {
                        Tuple2<Integer, MatrixRowHolder> nextRow = split.next();
                        if (rowNum < 0) {
                            rowNum = nextRow._2().getRowNumber();
                        } else if (nextRow._2().getRowNumber() != rowNum) {
                            throw new DistributedWekaException("Was not expecting the matrix row number "
                                    + "to change within a partition!");
                        }

                        // accumulate every partial row, including the first one seen in the partition
                        partials.add(nextRow._2().getRow());
                        if (!missingReplacedWithMean) {
                            partialCoOcc.add(nextRow._2().getCoOccurrencesCounts());
                        }
                    }

                    if (partials.size() > 0) {
                        CorrelationMatrixRowReduceTask reducer = new CorrelationMatrixRowReduceTask();

                        double[] aggregated = reducer.aggregate(rowNum, partials, partialCoOcc,
                                headerWithSummary, missingReplacedWithMean, covarianceInsteadOfCorrelation,
                                deleteClassIfSet);
                        result.add(new Tuple2<Integer, double[]>(rowNum, aggregated));
                    }

                    return result;
                }
            });

    List<Tuple2<Integer, double[]>> reducedRows = aggregatedRows.collect();
    mapToPartialRows.unpersist();

    double[][] m = new double[reducedRows.size()][reducedRows.size()];
    for (Tuple2<Integer, double[]> row : reducedRows) {
        int i = row._1();
        double[] js = row._2();

        for (int j = 0; j < js.length; j++) {
            m[i][j] = js[j];
            m[j][i] = js[j];
        }
    }

    m_finalMatrix = new weka.core.matrix.Matrix(m);

    Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
    try {
        writeMatrixToOutput(outputPath, reducedRows, headerNoSummary, deleteClassIfSet);
    } catch (Exception ex) {
        logMessage(ex);
        throw new DistributedWekaException(ex);
    }

    if (getRunPCA()) {
        runPCA(outputPath, covarianceInsteadOfCorrelation, !deleteClassIfSet, headerWithSummary,
                headerNoSummary);
    }
}
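
Both this job and the k-means job below rely on an IntegerKeyPartitioner to route all partial results for one key into a single partition before the partition-wise reduce. That class is not shown in the excerpt; its presumed behavior is a direct modulo mapping, roughly:

    // Presumed sketch of IntegerKeyPartitioner (not part of the excerpt):
    // sends integer key k to partition k % numPartitions, so with one
    // partition per key each partition sees exactly one key.
    public class IntegerKeyPartitioner extends org.apache.spark.Partitioner {
        private final int numPartitions;

        public IntegerKeyPartitioner(int numPartitions) {
            this.numPartitions = numPartitions;
        }

        @Override
        public int numPartitions() {
            return numPartitions;
        }

        @Override
        public int getPartition(Object key) {
            return ((Integer) key) % numPartitions;
        }
    }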

From source file:weka.distributed.spark.KMeansClustererSparkJob.java

License:Open Source License

/**
 * Perform an iteration of k-means
 *
 * @param dataset the dataset to operate on
 * @param mapTasks the underlying map tasks to use - one for each separate run
 *          of k-means that we're doing in parallel
 * @param converged array indicating which runs have converged
 * @param iterationNum the iteration number that we're up to
 * @param transformedHeaderNoSummary the header of the training data (sans
 *          summary attributes)
 * @return a list of KMeansReduceTasks encapsulating the results of the
 *         iteration for each active run of k-means
 * @throws DistributedWekaException if a problem occurs
 */
protected List<Tuple2<Integer, KMeansReduceTask>> performKMeansIteration(JavaRDD<Instance> dataset,
        final KMeansMapTask[] mapTasks, final boolean[] converged, final int iterationNum,
        final Instances transformedHeaderNoSummary) throws DistributedWekaException {

    final int numRuns = mapTasks.length;

    // keyed by run, a list of partial centroid summary instances
    // - one Instances object for each centroid (may be null if a
    // given centroid did not get any instances assigned to it)
    JavaPairRDD<Integer, List<Instances>> mapRuns = dataset
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, List<Instances>>() {

                /**
                 * For serialization
                 */
                private static final long serialVersionUID = 6063661312796545915L;

                protected List<Tuple2<Integer, List<Instances>>> m_centroidStatsForRuns = new ArrayList<Tuple2<Integer, List<Instances>>>();

                @Override
                public Iterable<Tuple2<Integer, List<Instances>>> call(Iterator<Instance> split)
                        throws DistributedWekaException {

                    while (split.hasNext()) {
                        Instance current = split.next();

                        for (int k = 0; k < numRuns; k++) {
                            if (!converged[k]) {
                                mapTasks[k].processInstance(current);
                            }
                        }
                    }

                    for (int k = 0; k < numRuns; k++) {
                        if (!converged[k]) {
                            List<Instances> centroidStatsForRun = mapTasks[k].getCentroidStats();
                            m_centroidStatsForRuns
                                    .add(new Tuple2<Integer, List<Instances>>(k, centroidStatsForRun));
                        }
                    }

                    return m_centroidStatsForRuns;
                }
            }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns))
            .persist(StorageLevel.MEMORY_AND_DISK());

    mapRuns.count(); // force computation so the persisted partitions are materialized

    // Reduce. Need to aggregate all the cluster stats
    // for each run. Do we repartition into numRuns partitions and then
    // run another mapPartitions phase? With our custom partitioner this
    // should guarantee that a partition only contains the lists of instances
    // for one run. Can't use reduceByKey because CSVReduce is not
    // associative, and needs to see the whole list of summary instances
    // objects for one run, cluster# (need to run a separate reduce for
    // each cluster centroid within each run anyway). Then update the
    // final error for each centroid in each run and the total error
    // (sum of errors over centroids for a run)
    JavaPairRDD<Integer, KMeansReduceTask> reducedByRun = mapRuns.mapPartitionsToPair(
            new PairFlatMapFunction<Iterator<Tuple2<Integer, List<Instances>>>, Integer, KMeansReduceTask>() {

                /**
                 * For serialization
                 */
                private static final long serialVersionUID = -747645603149767637L;

                protected List<Tuple2<Integer, KMeansReduceTask>> m_resultsForRun = new ArrayList<Tuple2<Integer, KMeansReduceTask>>();

                @Override
                public Iterable<Tuple2<Integer, KMeansReduceTask>> call(
                        Iterator<Tuple2<Integer, List<Instances>>> split) throws DistributedWekaException {

                    List<List<Instances>> partialsForRun = new ArrayList<List<Instances>>();
                    int runNumber = -1;

                    while (split.hasNext()) {
                        Tuple2<Integer, List<Instances>> partial = split.next();
                        if (runNumber < 0) {
                            runNumber = partial._1().intValue();
                        } else {
                            if (partial._1().intValue() != runNumber) {
                                throw new DistributedWekaException("[k-means] reduce phase: "
                                        + "was not expecting the run number to change within a "
                                        + "partition!");
                            }
                        }

                        partialsForRun.add(partial._2());
                    }

                    KMeansReduceTask reducer = new KMeansReduceTask();

                    // size might be zero if we are operating on a partition for a
                    // run that has already converged (in which case there will be no
                    // data in this partition)...
                    if (partialsForRun.size() > 0) {
                        reducer.reduceClusters(runNumber, iterationNum, transformedHeaderNoSummary,
                                partialsForRun);
                        m_resultsForRun.add(new Tuple2<Integer, KMeansReduceTask>(runNumber, reducer));
                    }

                    return m_resultsForRun;
                }
            });

    List<Tuple2<Integer, KMeansReduceTask>> runResults = reducedByRun.collect();
    mapRuns.unpersist();
    reducedByRun.unpersist();

    return runResults;
}

From source file:weka.distributed.spark.KMeansClustererSparkJob.java

License:Open Source License

/**
 * Perform the k-means|| initialization process
 *
 * @param dataset the dataset to operate on
 * @param headerWithSummary the header of the data, with summary attributes
 * @param numRuns the number of separate runs of k-means to be performed in
 *          parallel
 * @param numClusters the number of clusters to generate
 * @return a list of Instances objects, where each Instances object contains
 *         the starting points for one run of k-means
 * @throws IOException if a problem occurs
 * @throws DistributedWekaException if a problem occurs
 */
protected List<Instances> initializeWithKMeansParallel(JavaRDD<Instance> dataset, Instances headerWithSummary,
        final int numRuns, int numClusters) throws IOException, DistributedWekaException {

    int numSteps = Integer.parseInt(environmentSubstitute(getKMeansParallelInitSteps()));

    // random seed option
    int randomSeed = 1;
    if (!DistributedJobConfig.isEmpty(getRandomSeed())) {
        try {
            randomSeed = Integer.parseInt(environmentSubstitute(getRandomSeed()));
        } catch (NumberFormatException ex) {
            // don't fuss
        }
    }

    // 1) start with 1 randomly chosen point for each run
    // 2) run sketch for x iterations (aggregating reservoirs for each
    // run at the end of each iteration (i.e. reservoirs for run 1
    // on each split of the data, reservoirs for run 2, etc.)
    // 3) Get final sketch for each run
    // 4) Weight each point in each sketch by the number of points
    // in the data that cluster to it
    // 5) Run local k-means on the weighted data to obtain the final k
    // starting centers

    // Step 1: start with 1 randomly chosen point for each run
    List<Instances> randomSingleCenters = initializeWithRandomCenters(dataset, headerWithSummary, numRuns, 1);

    // Step 2: run sketch for x iterations (aggregating reservoirs for each
    // run at the end of each iteration (i.e. reservoirs for run 1
    // on each split of the data, reservoirs for run 2, etc.)
    Instances tmpTrans = null;
    // one configured task per run (we'll use this for an initial distance
    // function and for step 4 where we need to cluster all the points to
    // get cluster sizes)
    final KMeansMapTask[] mapTasks = new KMeansMapTask[numRuns];
    for (int i = 0; i < numRuns; i++) {
        mapTasks[i] = new KMeansMapTask();

        try {
            mapTasks[i].setOptions(Utils.splitOptions(getKMeansMapTaskOpts()));
        } catch (Exception e) {
            throw new DistributedWekaException(e);
        }
        tmpTrans = mapTasks[i].init(headerWithSummary);
    }

    // transformed header (has passed through filters)
    final Instances transformedHeaderNoSummary = tmpTrans;

    NormalizableDistance distanceFunc = mapTasks[0].getDistanceFunction();
    final CentroidSketch[] sketches = new CentroidSketch[numRuns];
    // initialize sketches
    for (int i = 0; i < numRuns; i++) {
        try {
            // apply any filters
            Instances transformedStartSketch = randomSingleCenters.get(i);
            // mapTasks[0].applyFilters(randomSingleCenters.get(i));

            sketches[i] = new CentroidSketch(transformedStartSketch, distanceFunc, 2 * numClusters,
                    randomSeed + i);
        } catch (Exception ex) {
            logMessage(ex);
            throw new DistributedWekaException(ex);
        }
    }

    // this is used when processing instances in partitions to
    // ensure that each instance from the data set gets
    // filtered appropriately
    final KMeansMapTask forFilteringOnly = mapTasks[0];
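    // (captured by the Spark closures below, so it is shipped to each
    // executor and must be serializable)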

    for (int i = 0; i < numSteps; i++) {
        logMessage("[k-means] Running iteration " + (i + 1) + " of k-means|| initialization procedure.");
        final int iterationNum = i;

        // keyed by run, a list of partial sketches
        // - one CentroidSketch object for each run in each partition
        JavaPairRDD<Integer, CentroidSketch> mapRuns = dataset
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, CentroidSketch>() {

                    /**
                     * For serialization
                     */
                    private static final long serialVersionUID = 6063661312796545915L;

                    protected List<Tuple2<Integer, CentroidSketch>> m_centroidSketchesForRuns = new ArrayList<Tuple2<Integer, CentroidSketch>>();

                    @Override
                    public Iterable<Tuple2<Integer, CentroidSketch>> call(Iterator<Instance> split)
                            throws DistributedWekaException {

                        while (split.hasNext()) {
                            Instance current = split.next();
                            try {
                                // make sure it goes through any filters first!
                                current = forFilteringOnly.applyFilters(current);
                            } catch (Exception ex) {
                                throw new DistributedWekaException(ex);
                            }

                            for (int k = 0; k < numRuns; k++) {
                                sketches[k].process(current, iterationNum == 0);
                            }
                        }

                        for (int k = 0; k < numRuns; k++) {
                            m_centroidSketchesForRuns.add(new Tuple2<Integer, CentroidSketch>(k, sketches[k]));
                        }

                        return m_centroidSketchesForRuns;
                    }
                }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns))
                .persist(StorageLevel.MEMORY_AND_DISK());

        // count() is just an action to force evaluation here, so the
        // repartitioned partial sketches are materialized (and persisted)
        // before the per-run aggregation below reads them
        mapRuns.count();

        // Each partition of mapRuns now contains partials for just one run.
        // Here we aggregate the partials per run.
        JavaPairRDD<Integer, CentroidSketch> reducedByRun = mapRuns.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, CentroidSketch>>, Integer, CentroidSketch>() {

                    /** For serialization */
                    private static final long serialVersionUID = 7689178383188695493L;

                    protected List<Tuple2<Integer, CentroidSketch>> m_resultsForRun = new ArrayList<Tuple2<Integer, CentroidSketch>>();

                    @Override
                    public Iterable<Tuple2<Integer, CentroidSketch>> call(
                            Iterator<Tuple2<Integer, CentroidSketch>> split) throws DistributedWekaException {

                        int runNumber = -1;
                        CentroidSketch initial = null;
                        List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>();

                        while (split.hasNext()) {
                            Tuple2<Integer, CentroidSketch> partial = split.next();
                            if (runNumber < 0) {
                                runNumber = partial._1().intValue();
                            } else {
                                if (partial._1().intValue() != runNumber) {
                                    throw new DistributedWekaException("[k-means] k-means|| initialization: "
                                            + "was not expecting the run number to change within "
                                            + "a partition!");
                                }
                            }

                            if (initial == null) {
                                initial = partial._2();
                            } else {
                                try {
                                    initial.aggregateReservoir(partial._2().getReservoirSample());
                                } catch (Exception e) {
                                    throw new DistributedWekaException(e);
                                }
                            }

                            // get all the distance functions and
                            // compute priming data that has global
                            // min and maxes.
                            if (iterationNum == 0) {
                                // only need to determine global distance function
                                // priming data once (i.e. in the first iteration of
                                // the k-means|| process)
                                distsForRun.add(partial._2().getDistanceFunction());
                            }
                        }

                        // update the distance function with global numeric
                        // attribute ranges
                        if (distsForRun.size() > 0) {
                            Instances distancePrimingData = KMeansReduceTask
                                    .computeDistancePrimingDataFromDistanceFunctions(distsForRun,
                                            transformedHeaderNoSummary);
                            initial.getDistanceFunction().setInstances(distancePrimingData);
                        }

                        m_resultsForRun.add(new Tuple2<Integer, CentroidSketch>(runNumber, initial));

                        return m_resultsForRun;
                    }
                });

        List<Tuple2<Integer, CentroidSketch>> runResults = reducedByRun.collect();
        mapRuns.unpersist();
        mapRuns = null;

        // fold the aggregated per-run sketches back into the local array;
        // the next iteration's map phase closes over the updated sketches
        for (Tuple2<Integer, CentroidSketch> r : runResults) {
            int runNum = r._1().intValue();
            sketches[runNum] = r._2();

            // add the current contents of the reservoir to the sketch
            // for each run
            try {
                sketches[runNum].addReservoirToCurrentSketch();

                if (m_debug) {
                    logMessage("[k-means] Iteration: " + i + " - number of instances in sketch: "
                            + sketches[runNum].getCurrentSketch().numInstances() + "\n"
                            + sketches[runNum].getCurrentSketch());
                }
            } catch (Exception ex) {
                logMessage(ex);
                throw new DistributedWekaException(ex);
            }
        }
        reducedByRun.unpersist();
    }

    // perform and aggregate clustering using the final sketch results
    // so that we can find out how many points are assigned to
    // each instance in the sketch.
    Instances globalPriming = sketches[0].getDistanceFunction().getInstances();
    if (globalPriming.numInstances() != 2) {
        logMessage("[k-means] Error: as expecting a two instance "
                + "(global priming data) dataset to be set in the distance function " + "in each sketch!");
        throw new DistributedWekaException("Was expecting a two instance (global priming data)"
                + " dataset to be set in the distance function in each sketch!");
    }
    for (int i = 0; i < numRuns; i++) {
        // set sketches as centers for map tasks
        // in preparation for clustering (so that we can
        // find out how many training points get assigned to
        // each center)

        mapTasks[i].setCentroids(sketches[i].getCurrentSketch());
        mapTasks[i].setDummyDistancePrimingData(globalPriming);
    }

    // 3 & 4) Get final sketch for each run and weight each point in
    // the sketch by the number of training instances that cluster to it
    List<Tuple2<Integer, KMeansReduceTask>> clusterAssignments = performKMeansIteration(dataset, mapTasks,
            new boolean[numRuns], 1, transformedHeaderNoSummary);

    List<Instances> finalStartPointsForRuns = new ArrayList<Instances>();
    for (int i = 0; i < numRuns; i++) {
        int rN = clusterAssignments.get(i)._1().intValue();
        List<Instances> centroidSummaries = clusterAssignments.get(i)._2().getAggregatedCentroidSummaries();

        Instances sketchForRun = sketches[i].getCurrentSketch();

        // empty clusters shouldn't be a problem - in
        // one iteration each sketch member should at minimum
        // have itself assigned (i.e. count >= 1). NOTE: The only exception
        // could occur if the sketch contains duplicate instances. However,
        // this shouldn't happen within a single WeightedReservoirSampling
        // as candidate instances with weight 0 (i.e. distance 0 to the sketch
        // in this case) are never added to the sketch.
        if (centroidSummaries.size() != sketchForRun.numInstances()) {
            logMessage("[k-means] Error: was expecting as " + "many summary headers as \n"
                    + "there are center candidates in the sketch for run " + rN);
            throw new DistributedWekaException("Was expecting as many summary headers as "
                    + "there are center candidates in the sketch for run " + rN);
        }

        for (int j = 0; j < sketchForRun.numInstances(); j++) {
            Instance centerCandidate = sketchForRun.instance(j);
            Instances centerStats = centroidSummaries.get(j);
            double weightForCandidate = -1.0;
            // grab the summary stats for the first numeric or nominal
            // attribute and derive the assignment count from them
            for (int k = 0; k < sketchForRun.numAttributes(); k++) {

                if (sketchForRun.attribute(k).isNumeric()) {
                    Attribute statsAtt = centerStats
                            .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
                                    + sketchForRun.attribute(k).name());
                    weightForCandidate = ArffSummaryNumericMetric.COUNT.valueFromAttribute(statsAtt)
                            + ArffSummaryNumericMetric.MISSING.valueFromAttribute(statsAtt);
                    break;
                } else if (sketchForRun.attribute(k).isNominal()) {
                    Attribute statsAtt = centerStats
                            .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
                                    + sketchForRun.attribute(k).name());
                    NominalStats ns = NominalStats.attributeToStats(statsAtt);
                    weightForCandidate = 0;
                    for (String s : ns.getLabels()) {
                        weightForCandidate += ns.getCount(s);
                    }
                    weightForCandidate += ns.getNumMissing();
                }
            }

            if (weightForCandidate < 0) {
                logMessage("[k-means] Error: unable to compute the " + "number of training instances "
                        + "assigned to sketch member " + j + " in run " + i);
                throw new DistributedWekaException("Unable to compute the number of training instances "
                        + "assigned to sketch member " + j + " in run " + i);
            }

            // finally - set the weight
            centerCandidate.setWeight(weightForCandidate);
        }

        if (m_debug) {
            logMessage("Final weighted sketch (run " + i + ") prior to local KMeans:\n" + sketchForRun);
        }

        // now run standard k-means on the weighted sketch to
        // (hopefully) get the requested number of start points
        SimpleKMeans localKMeans = new SimpleKMeans();
        try {
            localKMeans.setNumClusters(numClusters);
            localKMeans.setInitializationMethod(
                    new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
            localKMeans.buildClusterer(sketchForRun);
            finalStartPointsForRuns.add(localKMeans.getClusterCentroids());
        } catch (Exception ex) {
            logMessage(ex);
            throw new DistributedWekaException(ex);
        }
    }

    m_distanceFunctionPrimingData = globalPriming;

    return finalStartPointsForRuns;
}
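
For reference, the final refinement step above (weight each sketch candidate, then run plain k-means on the weighted candidates) can be exercised on its own. The following is a minimal, self-contained illustration with synthetic two-dimensional candidates and made-up weights standing in for the assignment counts computed by the job; it assumes a Weka version whose SimpleKMeans honours instance weights and offers k-means++ seeding, as the code above does.

import java.util.ArrayList;
import java.util.Random;

import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SelectedTag;

public class WeightedLocalKMeans {

    public static void main(String[] args) throws Exception {
        // build a tiny numeric dataset of candidate centers (synthetic data)
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("x"));
        atts.add(new Attribute("y"));
        Instances candidates = new Instances("candidates", atts, 20);

        Random rand = new Random(1);
        for (int i = 0; i < 20; i++) {
            Instance inst = new DenseInstance(2);
            inst.setValue(0, rand.nextGaussian());
            inst.setValue(1, rand.nextGaussian());
            // weight each candidate by a stand-in for the number of
            // training instances that cluster to it
            inst.setWeight(1 + rand.nextInt(50));
            candidates.add(inst);
        }

        // run local k-means with k-means++ seeding on the weighted candidates
        SimpleKMeans localKMeans = new SimpleKMeans();
        localKMeans.setNumClusters(3);
        localKMeans.setInitializationMethod(
                new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
        localKMeans.buildClusterer(candidates);

        // the centroids would serve as the starting points for one run
        System.out.println(localKMeans.getClusterCentroids());
    }
}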