Example usage for org.apache.spark.api.java.function.PairFlatMapFunction

List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction

Introduction

On this page you can find example usages of the org.apache.spark.api.java.function.PairFlatMapFunction interface.

Prototype

PairFlatMapFunction
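
The interface declares a single call method. Its return type changed between Spark versions (Iterable in Spark 1.x, Iterator from Spark 2.0 onwards), which is why the examples below differ in what they return:

public interface PairFlatMapFunction<T, K, V> extends Serializable {
    // Spark 1.x signature:
    Iterable<Tuple2<K, V>> call(T t) throws Exception;

    // Spark 2.x and later:
    // Iterator<Tuple2<K, V>> call(T t) throws Exception;
}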

Usage

From source file:org.springframework.data.hadoop.batch.spark.test.SparkHashtags.java

License:Apache License

public static void main(String[] args) {
    String fileName = "";
    if (args.length > 0) {
        fileName = args[0];
    }

    SparkConf conf = new SparkConf().setAppName("spark-hashtags");
    JavaSparkContext sc = new JavaSparkContext(conf);
    JavaRDD<String> tweetData = sc.textFile(fileName).cache();

    // jsonMapper is assumed to be a Jackson ObjectMapper held in a static field of the class (not shown in this excerpt)
    JavaRDD<Map<String, Object>> tweets = tweetData.map(new Function<String, Map<String, Object>>() {
        public Map<String, Object> call(String s) throws Exception {
            return jsonMapper.readValue(s, new TypeReference<HashMap<String, Object>>() {
            });
        }
    });

    JavaPairRDD<String, Integer> hashTags = tweets
            .flatMapToPair(new PairFlatMapFunction<Map<String, Object>, String, Integer>() {
                public Iterable<Tuple2<String, Integer>> call(Map<String, Object> tweet) throws Exception {

                    Map<String, Object> entities = (Map<String, Object>) tweet.get("entities");
                    List<Map<String, Object>> hashTagEntries = null;
                    if (entities != null) {
                        hashTagEntries = (List<Map<String, Object>>) entities.get("hashtags");
                    }
                    List<Tuple2<String, Integer>> hashTags = new ArrayList<Tuple2<String, Integer>>();
                    if (hashTagEntries != null && hashTagEntries.size() > 0) {
                        for (Map<String, Object> hashTagEntry : hashTagEntries) {
                            String hashTag = hashTagEntry.get("text").toString();
                            hashTags.add(new Tuple2<String, Integer>(hashTag, 1));
                        }
                    }
                    return hashTags;
                }
            });

    JavaPairRDD<String, Integer> hashTagCounts = hashTags
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer int1, Integer int2) throws Exception {
                    return int1 + int2;
                }
            });

    JavaPairRDD<String, Integer> hashTagCountsSorted = hashTagCounts
            .mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                public Tuple2<Integer, String> call(Tuple2<String, Integer> in) throws Exception {
                    return new Tuple2<Integer, String>(in._2(), in._1());
                }
            }).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                public Tuple2<String, Integer> call(Tuple2<Integer, String> in) throws Exception {
                    return new Tuple2<String, Integer>(in._2(), in._1());
                }
            });

    List<Tuple2<String, Integer>> top10 = hashTagCountsSorted.take(10);
    System.out.println("HashTags: " + top10);

    JavaPairRDD<String, Integer> top10Hashtags = sc.parallelizePairs(top10);
    top10Hashtags.saveAsTextFile("hdfs:///test/spark/output");

    sc.stop();
}
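
On Spark 1.x (where PairFlatMapFunction.call returns an Iterable, as above) the same flatMapToPair stage can be written more compactly as a lambda; on Spark 2.x the lambda must return an Iterator instead. A minimal sketch:

    // Lambda sketch of the flatMapToPair stage above (Spark 1.x signature)
    JavaPairRDD<String, Integer> hashTags = tweets.flatMapToPair(tweet -> {
        List<Tuple2<String, Integer>> out = new ArrayList<Tuple2<String, Integer>>();
        Map<String, Object> entities = (Map<String, Object>) tweet.get("entities");
        if (entities != null) {
            List<Map<String, Object>> entries = (List<Map<String, Object>>) entities.get("hashtags");
            if (entries != null) {
                for (Map<String, Object> entry : entries) {
                    out.add(new Tuple2<String, Integer>(entry.get("text").toString(), 1));
                }
            }
        }
        return out; // Spark 2.x: return out.iterator();
    });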

From source file:PolygonMatching.MatchingGeoPolygon.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local");
    SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    String dataSource1 = args[0];
    String dataSource2 = args[1];
    final double thresholdLinguistic = Double.parseDouble(args[2]);
    final double thresholdPolygon = Double.parseDouble(args[3]);
    String outputPath = args[4];
    Integer amountPartition = Integer.parseInt(args[5]);
    String sourceType = args[6];

    DataSource dataSourcePref = null;
    DataSource dataSourceOSM = null;
    if (sourceType.equals("CSV")) {
        dataSourcePref = AbstractExec.getDataCSV(dataSource1, ';');
        dataSourceOSM = AbstractExec.getDataCSV(dataSource2, ';');
    } else { //is postgis
        dataSourcePref = AbstractExec.getDataPostGres(dataSource1);
        dataSourceOSM = AbstractExec.getDataPostGres(dataSource2);
    }

    //      DataSource dataSourcePref = AbstractExec.getDataPostGres(dataSource1); //squaresOfCuritiba Pref
    //      DataSource dataSourceOSM = AbstractExec.getDataPostGres(dataSource2); //squaresOfCuritiba OSM

    //      DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/squares_pref_curitiba.txt"); //squaresOfCuritiba Pref
    //      DataSource dataSourceOSM = AbstractExec.getDataPostGres("queries/osm_curitiba.txt"); //squaresOfCuritiba OSM

    //      DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/parks_pref_ny.txt"); //parksOfNY Pref
    //      DataSource dataSourceOSM = AbstractExec.getDataPostGres("queries/osm_ny.txt"); //parksOfNY OSM

    StorageManager storagePref = new StorageManager();
    StorageManager storageOSM = new StorageManager();

    // enables in-memory execution for faster processing
    // this can be done since the whole data fits into memory
    storagePref.enableInMemoryProcessing();
    storageOSM.enableInMemoryProcessing();

    // adds the "data" to the algorithm
    storagePref.addDataSource(dataSourcePref);
    storageOSM.addDataSource(dataSourceOSM);

    if (!storagePref.isDataExtracted()) {
        storagePref.extractData();
    }
    if (!storageOSM.isDataExtracted()) {
        storageOSM.extractData();
    }

    List<GeoPolygon> geoentitiesPref = new ArrayList<GeoPolygon>();
    List<GeoPolygon> geoentitiesOSM = new ArrayList<GeoPolygon>();

    // the algorithm returns each generated pair step-by-step
    int indexOfPref = 0;
    for (GenericObject genericObj : storagePref.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) { // for Curitiba use attribute "nome", for New York "signname"
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString()); // for Curitiba use attribute "gid", for New York "id"
            geoentitiesPref.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.GOV_POLYGON, indexOfPref, id));
            indexOfPref++;
        }

    }

    int indexOfOSM = 0;
    for (GenericObject genericObj : storageOSM.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) {
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString());
            geoentitiesOSM.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.OSM_POLYGON, indexOfOSM, id));
            indexOfOSM++;
        }

    }

    JavaRDD<GeoPolygon> polygonsOSM = ctx.parallelize(geoentitiesOSM);
    JavaRDD<GeoPolygon> polygonsPref = ctx.parallelize(geoentitiesPref);

    JavaRDD<GeoPolygon> polygons = polygonsPref.union(polygonsOSM);

    final Broadcast<Integer> numReplication = ctx.broadcast(amountPartition);
    JavaRDD<Tuple2<Integer, GeoPolygon>> polygonLabed = polygons
            .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<Integer, GeoPolygon>>() {

                public Iterator<Tuple2<Integer, GeoPolygon>> call(GeoPolygon s) throws Exception {
                    List<Tuple2<Integer, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<Integer, GeoPolygon>>();
                    if (s.getType().equals(InputTypes.OSM_POLYGON)) {
                        listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(
                                s.getIdGeometry() % numReplication.getValue(), s));
                        return listOfPolygonTuple.iterator();
                    } else { //equals to InputTypes.GOV_POLYGON
                        for (int i = 0; i < numReplication.value(); i++) {
                            listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(i, s));
                        }
                        return listOfPolygonTuple.iterator();
                    }
                }

            });

    JavaPairRDD<Integer, GeoPolygon> polygonsPaired = polygonLabed
            .mapToPair(new PairFunction<Tuple2<Integer, GeoPolygon>, Integer, GeoPolygon>() {

                public Tuple2<Integer, GeoPolygon> call(Tuple2<Integer, GeoPolygon> tuple) throws Exception {
                    return new Tuple2<Integer, GeoPolygon>(tuple._1(), tuple._2());
                }
            });

    JavaPairRDD<Integer, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition); // number of partitions

    JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair(
            new PairFlatMapFunction<Tuple2<Integer, Iterable<GeoPolygon>>, Integer, PolygonPair>() {

                public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<Integer, Iterable<GeoPolygon>> tuple)
                        throws Exception {
                    List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator());
                    List<GeoPolygon> polygonsSource = new ArrayList<GeoPolygon>();
                    List<GeoPolygon> polygonsTarget = new ArrayList<GeoPolygon>();
                    for (GeoPolygon entity : polygonsPerKey) {
                        if (entity.getType() == InputTypes.OSM_POLYGON) {
                            polygonsSource.add(entity);
                        } else {
                            polygonsTarget.add(entity);
                        }
                    }

                    List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>();
                    JaccardSimilarity jaccard = new JaccardSimilarity();
                    for (GeoPolygon entSource : polygonsSource) {
                        for (GeoPolygon entTarget : polygonsTarget) {
                            double linguisticSimilarity = 0.0;
                            //calculate the linguistic similarity
                            if (!entTarget.getGeoName().isEmpty()) {
                                linguisticSimilarity = jaccard.getSimilarity(
                                        entTarget.getGeoName().toLowerCase(),
                                        entSource.getGeoName().toLowerCase());
                            }

                            //calculate the polygon similarity
                            double polygonSimilarity = entSource.getPolygonSimilarity(entTarget);

                            //classification of pairs
                            PolygonPair pair;
                            if (linguisticSimilarity > thresholdLinguistic
                                    && polygonSimilarity > thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.MATCH);
                            } else if (linguisticSimilarity < thresholdLinguistic
                                    && polygonSimilarity < thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.NON_MATCH);
                            } else {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM);
                            }

                            // for use case 04: keep only MATCH and POSSIBLE_PROBLEM pairs
                            if (pair.getPolygonClassification().equals(PolygonClassification.POSSIBLE_PROBLEM)
                                    || pair.getPolygonClassification().equals(PolygonClassification.MATCH)) {
                                int index = entityMatches.size();
                                entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair));
                            }

                        }
                    }
                    return entityMatches.iterator();
                }
            });

    matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() {

        public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception {
            ArrayList<String> listOutput = new ArrayList<String>();
            listOutput.add(t._2().toStringCSV());
            return listOutput.iterator();
        }

    }).saveAsTextFile(outputPath);

    ctx.stop();
    ctx.close();
}
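
The flatMap stage above implements a fragment-and-replicate join: each OSM polygon is routed to exactly one bucket via getIdGeometry() % numReplication, while every government polygon is copied into all buckets, so each group produced by groupByKey contains every cross-source candidate pair for its bucket. A stripped-down sketch of that keying logic (the helper name is illustrative, not from the source):

    // Illustrative helper mirroring the keying in the flatMap above:
    // OSM polygons are fragmented, government polygons are replicated.
    static List<Tuple2<Integer, GeoPolygon>> keyForJoin(GeoPolygon p, int numBuckets) {
        List<Tuple2<Integer, GeoPolygon>> out = new ArrayList<Tuple2<Integer, GeoPolygon>>();
        if (p.getType().equals(InputTypes.OSM_POLYGON)) {
            out.add(new Tuple2<Integer, GeoPolygon>(p.getIdGeometry() % numBuckets, p)); // one bucket
        } else {
            for (int i = 0; i < numBuckets; i++) {
                out.add(new Tuple2<Integer, GeoPolygon>(i, p)); // all buckets
            }
        }
        return out;
    }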

From source file:SingleMatchingGeoPolygon.SingleMatchingGeoPolygon.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local");
    SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    String dataSource = args[0];
    final double thresholdLinguistic = Double.parseDouble(args[1]);
    final double thresholdPolygon = Double.parseDouble(args[2]);
    String outputPath = args[3];
    Integer amountPartition = Integer.parseInt(args[4]);
    String sourceType = args[5];

    DataSource source1 = null;
    if (sourceType.equals("CSV")) {
        source1 = AbstractExec.getDataCSV(dataSource, ';');
    } else { //is postgis
        source1 = AbstractExec.getDataPostGres(dataSource);
    }

    ReadAbstractSource reader = new ReadAbstractSource();
    StorageManager storagePolygon = reader.readFile(source1);

    List<GeoPolygon> geoentities = new ArrayList<GeoPolygon>();

    int index = 0;
    for (GenericObject genericObj : storagePolygon.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) {
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString());
            geoentities.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.OSM_POLYGON, index, id));
            index++;
        }
    }

    JavaRDD<GeoPolygon> polygons = ctx.parallelize(geoentities);

    final Broadcast<Integer> numReplication = ctx.broadcast(amountPartition);
    JavaRDD<Tuple2<Integer, GeoPolygon>> polygonLabed = polygons
            .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<Integer, GeoPolygon>>() {

                public Iterator<Tuple2<Integer, GeoPolygon>> call(GeoPolygon s) throws Exception {
                    List<Tuple2<Integer, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<Integer, GeoPolygon>>();
                    GeoPolygon tocompare = s.getGeoPolygon();
                    tocompare.setDuplicated(false);
                    listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(
                            tocompare.getIdGeometry() % numReplication.getValue(), tocompare)); // entity that is not replicated

                    GeoPolygon duplicated = s.getGeoPolygon();
                    duplicated.setDuplicated(true);
                    for (int i = 0; i < numReplication.value(); i++) { // entities that will be replicated
                        listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(i, duplicated));
                    }
                    return listOfPolygonTuple.iterator();
                }

            });

    JavaPairRDD<Integer, GeoPolygon> polygonsPaired = polygonLabed
            .mapToPair(new PairFunction<Tuple2<Integer, GeoPolygon>, Integer, GeoPolygon>() {

                public Tuple2<Integer, GeoPolygon> call(Tuple2<Integer, GeoPolygon> tuple) throws Exception {
                    return new Tuple2<Integer, GeoPolygon>(tuple._1(), tuple._2());
                }
            });

    JavaPairRDD<Integer, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition); // number of partitions

    JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair(
            new PairFlatMapFunction<Tuple2<Integer, Iterable<GeoPolygon>>, Integer, PolygonPair>() {

                public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<Integer, Iterable<GeoPolygon>> tuple)
                        throws Exception {
                    List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator());
                    List<GeoPolygon> polygonsToCompare = new ArrayList<GeoPolygon>();
                    List<GeoPolygon> polygonsDuplicated = new ArrayList<GeoPolygon>();
                    for (GeoPolygon entity : polygonsPerKey) {
                        if (entity.isDuplicated()) {
                            polygonsDuplicated.add(entity);
                        } else {
                            polygonsToCompare.add(entity);
                        }
                    }

                    List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>();
                    JaccardSimilarity jaccard = new JaccardSimilarity();
                    for (GeoPolygon entSource : polygonsToCompare) {
                        for (GeoPolygon entTarget : polygonsDuplicated) {
                            double linguisticSimilarity = 0.0;
                            //calculate the linguistic similarity
                            if (!entTarget.getGeoName().isEmpty()) {
                                linguisticSimilarity = jaccard.getSimilarity(
                                        entTarget.getGeoName().toLowerCase(),
                                        entSource.getGeoName().toLowerCase());
                            }

                            //calculate the polygon similarity
                            double polygonSimilarity = entSource.getPolygonSimilarity(entTarget);

                            //classification of pairs
                            PolygonPair pair;
                            if (linguisticSimilarity > thresholdLinguistic
                                    && polygonSimilarity > thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.MATCH);
                            } else if (linguisticSimilarity < thresholdLinguistic
                                    && polygonSimilarity < thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.NON_MATCH);
                            } else {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM);
                            }

                            // for use case 04: keep only MATCH pairs between two distinct polygons
                            if (pair.getPolygonClassification().equals(PolygonClassification.MATCH) && (pair
                                    .getSource().getIdInDataset() != pair.getTarget().getIdInDataset())) {
                                int index = entityMatches.size();
                                entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair));
                            }

                        }
                    }
                    return entityMatches.iterator();
                }
            });

    matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() {

        public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception {
            ArrayList<String> listOutput = new ArrayList<String>();
            listOutput.add(t._2().toStringCSV());
            return listOutput.iterator();
        }

    }).saveAsTextFile(outputPath);

    ctx.stop();
    ctx.close();
}

From source file:SingleMatchingGeoPolygon.SingleMatchingGeoPolygonBlocked.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local");
    SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    String dataSource = args[0];
    final double thresholdLinguistic = Double.parseDouble(args[1]);
    final double thresholdPolygon = Double.parseDouble(args[2]);
    String outputPath = args[3];
    Integer amountPartition = Integer.parseInt(args[4]);
    String sourceType = args[5];

    DataSource source1 = null;
    if (sourceType.equals("CSV")) {
        source1 = AbstractExec.getDataCSV(dataSource, ';');
    } else { //is postgis
        source1 = AbstractExec.getDataPostGres(dataSource);
    }

    ReadAbstractSource reader = new ReadAbstractSource();
    StorageManager storagePolygon = reader.readFile(source1);
    //      StorageManager storagePolygon = reader.readFile(AbstractExec.getDataPostGres("queries/osm_curitiba.txt"));
    //      StorageManager storagePolygon = reader.readFile(AbstractExec.getDataPostGres("queries/squares_pref_curitiba.txt"));

    List<GeoPolygon> geoentities = new ArrayList<GeoPolygon>();

    int index = 0;
    for (GenericObject genericObj : storagePolygon.getExtractedData()) {
        String nome = "";
        Integer id;
        if (!genericObj.getData().get("name").toString().equals("null")) {
            nome = genericObj.getData().get("name").toString();
            id = Integer.parseInt(genericObj.getData().get("id").toString());
            geoentities.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome,
                    InputTypes.OSM_POLYGON, index, id));
            index++;
        }
    }

    JavaRDD<GeoPolygon> polygons = ctx.parallelize(geoentities);

    Broadcast<Integer> numReplication = ctx.broadcast(amountPartition);
    JavaRDD<Tuple2<String, GeoPolygon>> polygonLabed = polygons
            .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<String, GeoPolygon>>() {

                public Iterator<Tuple2<String, GeoPolygon>> call(GeoPolygon s) throws Exception {
                    List<Tuple2<String, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<String, GeoPolygon>>();
                    GeoPolygon tocompare = s.getGeoPolygon();
                    tocompare.setDuplicated(false);
                    if (tocompare.getGeoName().length() < 3) {
                        listOfPolygonTuple
                                .add(new Tuple2<String, GeoPolygon>(tocompare.getGeoName(), tocompare)); // entity that is not replicated
                    } else {
                        listOfPolygonTuple.add(new Tuple2<String, GeoPolygon>(
                                tocompare.getGeoName().substring(0, 3), tocompare)); // entity that is not replicated
                    }

                    GeoPolygon duplicated = s.getGeoPolygon();
                    duplicated.setDuplicated(true);
                    if (duplicated.getGeoName().length() < 3) {
                        listOfPolygonTuple
                                .add(new Tuple2<String, GeoPolygon>(duplicated.getGeoName(), duplicated));
                    } else {
                        listOfPolygonTuple.add(new Tuple2<String, GeoPolygon>(
                                duplicated.getGeoName().substring(0, 3), duplicated));
                    }

                    return listOfPolygonTuple.iterator();
                }

            });

    JavaPairRDD<String, GeoPolygon> polygonsPaired = polygonLabed
            .mapToPair(new PairFunction<Tuple2<String, GeoPolygon>, String, GeoPolygon>() {

                public Tuple2<String, GeoPolygon> call(Tuple2<String, GeoPolygon> tuple) throws Exception {
                    return new Tuple2<String, GeoPolygon>(tuple._1(), tuple._2());
                }
            });

    JavaPairRDD<String, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition); // number of partitions

    JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair(
            new PairFlatMapFunction<Tuple2<String, Iterable<GeoPolygon>>, Integer, PolygonPair>() {

                public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<String, Iterable<GeoPolygon>> tuple)
                        throws Exception {
                    List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator());
                    List<GeoPolygon> polygonsToCompare = new ArrayList<GeoPolygon>();
                    List<GeoPolygon> polygonsDuplicated = new ArrayList<GeoPolygon>();
                    for (GeoPolygon entity : polygonsPerKey) {
                        if (entity.isDuplicated()) {
                            polygonsDuplicated.add(entity);
                        } else {
                            polygonsToCompare.add(entity);
                        }
                    }

                    List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>();
                    JaccardSimilarity jaccard = new JaccardSimilarity();
                    for (GeoPolygon entSource : polygonsToCompare) {
                        for (GeoPolygon entTarget : polygonsDuplicated) {
                            double linguisticSimilarity = 0.0;
                            //calculate the linguistic similarity
                            if (!entTarget.getGeoName().isEmpty()) {
                                linguisticSimilarity = jaccard.getSimilarity(
                                        entTarget.getGeoName().toLowerCase(),
                                        entSource.getGeoName().toLowerCase());
                            }

                            //calculate the polygon similarity
                            double polygonSimilarity = entSource.getPolygonSimilarity(entTarget);

                            //classification of pairs
                            PolygonPair pair;
                            if (linguisticSimilarity > thresholdLinguistic
                                    && polygonSimilarity > thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.MATCH);
                            } else if (linguisticSimilarity < thresholdLinguistic
                                    && polygonSimilarity < thresholdPolygon) {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.NON_MATCH);
                            } else {
                                pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                        polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM);
                            }

                            // for use case 04: keep only MATCH pairs between two distinct polygons
                            if (pair.getPolygonClassification().equals(PolygonClassification.MATCH) && (pair
                                    .getSource().getIdInDataset() != pair.getTarget().getIdInDataset())) {
                                int index = entityMatches.size();
                                entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair));
                            }

                        }
                    }
                    return entityMatches.iterator();
                }
            });

    matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() {

        public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception {
            ArrayList<String> listOutput = new ArrayList<String>();
            listOutput.add(t._2().toStringCSV());
            return listOutput.iterator();
        }

    }).saveAsTextFile(outputPath);

    ctx.stop();
    ctx.close();
}
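
Unlike the modulo-replication variants above, this job blocks on the first three characters of the polygon name, so only polygons sharing a name prefix are ever compared. The blocking key reduces to a one-liner (illustrative helper, not from the source):

    // Illustrative blocking key: the first three characters of the name,
    // or the whole name when it is shorter than three characters.
    static String blockingKey(String geoName) {
        return geoName.length() < 3 ? geoName : geoName.substring(0, 3);
    }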

From source file:SparkExamples.SparkPR.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 4) { // args[3] is read below as the partition count
        System.err.println("Usage: SparkPR <file> <number_of_iterations> <...> <number_of_partitions>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("SparkPR");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    long start = System.currentTimeMillis();

    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    int partition = Integer.parseInt(args[3]);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            parts[0] = parts[0].toString()
                    .replaceAll("AAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZ", "");
            parts[1] = parts[1].toString()
                    .replaceAll("AAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZ", "");
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).groupByKey(partition).persist(StorageLevel.MEMORY_AND_DISK());

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    ranks.foreach(p -> System.out.println(p));

    long end = System.currentTimeMillis();
    System.out.println("running time " + (end - start) / 1000 + "s");

    String results = "running time " + (end - start) / 1000 + "s, input file: " + args[0]
            + ", iterations: " + args[1];
    System.out.println(results);
    String outputurl = "./results.txt";
    BufferedWriter writer = new BufferedWriter(new FileWriter(outputurl, true));
    writer.write(results);
    writer.newLine();
    writer.close();

    ctx.stop();
}
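
Both this example and the univ.bigdata.course example below reduce rank contributions with new Sum(), a class that is not part of the excerpt. In Spark's stock JavaPageRank example it is a plain pairwise adder, so a reasonable reconstruction is:

    // Presumed definition of Sum (matches the standard Spark JavaPageRank example):
    private static class Sum implements Function2<Double, Double, Double> {
        @Override
        public Double call(Double a, Double b) {
            return a + b;
        }
    }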

From source file:streaming.NginxlogSorter.java

License:Apache License

public static void main(String[] args) {
    JavaSparkContext sc = null;
    try {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("NginxlogSorter");
        //   conf.set("hadoop.home.dir", "/usr/local/hadoop/hadoop-2.6.0");

        sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("src/test/resources/nginx_report.txt");

        // NOTE: the three transformations below are lazy and never materialized in
        // this example, because no action (count, collect, ...) is invoked on them.
        lines.map(new Function<String, String>() {

            @Override
            public String call(String s) throws Exception {
                log.info(s);
                return s;
            }

        });

        JavaPairRDD<String, Integer> items = lines.mapToPair(new PairFunction<String, String, Integer>() {

            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                log.info(s);
                return new Tuple2<String, Integer>(s, 1);
            }

        });

        lines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {

            @Override
            public Iterable<Tuple2<String, Integer>> call(String t) throws Exception {

                log.info(">>>: {}", t);
                // return an empty list rather than null; a null Iterable would fail if this RDD were ever computed
                return Collections.<Tuple2<String, Integer>>emptyList();
            }

        });
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (sc != null) { // sc may still be null if the context failed to initialize
            sc.close();
        }
    }
}

From source file:univ.bigdata.course.pagerank.JavaPageRank.java

License:Apache License

public static void calculatePageRank(JavaRDD<String> rdd, int iteration_number) throws Exception {
    // Loads in input file. It should be in format of:
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     URL         neighbor URL
    //     ...
    JavaRDD<String> lines = rdd;

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < iteration_number; current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<>();
                        for (String n : s._1) {
                            results.add(new Tuple2<>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    JavaPairRDD<Double, String> sortedRanks = ranks
            .mapToPair(new PairFunction<Tuple2<String, Double>, Double, String>() {

                @Override
                public Tuple2<Double, String> call(Tuple2<String, Double> t) {
                    return new Tuple2<Double, String>(t._2, t._1);
                }
            }).sortByKey(false);

    // Collects all URL ranks and dump them to console.
    List<Tuple2<Double, String>> output = sortedRanks.takeOrdered(100, PageRankComperator.VALUE_COMP);
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._2() + " has rank: " + tuple._1() + ".");
    }
}

From source file:weka.distributed.spark.CorrelationMatrixSparkJob.java

License:Open Source License

/**
 * Build the correlation matrix and write it to the output destination
 *
 * @param dataset the input RDD dataset to use
 * @param headerWithSummary the header of the data (with summary attributes)
 * @param outputPath the path to write results to
 * @throws Exception if a problem occurs
 */
protected void buildMatrix(JavaRDD<Instance> dataset, final Instances headerWithSummary, String outputPath)
        throws Exception {

    String matrixMapOpts = getCorrelationMapTaskOptions();
    String[] mapOpts = null;
    if (!DistributedJobConfig.isEmpty(matrixMapOpts)) {
        mapOpts = Utils.splitOptions(environmentSubstitute(matrixMapOpts));
    }
    final String[] fMapOpts = mapOpts;

    // construct a temporary map task in order to determine how
    // many rows there will be in the matrix (after deleting any
    // nominal atts and potentially the class att)
    CorrelationMatrixMapTask tempTask = new CorrelationMatrixMapTask();
    if (fMapOpts != null) {
        tempTask.setOptions(fMapOpts.clone());
    }
    final boolean missingReplacedWithMean = !tempTask.getIgnoreMissingValues();
    final boolean covarianceInsteadOfCorrelation = tempTask.getCovariance();
    final boolean deleteClassIfSet = !tempTask.getKeepClassAttributeIfSet();

    tempTask.setup(headerWithSummary);
    final int numRowsInMatrix = tempTask.getMatrix().length;

    JavaPairRDD<Integer, MatrixRowHolder> mapToPartialRows = dataset
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, MatrixRowHolder>() {

                /** For serialization */
                private static final long serialVersionUID = -3024936415666668127L;

                protected List<Tuple2<Integer, MatrixRowHolder>> m_partialRows = new ArrayList<Tuple2<Integer, MatrixRowHolder>>();

                @Override
                public Iterable<Tuple2<Integer, MatrixRowHolder>> call(Iterator<Instance> split)
                        throws DistributedWekaException {

                    CorrelationMatrixMapTask task = new CorrelationMatrixMapTask();
                    try {
                        if (fMapOpts != null) {
                            task.setOptions(fMapOpts);
                        }
                        task.setup(headerWithSummary);

                        while (split.hasNext()) {
                            task.processInstance(split.next());
                        }

                        // output all the rows in this partial matrix
                        double[][] partialMatrix = task.getMatrix();
                        int[][] coOcc = task.getCoOccurrenceCounts();
                        for (int i = 0; i < partialMatrix.length; i++) {
                            double[] row = partialMatrix[i];
                            int[] co = null;
                            if (coOcc != null) {
                                co = coOcc[i];
                            }
                            MatrixRowHolder rh = new MatrixRowHolder(i, row, co);
                            m_partialRows.add(new Tuple2<Integer, MatrixRowHolder>(i, rh));
                        }
                    } catch (Exception ex) {
                        throw new DistributedWekaException(ex);
                    }

                    return m_partialRows;
                }
            }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRowsInMatrix))
            .persist(getCachingStrategy().getStorageLevel());

    JavaPairRDD<Integer, double[]> aggregatedRows = mapToPartialRows.mapPartitionsToPair(
            new PairFlatMapFunction<Iterator<Tuple2<Integer, MatrixRowHolder>>, Integer, double[]>() {

                /** For serialization */
                private static final long serialVersionUID = -1290972198473290092L;
                protected List<Tuple2<Integer, double[]>> result = new ArrayList<Tuple2<Integer, double[]>>();

                @Override
                public Iterable<Tuple2<Integer, double[]>> call(
                        Iterator<Tuple2<Integer, MatrixRowHolder>> split) throws DistributedWekaException {

                    List<double[]> partials = new ArrayList<double[]>();
                    List<int[]> partialCoOcc = new ArrayList<int[]>();
                    int rowNum = -1;
                    while (split.hasNext()) {
                        Tuple2<Integer, MatrixRowHolder> nextRow = split.next();
                        if (rowNum < 0) {
                            rowNum = nextRow._2().getRowNumber();
                        } else if (nextRow._2().getRowNumber() != rowNum) {
                            throw new DistributedWekaException("Was not expecting the matrix row number "
                                    + "to change within a partition!");
                        }

                        // accumulate every partial row, including the first one seen in the partition
                        partials.add(nextRow._2().getRow());
                        if (!missingReplacedWithMean) {
                            partialCoOcc.add(nextRow._2().getCoOccurrencesCounts());
                        }
                    }

                    if (partials.size() > 0) {
                        CorrelationMatrixRowReduceTask reducer = new CorrelationMatrixRowReduceTask();

                        double[] aggregated = reducer.aggregate(rowNum, partials, partialCoOcc,
                                headerWithSummary, missingReplacedWithMean, covarianceInsteadOfCorrelation,
                                deleteClassIfSet);
                        result.add(new Tuple2<Integer, double[]>(rowNum, aggregated));
                    }

                    return result;
                }
            });

    List<Tuple2<Integer, double[]>> reducedRows = aggregatedRows.collect();
    mapToPartialRows.unpersist();

    double[][] m = new double[reducedRows.size()][reducedRows.size()];
    for (Tuple2<Integer, double[]> row : reducedRows) {
        int i = row._1();
        double[] js = row._2();

        for (int j = 0; j < js.length; j++) {
            m[i][j] = js[j];
            m[j][i] = js[j];
        }
    }

    m_finalMatrix = new weka.core.matrix.Matrix(m);

    Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
    try {
        writeMatrixToOutput(outputPath, reducedRows, headerNoSummary, deleteClassIfSet);
    } catch (Exception ex) {
        logMessage(ex);
        throw new DistributedWekaException(ex);
    }

    if (getRunPCA()) {
        runPCA(outputPath, covarianceInsteadOfCorrelation, !deleteClassIfSet, headerWithSummary,
                headerNoSummary);
    }
}
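
Both this job and the k-means job below rely on an IntegerKeyPartitioner to route all partial results for one key into a single partition before the partition-wise reduce. That class is not shown in the excerpt; its presumed behavior is a direct modulo mapping, roughly:

    // Presumed sketch of IntegerKeyPartitioner (not part of the excerpt):
    // sends integer key k to partition k % numPartitions, so with one
    // partition per key each partition sees exactly one key.
    public class IntegerKeyPartitioner extends org.apache.spark.Partitioner {
        private final int numPartitions;

        public IntegerKeyPartitioner(int numPartitions) {
            this.numPartitions = numPartitions;
        }

        @Override
        public int numPartitions() {
            return numPartitions;
        }

        @Override
        public int getPartition(Object key) {
            return ((Integer) key) % numPartitions;
        }
    }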

From source file:weka.distributed.spark.KMeansClustererSparkJob.java

License:Open Source License

/**
 * Perform an iteration of k-means
 *
 * @param dataset the dataset to operate on
 * @param mapTasks the underlying map tasks to use - one for each separate run
 *          of k-means that we're doing in parallel
 * @param converged array indicating which runs have converged
 * @param iterationNum the iteration number that we're up to
 * @param transformedHeaderNoSummary the header of the training data (sans
 *          summary attributes)
 * @return a list of KMeansReduceTasks encapsulating the results of the
 *         iteration for each active run of k-means
 * @throws DistributedWekaException if a problem occurs
 */
protected List<Tuple2<Integer, KMeansReduceTask>> performKMeansIteration(JavaRDD<Instance> dataset,
        final KMeansMapTask[] mapTasks, final boolean[] converged, final int iterationNum,
        final Instances transformedHeaderNoSummary) throws DistributedWekaException {

    final int numRuns = mapTasks.length;

    // keyed by run, a list of partial centroid summary instances
    // - one Instances object for each centroid (may be null if a
    // given centroid did not get any instances assigned to it)
    JavaPairRDD<Integer, List<Instances>> mapRuns = dataset
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, List<Instances>>() {

                /**
                 * For serialization
                 */
                private static final long serialVersionUID = 6063661312796545915L;

                protected List<Tuple2<Integer, List<Instances>>> m_centroidStatsForRuns = new ArrayList<Tuple2<Integer, List<Instances>>>();

                @Override
                public Iterable<Tuple2<Integer, List<Instances>>> call(Iterator<Instance> split)
                        throws DistributedWekaException {

                    while (split.hasNext()) {
                        Instance current = split.next();

                        for (int k = 0; k < numRuns; k++) {
                            if (!converged[k]) {
                                mapTasks[k].processInstance(current);
                            }
                        }
                    }

                    for (int k = 0; k < numRuns; k++) {
                        if (!converged[k]) {
                            List<Instances> centroidStatsForRun = mapTasks[k].getCentroidStats();
                            m_centroidStatsForRuns
                                    .add(new Tuple2<Integer, List<Instances>>(k, centroidStatsForRun));
                        }
                    }

                    return m_centroidStatsForRuns;
                }
            }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns))
            .persist(StorageLevel.MEMORY_AND_DISK());

    mapRuns.count(); // force computation so the persisted partitions are materialized

    // Reduce. Need to aggregate all the cluster stats
    // for each run. Do we repartition into numRuns partitions and then
    // run another mapPartitions phase? With our custom partitioner this
    // should guarantee that a partition only contains the lists of instances
    // for one run. Can't use reduceByKey because CSVReduce is not
    // associative, and needs to see the whole list of summary instances
    // objects for one run, cluster# (need to run a separate reduce for
    // each cluster centroid within each run anyway). Then update the
    // final error for each centroid in each run and the total error
    // (sum of errors over centroids for a run)
    JavaPairRDD<Integer, KMeansReduceTask> reducedByRun = mapRuns.mapPartitionsToPair(
            new PairFlatMapFunction<Iterator<Tuple2<Integer, List<Instances>>>, Integer, KMeansReduceTask>() {

                /**
                 * For serialization
                 */
                private static final long serialVersionUID = -747645603149767637L;

                protected List<Tuple2<Integer, KMeansReduceTask>> m_resultsForRun = new ArrayList<Tuple2<Integer, KMeansReduceTask>>();

                @Override
                public Iterable<Tuple2<Integer, KMeansReduceTask>> call(
                        Iterator<Tuple2<Integer, List<Instances>>> split) throws DistributedWekaException {

                    List<List<Instances>> partialsForRun = new ArrayList<List<Instances>>();
                    int runNumber = -1;

                    while (split.hasNext()) {
                        Tuple2<Integer, List<Instances>> partial = split.next();
                        if (runNumber < 0) {
                            runNumber = partial._1().intValue();
                        } else {
                            if (partial._1().intValue() != runNumber) {
                                throw new DistributedWekaException("[k-means] reduce phase: "
                                        + "was not expecting the run number to change within a "
                                        + "partition!");
                            }
                        }

                        partialsForRun.add(partial._2());
                    }

                    KMeansReduceTask reducer = new KMeansReduceTask();

                    // size might be zero if we are operating on a partition for a
                    // run that has already converged (in which case there will be no
                    // data in this partition)...
                    if (partialsForRun.size() > 0) {
                        reducer.reduceClusters(runNumber, iterationNum, transformedHeaderNoSummary,
                                partialsForRun);
                        m_resultsForRun.add(new Tuple2<Integer, KMeansReduceTask>(runNumber, reducer));
                    }

                    return m_resultsForRun;
                }
            });

    List<Tuple2<Integer, KMeansReduceTask>> runResults = reducedByRun.collect();
    mapRuns.unpersist();
    reducedByRun.unpersist();

    return runResults;
}

From source file:weka.distributed.spark.KMeansClustererSparkJob.java

License:Open Source License

/**
 * Perform the k-means|| initialization process
 *
 * @param dataset the dataset to operate on
 * @param headerWithSummary the header of the data, with summary attributes
 * @param numRuns the number of separate runs of k-means to be performed in
 *          parallel
 * @param numClusters the number of clusters to generate
 * @return a list of Instances objects, where each Instances object contains
 *         the starting points for one run of k-means
 * @throws IOException if a problem occurs
 * @throws DistributedWekaException if a problem occurs
 */
protected List<Instances> initializeWithKMeansParallel(JavaRDD<Instance> dataset, Instances headerWithSummary,
        final int numRuns, int numClusters) throws IOException, DistributedWekaException {

    int numSteps = Integer.parseInt(environmentSubstitute(getKMeansParallelInitSteps()));

    // random seed option
    int randomSeed = 1;
    if (!DistributedJobConfig.isEmpty(getRandomSeed())) {
        try {
            randomSeed = Integer.parseInt(environmentSubstitute(getRandomSeed()));
        } catch (NumberFormatException ex) {
            // don't fuss
        }
    }

    // 1) start with 1 randomly chosen point for each run
    // 2) run sketch for x iterations (aggregating reservoirs for each
    // run at the end of each iteration (i.e. reservoirs for run 1
    // on each split of the data, reservoirs for run 2, etc.)
    // 3) Get final sketch for each run
    // 4) Weight each point in each sketch by the number of points
    // in the data that cluster to it
    // 5) Run local k-means on the weighted data to obtain the final k
    // starting centers

    // Step 1: start with 1 randomly chosen point for each run
    List<Instances> randomSingleCenters = initializeWithRandomCenters(dataset, headerWithSummary, numRuns, 1);

    // Step 2: run sketch for x iterations (aggregating reservoirs for each
    // run at the end of each iteration (i.e. reservoirs for run 1
    // on each split of the data, reservoirs for run 2, etc.)
    Instances tmpTrans = null;
    // one configured task per run (we'll use this for an initial distance
    // function and for step 4 where we need to cluster all the points to
    // get cluster sizes)
    final KMeansMapTask[] mapTasks = new KMeansMapTask[numRuns];
    for (int i = 0; i < numRuns; i++) {
        mapTasks[i] = new KMeansMapTask();

        try {
            mapTasks[i].setOptions(Utils.splitOptions(getKMeansMapTaskOpts()));
        } catch (Exception e) {
            throw new DistributedWekaException(e);
        }
        tmpTrans = mapTasks[i].init(headerWithSummary);
    }

    // transformed header (has passed through filters)
    final Instances transformedHeaderNoSummary = tmpTrans;

    NormalizableDistance distanceFunc = mapTasks[0].getDistanceFunction();
    final CentroidSketch[] sketches = new CentroidSketch[numRuns];
    // initialize sketches
    for (int i = 0; i < numRuns; i++) {
        try {
            // apply any filters
            Instances transformedStartSketch = randomSingleCenters.get(i);
            // mapTasks[0].applyFilters(randomSingleCenters.get(i));

            sketches[i] = new CentroidSketch(transformedStartSketch, distanceFunc, 2 * numClusters,
                    randomSeed + i);
        } catch (Exception ex) {
            logMessage(ex);
            throw new DistributedWekaException(ex);
        }
    }

    // this is used when processing instances in partitions to
    // ensure that each instance from the data set gets
    // filtered appropriately
    final KMeansMapTask forFilteringOnly = mapTasks[0];
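    // (captured by the Spark closures below, so it is shipped to each
    // executor and must be serializable)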

    for (int i = 0; i < numSteps; i++) {
        logMessage("[k-means] Running iteration " + (i + 1) + " of k-means|| initialization procedure.");
        final int iterationNum = i;

        // keyed by run, a list of partial sketches
        // - one CentroidSketch object for each run in each partition
        JavaPairRDD<Integer, CentroidSketch> mapRuns = dataset
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, CentroidSketch>() {

                    /**
                     * For serialization
                     */
                    private static final long serialVersionUID = 6063661312796545915L;

                    protected List<Tuple2<Integer, CentroidSketch>> m_centroidSketchesForRuns = new ArrayList<Tuple2<Integer, CentroidSketch>>();

                    @Override
                    public Iterable<Tuple2<Integer, CentroidSketch>> call(Iterator<Instance> split)
                            throws DistributedWekaException {

                        while (split.hasNext()) {
                            Instance current = split.next();
                            try {
                                // make sure it goes through any filters first!
                                current = forFilteringOnly.applyFilters(current);
                            } catch (Exception ex) {
                                throw new DistributedWekaException(ex);
                            }

                            for (int k = 0; k < numRuns; k++) {
                                sketches[k].process(current, iterationNum == 0);
                            }
                        }

                        for (int k = 0; k < numRuns; k++) {
                            m_centroidSketchesForRuns.add(new Tuple2<Integer, CentroidSketch>(k, sketches[k]));
                        }

                        return m_centroidSketchesForRuns;
                    }
                }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns))
                .persist(StorageLevel.MEMORY_AND_DISK());

        // count() is just an action to force evaluation here, so the
        // repartitioned partial sketches are materialized (and persisted)
        // before the per-run aggregation below reads them
        mapRuns.count();

        // Each partition of mapRuns now contains partials for just one run.
        // Here we aggregate the partials per run.
        JavaPairRDD<Integer, CentroidSketch> reducedByRun = mapRuns.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, CentroidSketch>>, Integer, CentroidSketch>() {

                    /** For serialization */
                    private static final long serialVersionUID = 7689178383188695493L;

                    protected List<Tuple2<Integer, CentroidSketch>> m_resultsForRun = new ArrayList<Tuple2<Integer, CentroidSketch>>();

                    @Override
                    public Iterable<Tuple2<Integer, CentroidSketch>> call(
                            Iterator<Tuple2<Integer, CentroidSketch>> split) throws DistributedWekaException {

                        int runNumber = -1;
                        CentroidSketch initial = null;
                        List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>();

                        while (split.hasNext()) {
                            Tuple2<Integer, CentroidSketch> partial = split.next();
                            if (runNumber < 0) {
                                runNumber = partial._1().intValue();
                            } else {
                                if (partial._1().intValue() != runNumber) {
                                    throw new DistributedWekaException("[k-means] k-means|| initialization: "
                                            + "was not expecting the run number to change within "
                                            + "a partition!");
                                }
                            }

                            if (initial == null) {
                                initial = partial._2();
                            } else {
                                try {
                                    initial.aggregateReservoir(partial._2().getReservoirSample());
                                } catch (Exception e) {
                                    throw new DistributedWekaException(e);
                                }
                            }

                            // get all the distance functions and
                            // compute priming data that has global
                            // min and maxes.
                            if (iterationNum == 0) {
                                // only need to determine global distance function
                                // priming data once (i.e. in the first iteration of
                                // the k-means|| process)
                                distsForRun.add(partial._2().getDistanceFunction());
                            }
                        }

                        // update the distance function with global numeric
                        // attribute ranges
                        if (distsForRun.size() > 0) {
                            Instances distancePrimingData = KMeansReduceTask
                                    .computeDistancePrimingDataFromDistanceFunctions(distsForRun,
                                            transformedHeaderNoSummary);
                            initial.getDistanceFunction().setInstances(distancePrimingData);
                        }

                        m_resultsForRun.add(new Tuple2<Integer, CentroidSketch>(runNumber, initial));

                        return m_resultsForRun;
                    }
                });

        List<Tuple2<Integer, CentroidSketch>> runResults = reducedByRun.collect();
        mapRuns.unpersist();
        mapRuns = null;

        // fold the aggregated per-run sketches back into the local array;
        // the next iteration's map phase closes over the updated sketches
        for (Tuple2<Integer, CentroidSketch> r : runResults) {
            int runNum = r._1().intValue();
            sketches[runNum] = r._2();

            // add the current contents of the reservoir to the sketch
            // for each run
            try {
                sketches[runNum].addReservoirToCurrentSketch();

                if (m_debug) {
                    logMessage("[k-means] Iteration: " + i + " - number of instances in sketch: "
                            + sketches[runNum].getCurrentSketch().numInstances() + "\n"
                            + sketches[runNum].getCurrentSketch());
                }
            } catch (Exception ex) {
                logMessage(ex);
                throw new DistributedWekaException(ex);
            }
        }
        reducedByRun.unpersist();
    }

    // perform and aggregate clustering using the final sketch results
    // so that we can find out how many points are assigned to
    // each instance in the sketch.
    Instances globalPriming = sketches[0].getDistanceFunction().getInstances();
    if (globalPriming.numInstances() != 2) {
        logMessage("[k-means] Error: as expecting a two instance "
                + "(global priming data) dataset to be set in the distance function " + "in each sketch!");
        throw new DistributedWekaException("Was expecting a two instance (global priming data)"
                + " dataset to be set in the distance function in each sketch!");
    }
    for (int i = 0; i < numRuns; i++) {
        // set sketches as centers for map tasks
        // in preparation for clustering (so that we can
        // find out how many training points get assigned to
        // each center)

        mapTasks[i].setCentroids(sketches[i].getCurrentSketch());
        mapTasks[i].setDummyDistancePrimingData(globalPriming);
    }

    // 3 & 4) Get final sketch for each run and weight each point in
    // the sketch by the number of training instances that cluster to it
    List<Tuple2<Integer, KMeansReduceTask>> clusterAssignments = performKMeansIteration(dataset, mapTasks,
            new boolean[numRuns], 1, transformedHeaderNoSummary);

    List<Instances> finalStartPointsForRuns = new ArrayList<Instances>();
    for (int i = 0; i < numRuns; i++) {
        int rN = clusterAssignments.get(i)._1().intValue();
        List<Instances> centroidSummaries = clusterAssignments.get(i)._2().getAggregatedCentroidSummaries();

        Instances sketchForRun = sketches[i].getCurrentSketch();

        // empty clusters shouldn't be a problem - in
        // one iteration each sketch member should at minimum
        // have itself assigned (i.e. count >= 1). NOTE: The only exception
        // could occur if the sketch contains duplicate instances. However,
        // this shouldn't happen within a single WeightedReservoirSampling
        // as candidate instances with weight 0 (i.e. distance 0 to the sketch
        // in this case) are never added to the sketch.
        if (centroidSummaries.size() != sketchForRun.numInstances()) {
            logMessage("[k-means] Error: was expecting as " + "many summary headers as \n"
                    + "there are center candidates in the sketch for run " + rN);
            throw new DistributedWekaException("Was expecting as many summary headers as "
                    + "there are center candidates in the sketch for run " + rN);
        }

        for (int j = 0; j < sketchForRun.numInstances(); j++) {
            Instance centerCandidate = sketchForRun.instance(j);
            Instances centerStats = centroidSummaries.get(j);
            double weightForCandidate = -1.0;
            // grab the summary stats for the first numeric or nominal
            // attribute and derive the assignment count from them
            for (int k = 0; k < sketchForRun.numAttributes(); k++) {

                if (sketchForRun.attribute(k).isNumeric()) {
                    Attribute statsAtt = centerStats
                            .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
                                    + sketchForRun.attribute(k).name());
                    weightForCandidate = ArffSummaryNumericMetric.COUNT.valueFromAttribute(statsAtt)
                            + ArffSummaryNumericMetric.MISSING.valueFromAttribute(statsAtt);
                    break;
                } else if (sketchForRun.attribute(k).isNominal()) {
                    Attribute statsAtt = centerStats
                            .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
                                    + sketchForRun.attribute(k).name());
                    NominalStats ns = NominalStats.attributeToStats(statsAtt);
                    weightForCandidate = 0;
                    for (String s : ns.getLabels()) {
                        weightForCandidate += ns.getCount(s);
                    }
                    weightForCandidate += ns.getNumMissing();
                }
            }

            if (weightForCandidate < 0) {
                logMessage("[k-means] Error: unable to compute the " + "number of training instances "
                        + "assigned to sketch member " + j + " in run " + i);
                throw new DistributedWekaException("Unable to compute the number of training instances "
                        + "assigned to sketch member " + j + " in run " + i);
            }

            // finally - set the weight
            centerCandidate.setWeight(weightForCandidate);
        }

        if (m_debug) {
            logMessage("Final weighted sketch (run " + i + ") prior to local KMeans:\n" + sketchForRun);
        }

        // now run standard k-means on the weighted sketch to
        // (hopefully) get the requested number of start points
        SimpleKMeans localKMeans = new SimpleKMeans();
        try {
            localKMeans.setNumClusters(numClusters);
            localKMeans.setInitializationMethod(
                    new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
            localKMeans.buildClusterer(sketchForRun);
            finalStartPointsForRuns.add(localKMeans.getClusterCentroids());
        } catch (Exception ex) {
            logMessage(ex);
            throw new DistributedWekaException(ex);
        }
    }

    m_distanceFunctionPrimingData = globalPriming;

    return finalStartPointsForRuns;
}
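
For reference, the final refinement step above (weight each sketch candidate, then run plain k-means on the weighted candidates) can be exercised on its own. The following is a minimal, self-contained illustration with synthetic two-dimensional candidates and made-up weights standing in for the assignment counts computed by the job; it assumes a Weka version whose SimpleKMeans honours instance weights and offers k-means++ seeding, as the code above does.

import java.util.ArrayList;
import java.util.Random;

import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SelectedTag;

public class WeightedLocalKMeans {

    public static void main(String[] args) throws Exception {
        // build a tiny numeric dataset of candidate centers (synthetic data)
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("x"));
        atts.add(new Attribute("y"));
        Instances candidates = new Instances("candidates", atts, 20);

        Random rand = new Random(1);
        for (int i = 0; i < 20; i++) {
            Instance inst = new DenseInstance(2);
            inst.setValue(0, rand.nextGaussian());
            inst.setValue(1, rand.nextGaussian());
            // weight each candidate by a stand-in for the number of
            // training instances that cluster to it
            inst.setWeight(1 + rand.nextInt(50));
            candidates.add(inst);
        }

        // run local k-means with k-means++ seeding on the weighted candidates
        SimpleKMeans localKMeans = new SimpleKMeans();
        localKMeans.setNumClusters(3);
        localKMeans.setInitializationMethod(
                new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
        localKMeans.buildClusterer(candidates);

        // the centroids would serve as the starting points for one run
        System.out.println(localKMeans.getClusterCentroids());
    }
}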