List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction
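Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the listed projects) of what a PairFlatMapFunction does: flatMapToPair turns one input element into zero or more key/value Tuple2 pairs. Note that the examples on this page span two API generations: in Spark 1.x, call(...) returns an Iterable<Tuple2<K, V>>, while from Spark 2.0 on it returns an Iterator<Tuple2<K, V>>. The sketch below assumes the Spark 2.x signature and a local master purely for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapFunctionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("pair-flat-map-sketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b a", "b c"));

        // One line in, zero or more (word, 1) pairs out.
        JavaPairRDD<String, Integer> pairs = lines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    public Iterator<Tuple2<String, Integer>> call(String line) {
                        List<Tuple2<String, Integer>> out = new ArrayList<Tuple2<String, Integer>>();
                        for (String word : line.split(" ")) {
                            out.add(new Tuple2<String, Integer>(word, 1));
                        }
                        return out.iterator();
                    }
                });

        // Prints something like [(a,2), (b,2), (c,1)] (order may vary).
        System.out.println(pairs.reduceByKey((a, b) -> a + b).collect());
        sc.stop();
    }
}

Against the Spark 1.x API the only change is the return type: call(...) would return the list itself (an Iterable) instead of out.iterator().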
From source file:org.springframework.data.hadoop.batch.spark.test.SparkHashtags.java
License:Apache License
public static void main(String[] args) {
    String fileName = "";
    if (args.length > 0) {
        fileName = args[0];
    }
    SparkConf conf = new SparkConf().setAppName("spark-hashtags");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> tweetData = sc.textFile(fileName).cache();

    JavaRDD<Map<String, Object>> tweets = tweetData.map(new Function<String, Map<String, Object>>() {
        public Map<String, Object> call(String s) throws Exception {
            return jsonMapper.readValue(s.toString(), new TypeReference<HashMap<String, Object>>() {
            });
        }
    });

    JavaPairRDD<String, Integer> hashTags = tweets
            .flatMapToPair(new PairFlatMapFunction<Map<String, Object>, String, Integer>() {
                public Iterable<Tuple2<String, Integer>> call(Map<String, Object> tweet) throws Exception {
                    Map<String, Object> entities = (Map<String, Object>) tweet.get("entities");
                    List<Map<String, Object>> hashTagEntries = null;
                    if (entities != null) {
                        hashTagEntries = (List<Map<String, Object>>) entities.get("hashtags");
                    }
                    List<Tuple2<String, Integer>> hashTags = new ArrayList<Tuple2<String, Integer>>();
                    if (hashTagEntries != null && hashTagEntries.size() > 0) {
                        for (Map<String, Object> hashTagEntry : hashTagEntries) {
                            String hashTag = hashTagEntry.get("text").toString();
                            hashTags.add(new Tuple2<String, Integer>(hashTag, 1));
                        }
                    }
                    return hashTags;
                }
            });

    JavaPairRDD<String, Integer> hashTagCounts = hashTags
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer int1, Integer int2) throws Exception {
                    return int1 + int2;
                }
            });

    JavaPairRDD<String, Integer> hashTagCountsSorted = hashTagCounts
            .mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
                public Tuple2<Integer, String> call(Tuple2<String, Integer> in) throws Exception {
                    return new Tuple2<Integer, String>(in._2(), in._1());
                }
            }).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
                public Tuple2<String, Integer> call(Tuple2<Integer, String> in) throws Exception {
                    return new Tuple2<String, Integer>(in._2(), in._1());
                }
            });

    List<Tuple2<String, Integer>> top10 = hashTagCountsSorted.take(10);
    System.out.println("HashTags: " + top10);

    JavaPairRDD<String, Integer> top10Hashtags = sc.parallelizePairs(top10);
    top10Hashtags.saveAsTextFile("hdfs:///test/spark/output");

    sc.stop();
}
From source file:PolygonMatching.MatchingGeoPolygon.java
License:Apache License
public static void main(String[] args) throws Exception { // SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local"); SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); String dataSource1 = args[0]; String dataSource2 = args[1]; final double thresholdLinguistic = Double.parseDouble(args[2]); final double thresholdPolygon = Double.parseDouble(args[3]); String outputPath = args[4];/* w w w. ja v a 2 s . c o m*/ Integer amountPartition = Integer.parseInt(args[5]); String sourceType = args[6]; DataSource dataSourcePref = null; DataSource dataSourceOSM = null; if (sourceType.equals("CSV")) { dataSourcePref = AbstractExec.getDataCSV(dataSource1, ';'); dataSourceOSM = AbstractExec.getDataCSV(dataSource2, ';'); } else { //is postgis dataSourcePref = AbstractExec.getDataPostGres(dataSource1); dataSourceOSM = AbstractExec.getDataPostGres(dataSource2); } // DataSource dataSourcePref = AbstractExec.getDataPostGres(dataSource1); //squaresOfCuritiba Pref // DataSource dataSourceOSM = AbstractExec.getDataPostGres(dataSource2); //squaresOfCuritiba OSM // DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/squares_pref_curitiba.txt"); //squaresOfCuritiba Pref // DataSource dataSourceOSM = AbstractExec.getDataPostGres("queries/osm_curitiba.txt"); //squaresOfCuritiba OSM // DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/parks_pref_ny.txt"); //parksOfNY Pref // DataSource dataSourceOSM = AbstractExec.getDataPostGres("queries/osm_ny.txt"); //parksOfNY OSM StorageManager storagePref = new StorageManager(); StorageManager storageOSM = new StorageManager(); // enables in-memory execution for faster processing // this can be done since the whole data fits into memory storagePref.enableInMemoryProcessing(); storageOSM.enableInMemoryProcessing(); // adds the "data" to the algorithm storagePref.addDataSource(dataSourcePref); storageOSM.addDataSource(dataSourceOSM); if (!storagePref.isDataExtracted()) { storagePref.extractData(); } if (!storageOSM.isDataExtracted()) { storageOSM.extractData(); } List<GeoPolygon> geoentitiesPref = new ArrayList<GeoPolygon>(); List<GeoPolygon> geoentitiesOSM = new ArrayList<GeoPolygon>(); // the algorithm returns each generated pair step-by-step int indexOfPref = 0; for (GenericObject genericObj : storagePref.getExtractedData()) { String nome = ""; Integer id; if (!genericObj.getData().get("name").toString().equals("null")) {//for curitiba use atribute "nome" for new york "signname" nome = genericObj.getData().get("name").toString(); id = Integer.parseInt(genericObj.getData().get("id").toString());//for curitiba use atribute "gid" for new york "id" geoentitiesPref.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome, InputTypes.GOV_POLYGON, indexOfPref, id)); indexOfPref++; } } int indexOfOSM = 0; for (GenericObject genericObj : storageOSM.getExtractedData()) { // System.out.println(genericObj.getData().get("geometry")); String nome = ""; Integer id; if (!genericObj.getData().get("name").toString().equals("null")) { nome = genericObj.getData().get("name").toString(); id = Integer.parseInt(genericObj.getData().get("id").toString()); geoentitiesOSM.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome, InputTypes.OSM_POLYGON, indexOfOSM, id)); indexOfOSM++; } } JavaRDD<GeoPolygon> polygonsOSM = ctx.parallelize(geoentitiesOSM); JavaRDD<GeoPolygon> polygonsPref = ctx.parallelize(geoentitiesPref); 
JavaRDD<GeoPolygon> polygons = polygonsPref.union(polygonsOSM); final Broadcast<Integer> numReplication = ctx.broadcast(amountPartition); JavaRDD<Tuple2<Integer, GeoPolygon>> polygonLabed = polygons .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<Integer, GeoPolygon>>() { public Iterator<Tuple2<Integer, GeoPolygon>> call(GeoPolygon s) throws Exception { List<Tuple2<Integer, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<Integer, GeoPolygon>>(); if (s.getType().equals(InputTypes.OSM_POLYGON)) { listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>( s.getIdGeometry() % numReplication.getValue(), s)); return listOfPolygonTuple.iterator(); } else { //equals to InputTypes.GOV_POLYGON for (int i = 0; i < numReplication.value(); i++) { listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(i, s)); } return listOfPolygonTuple.iterator(); } } }); JavaPairRDD<Integer, GeoPolygon> polygonsPaired = polygonLabed .mapToPair(new PairFunction<Tuple2<Integer, GeoPolygon>, Integer, GeoPolygon>() { public Tuple2<Integer, GeoPolygon> call(Tuple2<Integer, GeoPolygon> tuple) throws Exception { return new Tuple2<Integer, GeoPolygon>(tuple._1(), tuple._2()); } }); JavaPairRDD<Integer, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition);//number of partitions JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair( new PairFlatMapFunction<Tuple2<Integer, Iterable<GeoPolygon>>, Integer, PolygonPair>() { public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<Integer, Iterable<GeoPolygon>> tuple) throws Exception { List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator()); List<GeoPolygon> polygonsSource = new ArrayList<GeoPolygon>(); List<GeoPolygon> polygonsTarget = new ArrayList<GeoPolygon>(); for (GeoPolygon entity : polygonsPerKey) { if (entity.getType() == InputTypes.OSM_POLYGON) { polygonsSource.add(entity); } else { polygonsTarget.add(entity); } } List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>(); JaccardSimilarity jaccard = new JaccardSimilarity(); for (GeoPolygon entSource : polygonsSource) { for (GeoPolygon entTarget : polygonsTarget) { double linguisticSimilarity = 0.0; //calculate the linguistic similarity if (!entTarget.getGeoName().isEmpty()) { linguisticSimilarity = jaccard.getSimilarity( entTarget.getGeoName().toLowerCase(), entSource.getGeoName().toLowerCase()); } //calculate the polygon similarity double polygonSimilarity = entSource.getPolygonSimilarity(entTarget); //classification of pairs PolygonPair pair; if (linguisticSimilarity > thresholdLinguistic && polygonSimilarity > thresholdPolygon) { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.MATCH); } else if (linguisticSimilarity < thresholdLinguistic && polygonSimilarity < thresholdPolygon) { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.NON_MATCH); } else { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM); } // int index = entityMatches.size(); // entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair)); //for use case 04 if (pair.getPolygonClassification().equals(PolygonClassification.POSSIBLE_PROBLEM) || pair.getPolygonClassification().equals(PolygonClassification.MATCH)) { int index = entityMatches.size(); entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair)); } // if (Math.abs(entTarget.getArea() - 
entSource.getArea()) > thresholdArea) { // entityMatches.add(new Tuple2<String, String>(entTarget.getGeoName(), entSource.getGeoName() + ":" + Math.abs(entTarget.getArea() - entSource.getArea()))); //// System.out.println(entTarget.getGeoName() + " - " + entSource.getGeoNameame(), _2)); //// System.out.println(entTarget.getGeoName() + " - " + ()); //// System.out.println(entTarget.getGeoName() + " pref: " + String.format("%.2f", entTarget.getArea())); //// System.out.println(entSource.getGeoName() + " OSM: " + String.format("%.2f", entSource.getArea())); //// System.out.println(); // } } } return entityMatches.iterator(); } }); matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() { public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception { ArrayList<String> listOutput = new ArrayList<String>(); listOutput.add(t._2().toStringCSV()); return listOutput.iterator(); } }).saveAsTextFile(outputPath); ctx.stop(); ctx.close(); }
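The PolygonMatching example above and the two SingleMatchingGeoPolygon examples that follow share one skeleton: one side of the comparison is hashed to a single bucket key while the other side is replicated to every bucket, the records are grouped with groupByKey, and a PairFlatMapFunction compares each source/target combination inside a bucket and emits only the pairs worth keeping. The stand-alone sketch below reduces that skeleton to plain strings; the bucket count, the SRC/TGT tags, and the simple equality test are illustrative assumptions standing in for the project's GeoPolygon, JaccardSimilarity, and PolygonPair types (Spark 2.x Iterator signature).

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class GroupAndCompareSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("group-and-compare").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        final int numBuckets = 4;

        // Tagged records: _1 = "SRC" or "TGT", _2 = name. SRC records are hashed into one
        // bucket; TGT records are copied into every bucket so that each bucket contains all
        // the candidates its SRC records must be compared against.
        List<Tuple2<Integer, Tuple2<String, String>>> tagged = new ArrayList<Tuple2<Integer, Tuple2<String, String>>>();
        for (String s : Arrays.asList("praca tiradentes", "parque barigui")) {
            tagged.add(new Tuple2<Integer, Tuple2<String, String>>(Math.abs(s.hashCode()) % numBuckets,
                    new Tuple2<String, String>("SRC", s)));
        }
        for (String t : Arrays.asList("praca tiradentes", "jardim botanico")) {
            for (int b = 0; b < numBuckets; b++) {
                tagged.add(new Tuple2<Integer, Tuple2<String, String>>(b, new Tuple2<String, String>("TGT", t)));
            }
        }

        JavaPairRDD<Integer, Iterable<Tuple2<String, String>>> groups =
                sc.parallelizePairs(tagged).groupByKey(numBuckets);

        // Inside each bucket, compare every SRC record against every TGT record and emit only
        // the pairs that match (here: equal names; the real jobs use name and polygon
        // similarity thresholds).
        JavaPairRDD<String, String> matches = groups.flatMapToPair(
                new PairFlatMapFunction<Tuple2<Integer, Iterable<Tuple2<String, String>>>, String, String>() {
                    public Iterator<Tuple2<String, String>> call(
                            Tuple2<Integer, Iterable<Tuple2<String, String>>> group) {
                        List<String> src = new ArrayList<String>();
                        List<String> tgt = new ArrayList<String>();
                        for (Tuple2<String, String> rec : group._2()) {
                            if ("SRC".equals(rec._1())) {
                                src.add(rec._2());
                            } else {
                                tgt.add(rec._2());
                            }
                        }
                        List<Tuple2<String, String>> out = new ArrayList<Tuple2<String, String>>();
                        for (String s : src) {
                            for (String t : tgt) {
                                if (s.equals(t)) {
                                    out.add(new Tuple2<String, String>(s, t));
                                }
                            }
                        }
                        return out.iterator();
                    }
                });

        System.out.println(matches.collect());
        sc.stop();
    }
}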
From source file:SingleMatchingGeoPolygon.SingleMatchingGeoPolygon.java
License:Apache License
public static void main(String[] args) throws Exception { // SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local"); SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); String dataSource = args[0];//www .j av a2 s . co m final double thresholdLinguistic = Double.parseDouble(args[1]); final double thresholdPolygon = Double.parseDouble(args[2]); String outputPath = args[3]; Integer amountPartition = Integer.parseInt(args[4]); String sourceType = args[5]; DataSource source1 = null; if (sourceType.equals("CSV")) { source1 = AbstractExec.getDataCSV(dataSource, ';'); } else { //is postgis source1 = AbstractExec.getDataPostGres(dataSource); } ReadAbstractSource reader = new ReadAbstractSource(); StorageManager storagePolygon = reader.readFile(source1); List<GeoPolygon> geoentities = new ArrayList<GeoPolygon>(); int index = 0; for (GenericObject genericObj : storagePolygon.getExtractedData()) { // System.out.println(genericObj.getData().get("geometry")); String nome = ""; Integer id; if (!genericObj.getData().get("name").toString().equals("null")) { nome = genericObj.getData().get("name").toString(); id = Integer.parseInt(genericObj.getData().get("id").toString()); geoentities.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome, InputTypes.OSM_POLYGON, index, id)); index++; } } JavaRDD<GeoPolygon> polygons = ctx.parallelize(geoentities); final Broadcast<Integer> numReplication = ctx.broadcast(amountPartition); JavaRDD<Tuple2<Integer, GeoPolygon>> polygonLabed = polygons .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<Integer, GeoPolygon>>() { public Iterator<Tuple2<Integer, GeoPolygon>> call(GeoPolygon s) throws Exception { List<Tuple2<Integer, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<Integer, GeoPolygon>>(); GeoPolygon tocompare = s.getGeoPolygon(); tocompare.setDuplicated(false); listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>( tocompare.getIdGeometry() % numReplication.getValue(), tocompare));//entity that not replicated GeoPolygon duplicated = s.getGeoPolygon(); duplicated.setDuplicated(true); for (int i = 0; i < numReplication.value(); i++) {//the entities that will be replicated listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(i, duplicated)); } return listOfPolygonTuple.iterator(); } }); JavaPairRDD<Integer, GeoPolygon> polygonsPaired = polygonLabed .mapToPair(new PairFunction<Tuple2<Integer, GeoPolygon>, Integer, GeoPolygon>() { public Tuple2<Integer, GeoPolygon> call(Tuple2<Integer, GeoPolygon> tuple) throws Exception { return new Tuple2<Integer, GeoPolygon>(tuple._1(), tuple._2()); } }); JavaPairRDD<Integer, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition);//number of partitions JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair( new PairFlatMapFunction<Tuple2<Integer, Iterable<GeoPolygon>>, Integer, PolygonPair>() { public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<Integer, Iterable<GeoPolygon>> tuple) throws Exception { List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator()); List<GeoPolygon> polygonsToCompare = new ArrayList<GeoPolygon>(); List<GeoPolygon> polygonsDuplicated = new ArrayList<GeoPolygon>(); for (GeoPolygon entity : polygonsPerKey) { if (entity.isDuplicated()) { polygonsDuplicated.add(entity); } else { polygonsToCompare.add(entity); } } List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>(); 
JaccardSimilarity jaccard = new JaccardSimilarity(); for (GeoPolygon entSource : polygonsToCompare) { for (GeoPolygon entTarget : polygonsDuplicated) { double linguisticSimilarity = 0.0; //calculate the linguistic similarity if (!entTarget.getGeoName().isEmpty()) { linguisticSimilarity = jaccard.getSimilarity( entTarget.getGeoName().toLowerCase(), entSource.getGeoName().toLowerCase()); } //calculate the polygon similarity double polygonSimilarity = entSource.getPolygonSimilarity(entTarget); //classification of pairs PolygonPair pair; if (linguisticSimilarity > thresholdLinguistic && polygonSimilarity > thresholdPolygon) { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.MATCH); } else if (linguisticSimilarity < thresholdLinguistic && polygonSimilarity < thresholdPolygon) { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.NON_MATCH); } else { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM); } // int index = entityMatches.size(); // entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair)); //for use case 04 if (pair.getPolygonClassification().equals(PolygonClassification.MATCH) && (pair .getSource().getIdInDataset() != pair.getTarget().getIdInDataset())) { int index = entityMatches.size(); entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair)); } // if (Math.abs(entTarget.getArea() - entSource.getArea()) > thresholdArea) { // entityMatches.add(new Tuple2<String, String>(entTarget.getGeoName(), entSource.getGeoName() + ":" + Math.abs(entTarget.getArea() - entSource.getArea()))); //// System.out.println(entTarget.getGeoName() + " - " + entSource.getGeoNameame(), _2)); //// System.out.println(entTarget.getGeoName() + " - " + ()); //// System.out.println(entTarget.getGeoName() + " pref: " + String.format("%.2f", entTarget.getArea())); //// System.out.println(entSource.getGeoName() + " OSM: " + String.format("%.2f", entSource.getArea())); //// System.out.println(); // } } } return entityMatches.iterator(); } }); matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() { public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception { ArrayList<String> listOutput = new ArrayList<String>(); listOutput.add(t._2().toStringCSV()); return listOutput.iterator(); } }).saveAsTextFile(outputPath); ctx.stop(); ctx.close(); }
From source file:SingleMatchingGeoPolygon.SingleMatchingGeoPolygonBlocked.java
License:Apache License
public static void main(String[] args) throws Exception { // SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local"); SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark"); JavaSparkContext ctx = new JavaSparkContext(sparkConf); String dataSource = args[0];//from w w w . ja va 2 s .c o m final double thresholdLinguistic = Double.parseDouble(args[1]); final double thresholdPolygon = Double.parseDouble(args[2]); String outputPath = args[3]; Integer amountPartition = Integer.parseInt(args[4]); String sourceType = args[5]; DataSource source1 = null; if (sourceType.equals("CSV")) { source1 = AbstractExec.getDataCSV(dataSource, ';'); } else { //is postgis source1 = AbstractExec.getDataPostGres(dataSource); } ReadAbstractSource reader = new ReadAbstractSource(); StorageManager storagePolygon = reader.readFile(source1); // StorageManager storagePolygon = reader.readFile(AbstractExec.getDataPostGres("queries/osm_curitiba.txt")); // StorageManager storagePolygon = reader.readFile(AbstractExec.getDataPostGres("queries/squares_pref_curitiba.txt")); List<GeoPolygon> geoentities = new ArrayList<GeoPolygon>(); int index = 0; for (GenericObject genericObj : storagePolygon.getExtractedData()) { // System.out.println(genericObj.getData().get("geometry")); String nome = ""; Integer id; if (!genericObj.getData().get("name").toString().equals("null")) { nome = genericObj.getData().get("name").toString(); id = Integer.parseInt(genericObj.getData().get("id").toString()); geoentities.add(new GeoPolygon(genericObj.getData().get("geometry").toString(), nome, InputTypes.OSM_POLYGON, index, id)); index++; } } JavaRDD<GeoPolygon> polygons = ctx.parallelize(geoentities); Broadcast<Integer> numReplication = ctx.broadcast(amountPartition); JavaRDD<Tuple2<String, GeoPolygon>> polygonLabed = polygons .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<String, GeoPolygon>>() { public Iterator<Tuple2<String, GeoPolygon>> call(GeoPolygon s) throws Exception { List<Tuple2<String, GeoPolygon>> listOfPolygonTuple = new ArrayList<Tuple2<String, GeoPolygon>>(); GeoPolygon tocompare = s.getGeoPolygon(); tocompare.setDuplicated(false); if (tocompare.getGeoName().length() < 3) { listOfPolygonTuple .add(new Tuple2<String, GeoPolygon>(tocompare.getGeoName(), tocompare));//entity that not replicated } else { listOfPolygonTuple.add(new Tuple2<String, GeoPolygon>( tocompare.getGeoName().substring(0, 3), tocompare));//entity that not replicated } GeoPolygon duplicated = s.getGeoPolygon(); duplicated.setDuplicated(true); if (duplicated.getGeoName().length() < 3) { listOfPolygonTuple .add(new Tuple2<String, GeoPolygon>(duplicated.getGeoName(), duplicated)); } else { listOfPolygonTuple.add(new Tuple2<String, GeoPolygon>( duplicated.getGeoName().substring(0, 3), duplicated)); } // for (int i = 0; i < numReplication.value(); i++) {//the entities that will be replicated // listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(duplicated, duplicated)); // } return listOfPolygonTuple.iterator(); } }); JavaPairRDD<String, GeoPolygon> polygonsPaired = polygonLabed .mapToPair(new PairFunction<Tuple2<String, GeoPolygon>, String, GeoPolygon>() { public Tuple2<String, GeoPolygon> call(Tuple2<String, GeoPolygon> tuple) throws Exception { return new Tuple2<String, GeoPolygon>(tuple._1(), tuple._2()); } }); JavaPairRDD<String, Iterable<GeoPolygon>> polygonsGrouped = polygonsPaired.groupByKey(amountPartition);//number of partitions JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair( new 
PairFlatMapFunction<Tuple2<String, Iterable<GeoPolygon>>, Integer, PolygonPair>() { public Iterator<Tuple2<Integer, PolygonPair>> call(Tuple2<String, Iterable<GeoPolygon>> tuple) throws Exception { List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator()); List<GeoPolygon> polygonsToCompare = new ArrayList<GeoPolygon>(); List<GeoPolygon> polygonsDuplicated = new ArrayList<GeoPolygon>(); for (GeoPolygon entity : polygonsPerKey) { if (entity.isDuplicated()) { polygonsDuplicated.add(entity); } else { polygonsToCompare.add(entity); } } List<Tuple2<Integer, PolygonPair>> entityMatches = new ArrayList<Tuple2<Integer, PolygonPair>>(); JaccardSimilarity jaccard = new JaccardSimilarity(); for (GeoPolygon entSource : polygonsToCompare) { for (GeoPolygon entTarget : polygonsDuplicated) { double linguisticSimilarity = 0.0; //calculate the linguistic similarity if (!entTarget.getGeoName().isEmpty()) { linguisticSimilarity = jaccard.getSimilarity( entTarget.getGeoName().toLowerCase(), entSource.getGeoName().toLowerCase()); } //calculate the polygon similarity double polygonSimilarity = entSource.getPolygonSimilarity(entTarget); //classification of pairs PolygonPair pair; if (linguisticSimilarity > thresholdLinguistic && polygonSimilarity > thresholdPolygon) { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.MATCH); } else if (linguisticSimilarity < thresholdLinguistic && polygonSimilarity < thresholdPolygon) { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.NON_MATCH); } else { pair = new PolygonPair(entSource, entTarget, linguisticSimilarity, polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM); } // int index = entityMatches.size(); // entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair)); //for use case 04 if (pair.getPolygonClassification().equals(PolygonClassification.MATCH) && (pair .getSource().getIdInDataset() != pair.getTarget().getIdInDataset())) { int index = entityMatches.size(); entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair)); } // if (Math.abs(entTarget.getArea() - entSource.getArea()) > thresholdArea) { // entityMatches.add(new Tuple2<String, String>(entTarget.getGeoName(), entSource.getGeoName() + ":" + Math.abs(entTarget.getArea() - entSource.getArea()))); //// System.out.println(entTarget.getGeoName() + " - " + entSource.getGeoNameame(), _2)); //// System.out.println(entTarget.getGeoName() + " - " + ()); //// System.out.println(entTarget.getGeoName() + " pref: " + String.format("%.2f", entTarget.getArea())); //// System.out.println(entSource.getGeoName() + " OSM: " + String.format("%.2f", entSource.getArea())); //// System.out.println(); // } } } return entityMatches.iterator(); } }); matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() { public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception { ArrayList<String> listOutput = new ArrayList<String>(); listOutput.add(t._2().toStringCSV()); return listOutput.iterator(); } }).saveAsTextFile(outputPath); ctx.stop(); ctx.close(); }
From source file:SparkExamples.SparkPR.java
License:Apache License
public static void main(String[] args) throws Exception {
    // args[0] = input file, args[1] = number of iterations, args[3] = number of partitions;
    // args[2] is not used by this example, but args[3] is, so at least four arguments are required.
    if (args.length < 4) {
        System.err.println("Usage: SparkPR <file> <number_of_iterations> <unused> <number_of_partitions>");
        System.exit(1);
    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("SparkPR");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:
    // URL neighbor URL
    // URL neighbor URL
    // URL neighbor URL
    // ...
    long start = System.currentTimeMillis();
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    int partition = Integer.parseInt(args[3]);

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            parts[0] = parts[0].toString()
                    .replaceAll("AAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZ", "");
            parts[1] = parts[1].toString()
                    .replaceAll("AAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZAAAAAAAAAZ", "");
            return new Tuple2<String, String>(parts[0], parts[1]);
        }
    }).groupByKey(partition).persist(StorageLevel.MEMORY_AND_DISK());

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();
                        for (String n : s._1) {
                            results.add(new Tuple2<String, Double>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    ranks.foreach(p -> System.out.println(p));

    long end = System.currentTimeMillis();
    System.out.println("running time " + (end - start) / 1000 + "s");

    String results = "running time " + (new Double((end - start) / 1000)).toString() + "s"
            + ", input file : " + args[0] + ", iteration : " + args[1];
    System.out.println(results);

    String outputurl = ".//results.txt";
    BufferedWriter writer = new BufferedWriter(new FileWriter(outputurl, true));
    writer.write(results);
    writer.newLine();
    writer.close();

    ctx.stop();
}
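The contribution step above uses the Spark 1.x Iterable-returning signature. The following self-contained toy graph shows the same join-then-flatMapToPair step written against the Spark 2.x API; the page names, rank seed, and damping constants follow the usual PageRank conventions and are kept purely for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class ContributionStepSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("pagerank-contribs-sketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Tiny adjacency list and uniform starting ranks, keyed by page.
        JavaPairRDD<String, Iterable<String>> links = sc.parallelizePairs(Arrays.asList(
                new Tuple2<String, Iterable<String>>("a", Arrays.asList("b", "c")),
                new Tuple2<String, Iterable<String>>("b", Arrays.asList("c"))));
        JavaPairRDD<String, Double> ranks = links.mapValues(neighbors -> 1.0);

        // One joined (neighbors, rank) value in, one (neighbor, contribution) pair out per neighbor.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values().flatMapToPair(
                new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    public Iterator<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> v) {
                        List<String> neighbors = new ArrayList<String>();
                        for (String n : v._1()) {
                            neighbors.add(n);
                        }
                        List<Tuple2<String, Double>> out = new ArrayList<Tuple2<String, Double>>();
                        for (String n : neighbors) {
                            out.add(new Tuple2<String, Double>(n, v._2() / neighbors.size()));
                        }
                        return out.iterator();
                    }
                });

        // One damping step, as in the examples above.
        System.out.println(contribs.reduceByKey((a, b) -> a + b).mapValues(s -> 0.15 + 0.85 * s).collect());
        sc.stop();
    }
}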
From source file:streaming.NginxlogSorter.java
License:Apache License
public static void main(String[] args) {
    JavaSparkContext sc = null;
    try {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("NginxlogSorter");
        // conf.set("hadoop.home.dir", "/usr/local/hadoop/hadoop-2.6.0");
        sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("src/test/resources/nginx_report.txt");

        // Note: the original code returned null from each of the functions below and never
        // called an action on the resulting RDDs, so the transformations were never executed.
        // The return values here are minimal placeholders so the pipeline would not throw
        // NullPointerExceptions if an action were added.
        lines.map(new Function<String, String>() {
            @Override
            public String call(String s) throws Exception {
                log.info(s);
                return s;
            }
        });

        JavaPairRDD<String, Integer> items = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                log.info(s);
                return new Tuple2<String, Integer>(s, 1);
            }
        });

        lines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterable<Tuple2<String, Integer>> call(String t) throws Exception {
                log.info(">>>: {}", t);
                return new ArrayList<Tuple2<String, Integer>>();
            }
        });
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Guard against the case where the context was never created.
        if (sc != null) {
            sc.close();
        }
    }
}
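Because the example above only logs its input, here is a short, self-contained sketch of a working flatMapToPair over access-log-like lines. The space-separated "client status bytes" layout and the bytes-per-client aggregation are assumptions made for illustration, not part of the original project (Spark 2.x Iterator signature).

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class AccessLogPairsSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("AccessLogPairsSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.parallelize(Arrays.asList(
                "10.0.0.1 200 512", "10.0.0.2 404 0", "malformed"));

        // Zero or one pair per line: malformed lines simply emit nothing, which is the main
        // reason to prefer flatMapToPair over mapToPair here.
        JavaPairRDD<String, Long> bytesByClient = lines.flatMapToPair(
                new PairFlatMapFunction<String, String, Long>() {
                    public Iterator<Tuple2<String, Long>> call(String line) {
                        List<Tuple2<String, Long>> out = new ArrayList<Tuple2<String, Long>>();
                        String[] fields = line.split(" ");
                        if (fields.length == 3) {
                            out.add(new Tuple2<String, Long>(fields[0], Long.parseLong(fields[2])));
                        }
                        return out.iterator();
                    }
                });

        System.out.println(bytesByClient.reduceByKey((a, b) -> a + b).collect());
        sc.stop();
    }
}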
From source file:univ.bigdata.course.pagerank.JavaPageRank.java
License:Apache License
public static void calculatePageRank(JavaRDD<String> rdd, int iteration_number) throws Exception {
    // Loads in input file. It should be in format of:
    // URL neighbor URL
    // URL neighbor URL
    // URL neighbor URL
    // ...
    JavaRDD<String> lines = rdd;

    // Loads all URLs from input file and initialize their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });

    // Calculates and updates URL ranks continuously using PageRank algorithm.
    for (int current = 0; current < iteration_number; current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
                    @Override
                    public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                        int urlCount = Iterables.size(s._1);
                        List<Tuple2<String, Double>> results = new ArrayList<>();
                        for (String n : s._1) {
                            results.add(new Tuple2<>(n, s._2() / urlCount));
                        }
                        return results;
                    }
                });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }

    JavaPairRDD<Double, String> sortedRanks = ranks
            .mapToPair(new PairFunction<Tuple2<String, Double>, Double, String>() {
                @Override
                public Tuple2<Double, String> call(Tuple2<String, Double> t) {
                    return new Tuple2<Double, String>(t._2, t._1);
                }
            }).sortByKey(false);

    // Collects all URL ranks and dump them to console.
    List<Tuple2<Double, String>> output = sortedRanks.takeOrdered(100, PageRankComperator.VALUE_COMP);
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._2() + " has rank: " + tuple._1() + ".");
    }
}
From source file:weka.distributed.spark.CorrelationMatrixSparkJob.java
License:Open Source License
/**
 * Build the correlation matrix and write it to the output destination
 *
 * @param dataset the input RDD dataset to use
 * @param headerWithSummary the header of the data (with summary attributes)
 * @param outputPath the path to write results to
 * @throws Exception if a problem occurs
 */
protected void buildMatrix(JavaRDD<Instance> dataset, final Instances headerWithSummary, String outputPath)
        throws Exception {
    String matrixMapOpts = getCorrelationMapTaskOptions();
    String[] mapOpts = null;
    if (!DistributedJobConfig.isEmpty(matrixMapOpts)) {
        mapOpts = Utils.splitOptions(environmentSubstitute(matrixMapOpts));
    }
    final String[] fMapOpts = mapOpts;

    // construct a temporary map task in order to determine how
    // many rows there will be in the matrix (after deleting any
    // nominal atts and potentially the class att)
    CorrelationMatrixMapTask tempTask = new CorrelationMatrixMapTask();
    if (fMapOpts != null) {
        tempTask.setOptions(fMapOpts.clone());
    }
    final boolean missingReplacedWithMean = !tempTask.getIgnoreMissingValues();
    final boolean covarianceInsteadOfCorrelation = tempTask.getCovariance();
    final boolean deleteClassIfSet = !tempTask.getKeepClassAttributeIfSet();

    tempTask.setup(headerWithSummary);
    final int numRowsInMatrix = tempTask.getMatrix().length;

    JavaPairRDD<Integer, MatrixRowHolder> mapToPartialRows = dataset
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, MatrixRowHolder>() {

                /** For serialization */
                private static final long serialVersionUID = -3024936415666668127L;

                protected List<Tuple2<Integer, MatrixRowHolder>> m_partialRows =
                        new ArrayList<Tuple2<Integer, MatrixRowHolder>>();

                @Override
                public Iterable<Tuple2<Integer, MatrixRowHolder>> call(Iterator<Instance> split)
                        throws DistributedWekaException {
                    CorrelationMatrixMapTask task = new CorrelationMatrixMapTask();
                    try {
                        if (fMapOpts != null) {
                            task.setOptions(fMapOpts);
                        }
                        task.setup(headerWithSummary);

                        while (split.hasNext()) {
                            task.processInstance(split.next());
                        }

                        // output all the rows in this partial matrix
                        double[][] partialMatrix = task.getMatrix();
                        int[][] coOcc = task.getCoOccurrenceCounts();
                        for (int i = 0; i < partialMatrix.length; i++) {
                            double[] row = partialMatrix[i];
                            int[] co = null;
                            if (coOcc != null) {
                                co = coOcc[i];
                            }
                            MatrixRowHolder rh = new MatrixRowHolder(i, row, co);
                            m_partialRows.add(new Tuple2<Integer, MatrixRowHolder>(i, rh));
                        }
                    } catch (Exception ex) {
                        throw new DistributedWekaException(ex);
                    }
                    return m_partialRows;
                }
            }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRowsInMatrix))
            .persist(getCachingStrategy().getStorageLevel());

    JavaPairRDD<Integer, double[]> aggregatedRows = mapToPartialRows.mapPartitionsToPair(
            new PairFlatMapFunction<Iterator<Tuple2<Integer, MatrixRowHolder>>, Integer, double[]>() {

                /** For serialization */
                private static final long serialVersionUID = -1290972198473290092L;

                protected List<Tuple2<Integer, double[]>> result = new ArrayList<Tuple2<Integer, double[]>>();

                @Override
                public Iterable<Tuple2<Integer, double[]>> call(Iterator<Tuple2<Integer, MatrixRowHolder>> split)
                        throws DistributedWekaException {
                    List<double[]> partials = new ArrayList<double[]>();
                    List<int[]> partialCoOcc = new ArrayList<int[]>();
                    int rowNum = -1;
                    while (split.hasNext()) {
                        Tuple2<Integer, MatrixRowHolder> nextRow = split.next();
                        if (rowNum < 0) {
                            rowNum = nextRow._2().getRowNumber();
                        } else {
                            if (nextRow._2().getRowNumber() != rowNum) {
                                throw new DistributedWekaException("Was not expecting the matrix row number "
                                        + "to change within a partition!");
                            }
                            partials.add(nextRow._2().getRow());
                            if (!missingReplacedWithMean) {
                                partialCoOcc.add(nextRow._2().getCoOccurrencesCounts());
                            }
                        }
                    }

                    if (partials.size() > 0) {
                        CorrelationMatrixRowReduceTask reducer = new CorrelationMatrixRowReduceTask();
                        double[] aggregated = reducer.aggregate(rowNum, partials, partialCoOcc, headerWithSummary,
                                missingReplacedWithMean, covarianceInsteadOfCorrelation, deleteClassIfSet);
                        result.add(new Tuple2<Integer, double[]>(rowNum, aggregated));
                    }
                    return result;
                }
            });

    List<Tuple2<Integer, double[]>> reducedRows = aggregatedRows.collect();
    mapToPartialRows.unpersist();

    double[][] m = new double[reducedRows.size()][reducedRows.size()];
    for (Tuple2<Integer, double[]> row : reducedRows) {
        int i = row._1();
        double[] js = row._2();
        for (int j = 0; j < js.length; j++) {
            m[i][j] = js[j];
            m[j][i] = js[j];
        }
    }

    m_finalMatrix = new weka.core.matrix.Matrix(m);

    Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);

    try {
        writeMatrixToOutput(outputPath, reducedRows, headerNoSummary, deleteClassIfSet);
    } catch (Exception ex) {
        logMessage(ex);
        throw new DistributedWekaException(ex);
    }

    if (getRunPCA()) {
        runPCA(outputPath, covarianceInsteadOfCorrelation, !deleteClassIfSet, headerWithSummary, headerNoSummary);
    }
}
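The two Weka jobs in this list drive PairFlatMapFunction through mapPartitionsToPair rather than flatMapToPair: the function receives an Iterator over a whole partition and emits one partial result per partition, which a later stage aggregates. A minimal, self-contained sketch of that pattern, with a per-partition sum standing in for Weka's MatrixRowHolder/CentroidSketch partials (Spark 2.x Iterator signature):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PartitionPartialsSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("PartitionPartialsSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> data = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6), 2);

        // One (key, partialSum) pair per partition instead of one pair per element;
        // the driver (or a further reduceByKey) then combines the partials.
        JavaPairRDD<String, Integer> partials = data.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Integer>, String, Integer>() {
                    public Iterator<Tuple2<String, Integer>> call(Iterator<Integer> split) {
                        int sum = 0;
                        while (split.hasNext()) {
                            sum += split.next();
                        }
                        List<Tuple2<String, Integer>> out = new ArrayList<Tuple2<String, Integer>>();
                        out.add(new Tuple2<String, Integer>("partialSum", sum));
                        return out.iterator();
                    }
                });

        System.out.println(partials.reduceByKey((a, b) -> a + b).collect());
        sc.stop();
    }
}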
From source file:weka.distributed.spark.KMeansClustererSparkJob.java
License:Open Source License
/** * Perform an iteration of k-means/*from w w w. j a v a 2 s . co m*/ * * @param dataset the dataset to operate on * @param mapTasks the underlying map tasks to use - one for each separate run * of k-means that we're doing in parallel * @param converged array indicating which runs have converged * @param iterationNum the iteration number that we're up to * @param transformedHeaderNoSummary the header of the training data (sans * summary attributes) * @return a list of KMeansReduceTasks encapsulating the results of the * iteration for each active run of k-means * @throws DistributedWekaException if a problem occurs */ protected List<Tuple2<Integer, KMeansReduceTask>> performKMeansIteration(JavaRDD<Instance> dataset, final KMeansMapTask[] mapTasks, final boolean[] converged, final int iterationNum, final Instances transformedHeaderNoSummary) throws DistributedWekaException { final int numRuns = mapTasks.length; // keyed by run, a list of partial centroid summary instances // - one Instances object for each centroid (may be null if a // given centroid did not get any instances assigned to it) JavaPairRDD<Integer, List<Instances>> mapRuns = dataset .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, List<Instances>>() { /** * For serialization */ private static final long serialVersionUID = 6063661312796545915L; protected List<Tuple2<Integer, List<Instances>>> m_centroidStatsForRuns = new ArrayList<Tuple2<Integer, List<Instances>>>(); @Override public Iterable<Tuple2<Integer, List<Instances>>> call(Iterator<Instance> split) throws DistributedWekaException { while (split.hasNext()) { Instance current = split.next(); for (int k = 0; k < numRuns; k++) { if (!converged[k]) { mapTasks[k].processInstance(current); } } } for (int k = 0; k < numRuns; k++) { if (!converged[k]) { List<Instances> centroidStatsForRun = mapTasks[k].getCentroidStats(); m_centroidStatsForRuns .add(new Tuple2<Integer, List<Instances>>(k, centroidStatsForRun)); } } return m_centroidStatsForRuns; } }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns)) .persist(StorageLevel.MEMORY_AND_DISK()); mapRuns.count(); // Reduce. Need to aggregate all the cluster stats // for each run. Do we repartition into numRuns partitions and then // run another mapPartitions phase? With our custom partitioner this // should guarantee that a partition only contains the lists of instances // for one run. Can't use partitionByKey because CSVReduce is not // associative, and needs to see the whole list of summary instances // objects for one run, cluster# (need to run a separate reduce for // each cluster centroid within each run anyway). 
Then update the // final error for each centroid in each run and the total error // (sum of errors over centroids for a run) JavaPairRDD<Integer, KMeansReduceTask> reducedByRun = mapRuns.mapPartitionsToPair( new PairFlatMapFunction<Iterator<Tuple2<Integer, List<Instances>>>, Integer, KMeansReduceTask>() { /** * For serialization */ private static final long serialVersionUID = -747645603149767637L; protected List<Tuple2<Integer, KMeansReduceTask>> m_resultsForRun = new ArrayList<Tuple2<Integer, KMeansReduceTask>>(); @Override public Iterable<Tuple2<Integer, KMeansReduceTask>> call( Iterator<Tuple2<Integer, List<Instances>>> split) throws DistributedWekaException { List<List<Instances>> partialsForRun = new ArrayList<List<Instances>>(); int runNumber = -1; while (split.hasNext()) { Tuple2<Integer, List<Instances>> partial = split.next(); if (runNumber < 0) { runNumber = partial._1().intValue(); } else { if (partial._1().intValue() != runNumber) { throw new DistributedWekaException("[k-means] reduce phase: " + "was not expecting the run number to change within a " + "partition!"); } } partialsForRun.add(partial._2()); } KMeansReduceTask reducer = new KMeansReduceTask(); // size might be zero if we are operating on a partition for a // run that has already converged (in which case there will be no // data in this partition)... if (partialsForRun.size() > 0) { reducer.reduceClusters(runNumber, iterationNum, transformedHeaderNoSummary, partialsForRun); m_resultsForRun.add(new Tuple2<Integer, KMeansReduceTask>(runNumber, reducer)); } return m_resultsForRun; } }); List<Tuple2<Integer, KMeansReduceTask>> runResults = reducedByRun.collect(); mapRuns.unpersist(); reducedByRun.unpersist(); return runResults; }
From source file:weka.distributed.spark.KMeansClustererSparkJob.java
License:Open Source License
/** * Perform the k-means|| initialization process * * @param dataset the dataset to operate on * @param headerWithSummary the header of the data, with summary attributes * @param numRuns the number of separate runs of k-means to be performed in * parallel//from w w w . j a v a 2 s . c om * @param numClusters the number of clusters to generate * @return a list of Instances objects, where each Instances object contains * the starting points for one run of k-means * @throws IOException if a problem occurs * @throws DistributedWekaException if a problem occurs */ protected List<Instances> initializeWithKMeansParallel(JavaRDD<Instance> dataset, Instances headerWithSummary, final int numRuns, int numClusters) throws IOException, DistributedWekaException { int numSteps = Integer.parseInt(environmentSubstitute(getKMeansParallelInitSteps())); // random seed option int randomSeed = 1; if (!DistributedJobConfig.isEmpty(getRandomSeed())) { try { randomSeed = Integer.parseInt(environmentSubstitute(getRandomSeed())); } catch (NumberFormatException ex) { // don't fuss } } // 1) start with 1 randomly chosen point for each run // 2) run sketch for x iterations (aggregating reservoirs for each // run at the end of each iteration (i.e. reservoirs for run 1 // on each split of the data, reservoirs for run 2, etc.) // 3) Get final sketch for each run // 4) Weight each point in each sketch by the number of points // in the data that cluster to it // 5) Run local KMeans on data weighted data to obtain final k // starting centers // Step 1: start with 1 randomly chosen point for each run List<Instances> randomSingleCenters = initializeWithRandomCenters(dataset, headerWithSummary, numRuns, 1); // Step 2: run sketch for x iterations (aggregating reservoirs for each // run at the end of each iteration (i.e. reservoirs for run 1 // on each split of the data, reservoirs for run 2, etc.) 
Instances tmpTrans = null; // one configured task per run (we'll use this for an initial distance // function and for step 4 where we need to cluster all the points to // get cluster sizes final KMeansMapTask[] mapTasks = new KMeansMapTask[numRuns]; for (int i = 0; i < numRuns; i++) { mapTasks[i] = new KMeansMapTask(); try { mapTasks[i].setOptions(Utils.splitOptions(getKMeansMapTaskOpts())); } catch (Exception e) { throw new DistributedWekaException(e); } tmpTrans = mapTasks[i].init(headerWithSummary); } // transformed header (has passed through filters) final Instances transformedHeaderNoSummary = tmpTrans; NormalizableDistance distanceFunc = mapTasks[0].getDistanceFunction(); final CentroidSketch[] sketches = new CentroidSketch[numRuns]; // initialize sketches for (int i = 0; i < numRuns; i++) { try { // apply any filters Instances transformedStartSketch = randomSingleCenters.get(i); // mapTasks[0].applyFilters(randomSingleCenters.get(i)); sketches[i] = new CentroidSketch(transformedStartSketch, distanceFunc, 2 * numClusters, randomSeed + i); } catch (Exception ex) { logMessage(ex); throw new DistributedWekaException(ex); } } // this is used when processing instances in partitions to // ensure that each instance from the data set gets // filtered appropriately final KMeansMapTask forFilteringOnly = mapTasks[0]; for (int i = 0; i < numSteps; i++) { logMessage("[k-means] Running iteration " + (i + 1) + " of k-means|| initialization procedure."); final int iterationNum = i; // keyed by run, a list of partial sketches // - one CentroidSketch object for each run in each partition JavaPairRDD<Integer, CentroidSketch> mapRuns = dataset .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, CentroidSketch>() { /** * For serialization */ private static final long serialVersionUID = 6063661312796545915L; protected List<Tuple2<Integer, CentroidSketch>> m_centroidSketchesForRuns = new ArrayList<Tuple2<Integer, CentroidSketch>>(); @Override public Iterable<Tuple2<Integer, CentroidSketch>> call(Iterator<Instance> split) throws DistributedWekaException { while (split.hasNext()) { Instance current = split.next(); try { // make sure it goes through any filters first! current = forFilteringOnly.applyFilters(current); } catch (Exception ex) { throw new DistributedWekaException(ex); } for (int k = 0; k < numRuns; k++) { sketches[k].process(current, iterationNum == 0); } } for (int k = 0; k < numRuns; k++) { m_centroidSketchesForRuns.add(new Tuple2<Integer, CentroidSketch>(k, sketches[k])); } return m_centroidSketchesForRuns; } }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns)) .persist(StorageLevel.MEMORY_AND_DISK()); mapRuns.count(); // Each partion of mapRuns now contains partials for just one run. 
// Here we aggregate the partials per run JavaPairRDD<Integer, CentroidSketch> reducedByRun = mapRuns.mapPartitionsToPair( new PairFlatMapFunction<Iterator<Tuple2<Integer, CentroidSketch>>, Integer, CentroidSketch>() { /** For serialization */ private static final long serialVersionUID = 7689178383188695493L; protected List<Tuple2<Integer, CentroidSketch>> m_resultsForRun = new ArrayList<Tuple2<Integer, CentroidSketch>>(); @Override public Iterable<Tuple2<Integer, CentroidSketch>> call( Iterator<Tuple2<Integer, CentroidSketch>> split) throws DistributedWekaException { int runNumber = -1; CentroidSketch initial = null; List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>(); while (split.hasNext()) { Tuple2<Integer, CentroidSketch> partial = split.next(); if (runNumber < 0) { runNumber = partial._1().intValue(); } else { if (partial._1().intValue() != runNumber) { throw new DistributedWekaException("[k-means] k-means|| initialization: " + "was not expecting the run number to change within " + "a partition!"); } } if (initial == null) { initial = partial._2(); } else { try { initial.aggregateReservoir(partial._2().getReservoirSample()); } catch (Exception e) { throw new DistributedWekaException(e); } } // get all the distance functions and // compute priming data that has global // min and maxes. if (iterationNum == 0) { // only need to determine global distance function // priming data once (i.e. in the first iteration of // the k-means|| process) distsForRun.add(partial._2().getDistanceFunction()); } } // update the distance function with global numeric // attribute ranges if (distsForRun.size() > 0) { Instances distancePrimingData = KMeansReduceTask .computeDistancePrimingDataFromDistanceFunctions(distsForRun, transformedHeaderNoSummary); initial.getDistanceFunction().setInstances(distancePrimingData); } m_resultsForRun.add(new Tuple2<Integer, CentroidSketch>(runNumber, initial)); return m_resultsForRun; } }); List<Tuple2<Integer, CentroidSketch>> runResults = reducedByRun.collect(); mapRuns.unpersist(); mapRuns = null; for (Tuple2<Integer, CentroidSketch> r : runResults) { int runNum = r._1().intValue(); sketches[runNum] = r._2(); // add the current contents of the reservoir to the sketch // for each run try { sketches[runNum].addReservoirToCurrentSketch(); if (m_debug) { logMessage("[k-means] Iteration: " + i + " - number of instances in sketch: " + sketches[runNum].getCurrentSketch().numInstances() + "\n" + sketches[runNum].getCurrentSketch()); } } catch (Exception ex) { logMessage(ex); throw new DistributedWekaException(ex); } } reducedByRun.unpersist(); } // perform and aggregate clustering using the final sketch results // so that we can find out how many points are assigned to // each instance in the sketch. 
Instances globalPriming = sketches[0].getDistanceFunction().getInstances(); if (globalPriming.numInstances() != 2) { logMessage("[k-means] Error: as expecting a two instance " + "(global priming data) dataset to be set in the distance function " + "in each sketch!"); throw new DistributedWekaException("Was expecting a two instance (global priming data)" + " dataset to be set in the distance function in each sketch!"); } for (int i = 0; i < numRuns; i++) { // set sketches as centers for map tasks // in preparation for clustering (so that we can) // find out how many training points get assigned to // each center mapTasks[i].setCentroids(sketches[i].getCurrentSketch()); mapTasks[i].setDummyDistancePrimingData(globalPriming); } // 3 & 4) Get final sketch for each run and weight each point in // the sketch by the number of training instances that cluster to it List<Tuple2<Integer, KMeansReduceTask>> clusterAssignments = performKMeansIteration(dataset, mapTasks, new boolean[numRuns], 1, transformedHeaderNoSummary); List<Instances> finalStartPointsForRuns = new ArrayList<Instances>(); for (int i = 0; i < numRuns; i++) { int rN = clusterAssignments.get(i)._1().intValue(); List<Instances> centroidSummaries = clusterAssignments.get(i)._2().getAggregatedCentroidSummaries(); Instances sketchForRun = sketches[i].getCurrentSketch(); // empty clusters shouldn't be a problem - in // one iteration each sketch member should at minimum // have itself assigned (i.e. count >= 1). NOTE: The only exception // could occur if the sketch contains duplicate instances. However, // this shouldn't happen within a single WeightedReservoirSampling // as candidate instances with weight 0 (i.e. distance 0 to the sketch // in this case) are never added to the sketch. if (centroidSummaries.size() != sketchForRun.numInstances()) { logMessage("[k-means] Error: was expecting as " + "many summary headers as \n" + "there are center candidates in the sketch for run " + rN); throw new DistributedWekaException("Was expecting as many summary headers as " + "there are center candidates in the sketch for run " + rN); } for (int j = 0; j < sketchForRun.numInstances(); j++) { Instance centerCandidate = sketchForRun.instance(j); Instances centerStats = centroidSummaries.get(j); double weightForCandidate = -1.0; // now grab the first summary attribute and get count for (int k = 0; k < sketchForRun.numAttributes(); k++) { if (sketchForRun.attribute(k).isNumeric()) { Attribute statsAtt = centerStats .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + sketchForRun.attribute(k).name()); weightForCandidate = ArffSummaryNumericMetric.COUNT.valueFromAttribute(statsAtt) + ArffSummaryNumericMetric.MISSING.valueFromAttribute(statsAtt); break; } else if (sketchForRun.attribute(k).isNominal()) { Attribute statsAtt = centerStats .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + sketchForRun.attribute(k).name()); NominalStats ns = NominalStats.attributeToStats(statsAtt); weightForCandidate = 0; for (String s : ns.getLabels()) { weightForCandidate += ns.getCount(s); } weightForCandidate += ns.getNumMissing(); } } if (weightForCandidate < 0) { logMessage("[k-means] Error: unable to compute the " + "number of training instances " + "assigned to sketch member " + j + " in run " + i); throw new DistributedWekaException("Unable to compute the number of training instances " + "assigned to sketch member " + j + " in run " + i); } // finally - set the weight centerCandidate.setWeight(weightForCandidate); } if (m_debug) { 
logMessage("Final weighted sketch (run " + i + ") prior to local KMeans:\n" + sketchForRun); } // now run standard k-means on the weighted sketch to // (hopefully) get the requested number of start points SimpleKMeans localKMeans = new SimpleKMeans(); try { localKMeans.setNumClusters(numClusters); localKMeans.setInitializationMethod( new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION)); localKMeans.buildClusterer(sketchForRun); finalStartPointsForRuns.add(localKMeans.getClusterCentroids()); } catch (Exception ex) { logMessage(ex); throw new DistributedWekaException(ex); } } m_distanceFunctionPrimingData = globalPriming; return finalStartPointsForRuns; }