Java tutorial

This tutorial presents MatchingGeoPolygon, an Apache Spark job that matches polygons from a government data source ("Pref", e.g. the Curitiba city hall) against polygons extracted from OpenStreetMap (OSM). Each candidate pair is scored twice: with a Jaccard similarity over the lowercased polygon names and with a geometric similarity over the shapes. A pair whose two scores exceed the given thresholds is classified as a MATCH, a pair below both thresholds as a NON_MATCH, and everything else as a POSSIBLE_PROBLEM. The supporting classes (GeoPolygon, PolygonPair, AbstractExec, StorageManager, and so on) come from the project's PolygonDependencies and genericEntity packages. The complete listing follows.
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package PolygonMatching;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.collections.IteratorUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;

import scala.Tuple2;
import uk.ac.shef.wit.simmetrics.similaritymetrics.JaccardSimilarity;

import PolygonDependencies.GeoPolygon;
import PolygonDependencies.InputTypes;
import PolygonDependencies.PolygonClassification;
import PolygonDependencies.PolygonPair;
import genericEntity.datasource.DataSource;
import genericEntity.exec.AbstractExec;
import genericEntity.util.data.GenericObject;
import genericEntity.util.data.storage.StorageManager;

public final class MatchingGeoPolygon {

    private static final Pattern SPACE = Pattern.compile(" "); // (unused in this listing)
    private static final int rangeBlockingKey = 7;             // (unused in this listing)

    public static void main(String[] args) throws Exception {
        // SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark").setMaster("local");
        SparkConf sparkConf = new SparkConf().setAppName("GeoMatchingSpark");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);

        String dataSource1 = args[0];
        String dataSource2 = args[1];
        final double thresholdLinguistic = Double.parseDouble(args[2]);
        final double thresholdPolygon = Double.parseDouble(args[3]);
        String outputPath = args[4];
        Integer amountPartition = Integer.parseInt(args[5]);
        String sourceType = args[6];

        DataSource dataSourcePref = null;
        DataSource dataSourceOSM = null;
        if (sourceType.equals("CSV")) {
            dataSourcePref = AbstractExec.getDataCSV(dataSource1, ';');
            dataSourceOSM = AbstractExec.getDataCSV(dataSource2, ';');
        } else { // any other value means PostGIS
            dataSourcePref = AbstractExec.getDataPostGres(dataSource1);
            dataSourceOSM = AbstractExec.getDataPostGres(dataSource2);
        }

        // Example PostGIS query files:
        // DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/squares_pref_curitiba.txt"); // squares of Curitiba (Pref)
        // DataSource dataSourceOSM  = AbstractExec.getDataPostGres("queries/osm_curitiba.txt");          // squares of Curitiba (OSM)
        // DataSource dataSourcePref = AbstractExec.getDataPostGres("queries/parks_pref_ny.txt");         // parks of NY (Pref)
        // DataSource dataSourceOSM  = AbstractExec.getDataPostGres("queries/osm_ny.txt");                // parks of NY (OSM)

        StorageManager storagePref = new StorageManager();
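
        // Replication-based blocking (rationale inferred from the code below):
        // comparing every Pref polygon against every OSM polygon in one task would
        // not parallelize, so the data is split into numReplication buckets. Each
        // OSM polygon is routed to exactly one bucket via (geometry id % numReplication),
        // while every GOV polygon is copied into all buckets, so each bucket can be
        // matched independently without missing any OSM/GOV pair.
        // Example: with numReplication = 4, the OSM polygon with geometry id 10 lands
        // only in bucket 2 (10 % 4 = 2), but every GOV polygon appears in buckets 0..3.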
        StorageManager storageOSM = new StorageManager();

        // enables in-memory execution for faster processing;
        // this can be done since the whole data set fits into memory
        storagePref.enableInMemoryProcessing();
        storageOSM.enableInMemoryProcessing();

        // adds the data sources to the storage managers
        storagePref.addDataSource(dataSourcePref);
        storageOSM.addDataSource(dataSourceOSM);

        if (!storagePref.isDataExtracted()) {
            storagePref.extractData();
        }
        if (!storageOSM.isDataExtracted()) {
            storageOSM.extractData();
        }

        List<GeoPolygon> geoentitiesPref = new ArrayList<GeoPolygon>();
        List<GeoPolygon> geoentitiesOSM = new ArrayList<GeoPolygon>();

        // wraps each extracted record in a GeoPolygon, skipping unnamed records
        int indexOfPref = 0;
        for (GenericObject genericObj : storagePref.getExtractedData()) {
            String nome = "";
            Integer id;
            // for Curitiba use attribute "nome"; for New York, "signname"
            if (!genericObj.getData().get("name").toString().equals("null")) {
                nome = genericObj.getData().get("name").toString();
                // for Curitiba use attribute "gid"; for New York, "id"
                id = Integer.parseInt(genericObj.getData().get("id").toString());
                geoentitiesPref.add(new GeoPolygon(genericObj.getData().get("geometry").toString(),
                        nome, InputTypes.GOV_POLYGON, indexOfPref, id));
                indexOfPref++;
            }
        }

        int indexOfOSM = 0;
        for (GenericObject genericObj : storageOSM.getExtractedData()) {
            String nome = "";
            Integer id;
            if (!genericObj.getData().get("name").toString().equals("null")) {
                nome = genericObj.getData().get("name").toString();
                id = Integer.parseInt(genericObj.getData().get("id").toString());
                geoentitiesOSM.add(new GeoPolygon(genericObj.getData().get("geometry").toString(),
                        nome, InputTypes.OSM_POLYGON, indexOfOSM, id));
                indexOfOSM++;
            }
        }

        JavaRDD<GeoPolygon> polygonsOSM = ctx.parallelize(geoentitiesOSM);
        JavaRDD<GeoPolygon> polygonsPref = ctx.parallelize(geoentitiesPref);
        JavaRDD<GeoPolygon> polygons = polygonsPref.union(polygonsOSM);

        final Broadcast<Integer> numReplication = ctx.broadcast(amountPartition);

        JavaRDD<Tuple2<Integer, GeoPolygon>> polygonLabeled = polygons
                .flatMap(new FlatMapFunction<GeoPolygon, Tuple2<Integer, GeoPolygon>>() {
                    public Iterator<Tuple2<Integer, GeoPolygon>> call(GeoPolygon s) throws Exception {
                        List<Tuple2<Integer, GeoPolygon>> listOfPolygonTuple =
                                new ArrayList<Tuple2<Integer, GeoPolygon>>();
                        if (s.getType().equals(InputTypes.OSM_POLYGON)) {
                            // an OSM polygon is routed to exactly one bucket
                            listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(
                                    s.getIdGeometry() % numReplication.value(), s));
                            return listOfPolygonTuple.iterator();
                        } else { // InputTypes.GOV_POLYGON
                            // a government polygon is replicated into every bucket
                            for (int i = 0; i < numReplication.value(); i++) {
                                listOfPolygonTuple.add(new Tuple2<Integer, GeoPolygon>(i, s));
                            }
                            return listOfPolygonTuple.iterator();
                        }
                    }
                });

        // converts the RDD of tuples into a JavaPairRDD keyed by bucket number
        JavaPairRDD<Integer, GeoPolygon> polygonsPaired = polygonLabeled
                .mapToPair(new PairFunction<Tuple2<Integer, GeoPolygon>, Integer, GeoPolygon>() {
                    public Tuple2<Integer, GeoPolygon> call(Tuple2<Integer, GeoPolygon> tuple) throws Exception {
                        return new Tuple2<Integer, GeoPolygon>(tuple._1(), tuple._2());
                    }
                });

        // groups all polygons that share a key; amountPartition sets the number of partitions
        JavaPairRDD<Integer, Iterable<GeoPolygon>> polygonsGrouped =
                polygonsPaired.groupByKey(amountPartition);
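
        // Within each bucket, candidate pairs are built and scored. The rule applied
        // below uses the two thresholds passed on the command line:
        //   - both similarities strictly above their thresholds -> MATCH
        //   - both similarities strictly below their thresholds -> NON_MATCH
        //   - any other combination (mixed, or equal to a threshold) -> POSSIBLE_PROBLEM
        // Only MATCH and POSSIBLE_PROBLEM pairs are kept in the output.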
        JavaPairRDD<Integer, PolygonPair> matches = polygonsGrouped.flatMapToPair(
                new PairFlatMapFunction<Tuple2<Integer, Iterable<GeoPolygon>>, Integer, PolygonPair>() {
                    public Iterator<Tuple2<Integer, PolygonPair>> call(
                            Tuple2<Integer, Iterable<GeoPolygon>> tuple) throws Exception {
                        @SuppressWarnings("unchecked")
                        List<GeoPolygon> polygonsPerKey = IteratorUtils.toList(tuple._2().iterator());
                        List<GeoPolygon> polygonsSource = new ArrayList<GeoPolygon>();
                        List<GeoPolygon> polygonsTarget = new ArrayList<GeoPolygon>();
                        for (GeoPolygon entity : polygonsPerKey) {
                            if (entity.getType() == InputTypes.OSM_POLYGON) {
                                polygonsSource.add(entity);
                            } else {
                                polygonsTarget.add(entity);
                            }
                        }

                        List<Tuple2<Integer, PolygonPair>> entityMatches =
                                new ArrayList<Tuple2<Integer, PolygonPair>>();
                        JaccardSimilarity jaccard = new JaccardSimilarity();
                        for (GeoPolygon entSource : polygonsSource) {
                            for (GeoPolygon entTarget : polygonsTarget) {
                                // linguistic similarity between the lowercased polygon names
                                double linguisticSimilarity = 0.0;
                                if (!entTarget.getGeoName().isEmpty()) {
                                    linguisticSimilarity = jaccard.getSimilarity(
                                            entTarget.getGeoName().toLowerCase(),
                                            entSource.getGeoName().toLowerCase());
                                }

                                // geometric similarity between the polygon shapes
                                double polygonSimilarity = entSource.getPolygonSimilarity(entTarget);

                                // classification of the pair
                                PolygonPair pair;
                                if (linguisticSimilarity > thresholdLinguistic
                                        && polygonSimilarity > thresholdPolygon) {
                                    pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                            polygonSimilarity, PolygonClassification.MATCH);
                                } else if (linguisticSimilarity < thresholdLinguistic
                                        && polygonSimilarity < thresholdPolygon) {
                                    pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                            polygonSimilarity, PolygonClassification.NON_MATCH);
                                } else {
                                    pair = new PolygonPair(entSource, entTarget, linguisticSimilarity,
                                            polygonSimilarity, PolygonClassification.POSSIBLE_PROBLEM);
                                }

                                // only matches and possible problems are emitted (use case 04)
                                if (pair.getPolygonClassification().equals(PolygonClassification.POSSIBLE_PROBLEM)
                                        || pair.getPolygonClassification().equals(PolygonClassification.MATCH)) {
                                    int index = entityMatches.size();
                                    entityMatches.add(new Tuple2<Integer, PolygonPair>(index, pair));
                                }
                            }
                        }
                        return entityMatches.iterator();
                    }
                });

        // serializes each surviving pair as a CSV line and writes the result
        matches.flatMap(new FlatMapFunction<Tuple2<Integer, PolygonPair>, String>() {
            public Iterator<String> call(Tuple2<Integer, PolygonPair> t) throws Exception {
                ArrayList<String> listOutput = new ArrayList<String>();
                listOutput.add(t._2().toStringCSV());
                return listOutput.iterator();
            }
        }).saveAsTextFile(outputPath);

        ctx.stop();
        ctx.close();
    }
}
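
The linguistic score in the matching loop comes from the simmetrics JaccardSimilarity class that the listing imports. The snippet below is a minimal standalone sketch of that single step, using made-up square names; getSimilarity is the same call used above, and only the example strings are invented.

import uk.ac.shef.wit.simmetrics.similaritymetrics.JaccardSimilarity;

public class JaccardExample {
    public static void main(String[] args) {
        JaccardSimilarity jaccard = new JaccardSimilarity();
        // names are lowercased before comparison, as in the matching loop
        double same = jaccard.getSimilarity("praça tiradentes", "praça tiradentes");
        double partial = jaccard.getSimilarity("praça tiradentes", "praça gen. tiradentes");
        System.out.println(same);    // 1.0: identical names
        System.out.println(partial); // below 1.0: the token sets only partially overlap
    }
}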
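
To run the job, submit the class to Spark with the seven positional arguments read at the top of main: the two data sources, the linguistic and polygon thresholds, the output path, the number of partitions, and the source type ("CSV", or anything else for PostGIS). The command below is a hypothetical example: the jar name, thresholds, partition count, and output path are placeholders, and the query file names are taken from the commented-out examples in the listing.

spark-submit --class PolygonMatching.MatchingGeoPolygon polygon-matching.jar \
    queries/squares_pref_curitiba.txt queries/osm_curitiba.txt \
    0.5 0.5 output/curitiba-matches 8 POSTGIS

The matched and possible-problem pairs are written to the output path as CSV lines, one part file per partition.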