uk.ac.susx.tag.method51.twitter.geocoding.geonames.GeonamesSPARQLLocationDatabase.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.susx.tag.method51.twitter.geocoding.geonames.GeonamesSPARQLLocationDatabase.java

Source

package uk.ac.susx.tag.method51.twitter.geocoding.geonames;

/*
 * #%L
 * GeonamesSPARQLLocationDatabase.java - method51 - University of Sussex - 2013
 * %%
 * Copyright (C) 2013 - 2014 University of Sussex
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.google.common.io.Resources;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.susx.mlcl.lib.io.Files;
import uk.ac.susx.tag.method51.core.MiscUtil;
import uk.ac.susx.tag.method51.twitter.geocoding.*;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.*;

/**
 * User: sw206
 * Date: 03/01/2013
 * Time: 14:53
 */
public class GeonamesSPARQLLocationDatabase implements LocationDatabase {

    private static final Logger LOG = LoggerFactory.getLogger(GeonamesSPARQLLocationDatabase.class);

    private final String sparqlQuery;

    private static final String POPULATION_KEY = "population";
    private static final String FEATURE_CODE_KEY = "code";
    private static final String LATITUDE_KEY = "lat";
    private static final String LONGITUDE_KEY = "lon";

    private static final String CODE_PREFIX = "http://www.geonames.org/ontology#";

    private final Map<String, Integer> featureCodeScores;

    private final double populationScore;

    private final int populationThreshold;

    private final double defaultScore;

    private final double distanceTolerance; //km

    private final String host;
    private final int port;

    public GeonamesSPARQLLocationDatabase() throws IOException {
        this("127.0.0.1", 3030);
    }

    public GeonamesSPARQLLocationDatabase(String host, int port) throws IOException {

        this.host = host;
        this.port = port;

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                Resources.getResource(this.getClass(), "location_lookup.rq").openStream()));) {
            sparqlQuery = Files.getText(reader).toString();
        }

        featureCodeScores = new HashMap<>();

        featureCodeScores.put(CODE_PREFIX + "P.PPLL", 3);
        featureCodeScores.put(CODE_PREFIX + "P.PPL", 4);
        featureCodeScores.put(CODE_PREFIX + "P.PPLX", 5);
        featureCodeScores.put(CODE_PREFIX + "P.PPLA4", 6);
        featureCodeScores.put(CODE_PREFIX + "P.PPLA3", 7);
        featureCodeScores.put(CODE_PREFIX + "P.PPLA2", 8);
        featureCodeScores.put(CODE_PREFIX + "P.PPLA", 9);
        featureCodeScores.put(CODE_PREFIX + "P.PPLC", 10);

        featureCodeScores.put(CODE_PREFIX + "A.ADM1", 10);
        featureCodeScores.put(CODE_PREFIX + "A.ADM2", 10);
        featureCodeScores.put(CODE_PREFIX + "A.ADM3", 10);
        featureCodeScores.put(CODE_PREFIX + "A.ADM4", 10);
        featureCodeScores.put(CODE_PREFIX + "A.ADM5", 10);

        populationScore = 25;

        double gbScore = 0;

        defaultScore = 3;

        populationThreshold = 2000000;

        distanceTolerance = 20;
    }

    /**
     * Performs a request to the SPARQL server and parses the response.
     *
     * @param candidate
     * @return list of matching entries
     * @throws java.io.IOException
     */
    @Override
    public List<LocationMatch> query(LocationCandidate candidate) throws IOException {

        String query = String.format(sparqlQuery, candidate.getCandidateString(), populationThreshold);

        List<LocationMatch> matches = null;

        try {

            String response = makeRequest(query);

            //System.out.println(response);

            matches = parseResponse(response, candidate);

            collapseCloseMatches(matches);

            if (matches.size() > 1) {

                matches = rankMatches(matches);
            }
        } catch (URISyntaxException | ParseException | LocationUnresolvedException e) {
            LOG.error(e.getMessage(), e);
        }

        return matches;
    }

    public String makeRequest(String query) throws IOException, URISyntaxException {

        HttpGet get = new HttpGet();

        String queryString = "query=" + URLEncoder.encode(query, "UTF-8");

        queryString += "&output=json";

        String locationLookupURI = "/ds/query";
        int locationLookupPort = port;
        String locationLookupHost = host;
        URI uri = URIUtils.createURI("http", locationLookupHost, locationLookupPort, locationLookupURI, queryString,
                null);

        get.setURI(uri);

        HttpClient httpClient = new DefaultHttpClient();

        ResponseHandler<String> responseHandler = new BasicResponseHandler();

        String responseBody = httpClient.execute(get, responseHandler);

        httpClient.getConnectionManager().shutdown();

        return responseBody;
    }

    private List<LocationMatch> parseResponse(String response, LocationCandidate candidate)
            throws ParseException, LocationUnresolvedException {

        JSONParser jsonParser = new JSONParser();

        List<LocationMatch> matches = new LinkedList<>();

        JSONObject root = (JSONObject) jsonParser.parse(response);

        JSONArray bindings = (JSONArray) ((Map) root.get("results")).get("bindings");

        for (Object object1 : bindings) {
            JSONObject binding = (JSONObject) object1;

            Map<String, String> data = new HashMap<>();

            for (Object object2 : binding.entrySet()) {
                Map.Entry field = (Map.Entry) object2;

                String key = (String) field.getKey();
                String value = (String) ((Map) field.getValue()).get("value");

                data.put(key, value);
            }

            LocationMatch match = new LocationMatch();

            match.data = data;
            if (candidate != null) {
                match.matchingText = candidate.getCandidateString();
                match.matchBegin = candidate.getBegin();
                match.matchEnd = candidate.getEnd();
            }

            String lat = data.get(LATITUDE_KEY);
            String lon = data.get(LONGITUDE_KEY);

            if (lat == null || lon == null) {
                throw new LocationUnresolvedException("null lat or lon in geonames db entry " + data.get("x"));
            }

            try {
                match.lat = Double.parseDouble(lat);
                match.lon = Double.parseDouble(lon);
            } catch (NumberFormatException e) {
                throw new LocationUnresolvedException("could not parse lat / lon to numbers " + lat + " / " + lon);
            }

            matches.add(match);

            //System.out.println((String)entry.get("x"));
        }

        return matches;
    }

    private void collapseCloseMatches(List<LocationMatch> match) {

        int i = 0;
        while (i < match.size()) {
            LocationMatch mI = match.get(i);
            int j = i + 1;
            while (j < match.size()) {
                LocationMatch mJ = match.get(j);

                if (euclideanDistance(mI, mJ) < distanceTolerance) {
                    match.remove((mJ));
                }

                ++j;
            }
            ++i;
        }
    }

    /**
     * Scores matches for the same mention by featureCode and population attributes
     *
     * @param matches
     */
    public List<LocationMatch> rankMatches(List<LocationMatch> matches) {

        double[] scores = new double[matches.size()];
        Arrays.fill(scores, 0.0);

        int i = 0;
        int totalPop = 0;

        for (LocationMatch match : matches) {

            try {

                scores[i] = featureCodeScores.get(match.data.get(FEATURE_CODE_KEY));
            } catch (NullPointerException e) {

                scores[i] = defaultScore;
            }

            if (match.data.containsKey(POPULATION_KEY)) {

                try {

                    totalPop += Integer.parseInt(match.data.get(POPULATION_KEY));
                } catch (NumberFormatException e) {
                    LOG.error("Exception caught", e);
                }
            }

            ++i;
        }

        i = 0;
        for (LocationMatch match : matches) {

            if (match.data.containsKey(POPULATION_KEY)) {
                String pop = match.data.get(POPULATION_KEY);

                try {
                    int population = Integer.parseInt(pop);
                    double score = populationScore * (population / (double) totalPop);

                    scores[i] += score;
                } catch (NumberFormatException e) {
                    LOG.error("POPULATION: Couldn't parse " + pop + " to a number!", e);
                }
            }

            ++i;
        }

        //System.out.println(Arrays.toString(scores));
        //normalise
        double total = 0;
        for (double score : scores) {
            total += score;
        }

        double[] likelihoods = new double[scores.length];
        for (i = 0; i < scores.length; ++i) {
            likelihoods[i] = scores[i] / total;
        }

        i = 0;
        for (LocationMatch match : matches) {
            match.likelihood = likelihoods[i];
            match.score = scores[i];
            ++i;
        }

        Collections.sort(matches, new LocationMatchLikelihoodComparator());
        if (matches.size() > 30) {
            matches = matches.subList(0, 30);
            rankMatches(matches);
        }
        return matches;
    }

    public double euclideanDistance(LocationMatch m1, LocationMatch m2) {

        double[] p1 = new double[2];

        p1[0] = m1.lat;
        p1[1] = m1.lon;

        double[] p2 = new double[2];

        p2[0] = m2.lat;
        p2[1] = m2.lon;

        return euclideanDistance(p1, p2);
    }

    public static double euclideanDistance(double[] p1, double[] p2) {

        double EARTH_RADIUS = 6371;

        double lat1 = p1[0];
        double lon1 = p1[1];

        double lat2 = p2[0];
        double lon2 = p2[1];

        double dLat = Math.toRadians(lat2 - lat1);
        double dLng = Math.toRadians(lon2 - lon1);
        double a = Math.sin(dLat / 2) * Math.sin(dLat / 2) + Math.cos(Math.toRadians(lat1))
                * Math.cos(Math.toRadians(lat2)) * Math.sin(dLng / 2) * Math.sin(dLng / 2);
        double c = 2 * Math.atan2(Math.sqrt(a), Math.sqrt(1 - a));

        double x = (lon2 - lon1) * Math.cos((lat1 + lat2) / 2);

        double y = (lat2 - lat1);

        //double dist = Math.sqrt( x*x + y*y ) * EARTH_RADIUS;

        double dist = c * EARTH_RADIUS;
        return dist;
    }
}