opennlp.addons.geoentitylinker.indexing.USGSProcessor.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.addons.geoentitylinker.indexing.USGSProcessor.java

Source

/*
 * Copyright 2014 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.addons.geoentitylinker.indexing;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.addons.geoentitylinker.AdminBoundary;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

import org.apache.lucene.index.IndexWriter;

public class USGSProcessor {

    public static void main(String[] args) {
        try {
            Map<String, AdminBoundary> provData = getProvData(
                    new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),
                    GazetteerIndexer.GazType.USGS);
            process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),
                    new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null);
        } catch (Exception ex) {
            Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w)
            throws Exception {
        Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS);
        readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData);
        writeCountryContextFile(outputCountryContextfile, provData);
    }

    public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type,
            Map<String, AdminBoundary> lookupMap) throws Exception {

        Map<String, StateCentroid> states = new HashMap<>();
        BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
        List<String> fields = new ArrayList<>();
        int counter = 0;
        System.out.println("reading gazetteer data from USGS file...........");
        String line = "";
        while ((line = reader.readLine()) != null) {

            String[] values = line.split(type.getSeparator());
            if (counter == 0) {
                for (String columnName : values) {
                    fields.add(columnName.replace("", "").trim());
                }

            } else {
                Document doc = new Document();
                for (int i = 0; i < fields.size() - 1; i++) {
                    doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
                }
                String placeName = values[1];
                String lat = values[9];
                String lon = values[10];
                String dsg = values[2];
                String id = values[0];

                String ccode = values[6];
                String admincode = values[3];
                AdminBoundary get = lookupMap.get(admincode + "." + ccode);
                String countyname = "";
                if (get == null) {
                    System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: "
                            + admincode + " , placename: " + placeName);
                    continue;

                }
                String countyCode = get.getCountyCode();

                if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
                    countyname = get.getCountyName();
                }
                if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {
                    countyCode = get.getCountyCode();
                }
                String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", "
                        + placeName;

                if (states.containsKey(get.getProvinceName())) {
                    StateCentroid entry = states.get(get.getProvinceName());
                    entry.count++;
                    entry.latSum += Double.valueOf(lat);
                    entry.longSum += Double.valueOf(lon);
                } else {
                    StateCentroid centroid = new StateCentroid();
                    centroid.statecode = get.getProvCode();
                    centroid.count = 1;
                    centroid.latSum = Double.valueOf(lat);
                    centroid.longSum = Double.valueOf(lon);
                    states.put(get.getProvinceName(), centroid);
                }

                doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
                doc.add(new TextField("placename", placeName, Field.Store.YES));
                doc.add(new TextField("latitude", lat, Field.Store.YES));
                doc.add(new TextField("longitude", lon, Field.Store.YES));
                doc.add(new StringField("loctype", dsg, Field.Store.YES));
                doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(),
                        Field.Store.YES));
                doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));
                doc.add(new StringField("countycode",
                        (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(),
                        Field.Store.YES));

                doc.add(new StringField("locid", id, Field.Store.YES));
                doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
                w.addDocument(doc);
            }
            counter++;
            if (counter % 100000 == 0) {
                w.commit();
                System.out.println(counter + " .........USGS entries committed to index..............");
            }

        }

        for (String state : states.keySet()) {
            StateCentroid get = states.get(state);
            Document doc = new Document();
            doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
            doc.add(new TextField("placename", state, Field.Store.YES));
            //calculate a centroid for all the points that were in the state
            doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES));
            doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES));
            doc.add(new StringField("loctype", "adm1", Field.Store.YES));
            doc.add(new StringField("admincode", get.statecode, Field.Store.YES));
            doc.add(new StringField("countrycode", "us", Field.Store.YES));
            doc.add(new StringField("countycode", "", Field.Store.YES));

            doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
            doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
            w.addDocument(doc);

            // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));
        }
        Document doc = new Document();
        doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
        doc.add(new TextField("placename", "united states", Field.Store.YES));
        //calculate a centroid for all the points that were in the state
        doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
        doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
        doc.add(new StringField("loctype", "pcli", Field.Store.YES));
        doc.add(new StringField("admincode", "", Field.Store.YES));
        doc.add(new StringField("countrycode", "us", Field.Store.YES));
        doc.add(new StringField("countycode", "", Field.Store.YES));

        doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
        doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
        //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));

        w.addDocument(doc);
        w.commit();

        System.out.println("Completed indexing USGS gaz!");
    }

    private static class StateCentroid {

        double latSum;
        double longSum;
        String statecode;
        int count;
    }

    private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
        System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
        Map<String, AdminBoundary> outmap = new HashMap<>();
        BufferedReader reader;

        try {

            reader = new BufferedReader(new FileReader(govUnitsFile));
            int i = 0;
            String line = "";
            String[] fields = null;
            while ((line = reader.readLine()) != null) {

                String[] values = line.split(type.getSeparator());
                if (i == 0) {
                    fields = values;
                    i++;
                    continue;
                }
                i++;
                // System.out.println(i);
                String countyCode = values[2];
                String countyName = values[3];
                String stateCode = values[5];
                String stateName = values[6];
                String countryCode = values[7];
                String countryName = values[8];
                AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName,
                        countyCode, countyName, null, null, null);
                outmap.put(stateCode + "." + countyCode, adminBoundary);
                //  System.out.println(adminBoundary);

            }
            reader.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
        System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());

        return outmap;

    }

    public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
        // FileWriter writer = null;
        try (FileWriter writer = new FileWriter(outfile, true)) {

            for (String admkey : adms.keySet()) {
                AdminBoundary adm = adms.get(admkey);
                if (adm == null) {
                    continue;
                }
                String province = adm.getProvinceName();
                String country = adm.getCountryName();
                /**
                 * this is the standard format of the country context file... Geonames
                 * data will have an empty string for the county
                 */
                String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t"
                        + country + "\t" + province + "\t" + adm.getCountyName() + "\t"
                        + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t"
                        + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
                writer.write(line);
                ///  System.out.println(line);

            }
        } catch (IOException ex) {
            Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);
        }
        System.out.println("successfully wrote USGS entries to country context file");
    }
}