opennlp.addons.geoentitylinker.indexing.GazetteerIndexer.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.addons.geoentitylinker.indexing.GazetteerIndexer.java

Source

/*
 * Copyright 2013 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.addons.geoentitylinker.indexing;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

/**
 * Builds the Lucene gazetteer index used by the GeoEntityLinker.
 *
 * <p>A single index is written to the sub-directory
 * {@code opennlp_geoentitylinker_gazetteer} of the chosen output directory. It
 * is populated, in order, by {@code USGSProcessor}, {@code GeonamesProcessor}
 * and {@code RegionProcessor}, which are also handed the country context file
 * whose header line is written here.
 */
public class GazetteerIndexer {

    /** Path suffix appended to the output directory to form the index location. */
    private static final String INDEX_DIR_SUFFIX = "/opennlp_geoentitylinker_gazetteer";

    /** Tab separated column names written as the first line of the country context file. */
    private static final String COUNTRY_CONTEXT_HEADER
            = "countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname\tcountryregex\tprovregex\tcountyregex\n";

    /** Fields that are indexed verbatim (as a single token) instead of being tokenized. */
    private static final String[] KEYWORD_FIELDS = {
        "countrycode", "admincode", "loctype", "countycode", "gazsource"
    };

    /**
     * Command line entry point. Expects exactly eight arguments, in the order of
     * the parameters of
     * {@link #index(File, File, File, File, File, File, File, File)}; prints a
     * usage message and returns otherwise. Any exception raised while indexing
     * is reported with a stack trace on standard error.
     *
     * @param args geonamesData, geoNamesCountryInfo, geonamesAdmin1CodesASCII,
     * usgsDataFile, usgsGovUnitsFile, outputIndexDir, outputCountryContextFile,
     * regionsFile
     */
    public static void main(String[] args) {

        if (args.length != 8) {
            System.out.println("Usage: GazetteerIndexer geonamesData geoNamesCountryInfo geonamesAdmin1CodesASCII "
                    + "usgsDataFile usgsGovUnitsFile outputIndexDir outputCountryContextFile regionsFile");
            System.out.println();
            System.out
                    .println("The GazetteerIndexer.index methods javadoc explains how to retrieve the data files.");
            return;
        }

        File geonamesData = new File(args[0]);
        File geoNamesCountryInfo = new File(args[1]);
        File geonamesAdmin1CodesASCII = new File(args[2]);
        File usgsDataFile = new File(args[3]);
        File usgsGovUnitsFile = new File(args[4]);
        File outputIndexDir = new File(args[5]);
        File outputCountryContextFile = new File(args[6]);
        File regionsFile = new File(args[7]);

        try {
            GazetteerIndexer i = new GazetteerIndexer();
            i.index(geonamesData, geoNamesCountryInfo, geonamesAdmin1CodesASCII, usgsDataFile, usgsGovUnitsFile,
                    outputIndexDir, outputCountryContextFile, regionsFile);
        } catch (Exception ex) {
            // Top-level boundary of the command line tool: report the failure and return.
            ex.printStackTrace();
        }
    }

    /**
     * Creates an indexer. The instance holds no state.
     */
    public GazetteerIndexer() {

    }

    /**
     * Something that knows the column separator of its data file.
     */
    public interface Separable {

        /**
         * @return the column separator of the associated data file
         */
        String getSeparator();
    }

    /**
     * The gazetteer sources. {@link #toString()} of each constant yields a
     * path-like index name and {@link #getSeparator()} the column separator of
     * the source's data file.
     */
    public enum GazType implements Separable {

        GEONAMES {
            @Override
            public String toString() {
                return "/opennlp_geoentitylinker_geonames_idx";
            }

            @Override
            public String getSeparator() {
                return "\t";
            }
        },
        USGS {
            @Override
            public String toString() {
                return "/opennlp_geoentitylinker_usgsgaz_idx";
            }

            @Override
            public String getSeparator() {
                // A backslash-escaped pipe, i.e. the pipe character written as a regular expression.
                return "\\|";
            }
        }
    }

    /**
     * Builds the gazetteer index and the country context file from the given
     * input files.
     *
     * @param geonamesData the actual Geonames gazetteer data downloaded from
     * here: http://download.geonames.org/export/dump/ then click on this link
     * 'allCountries.zip'
     * @param geoNamesCountryInfo the countryinfo lookup table that can be
     * downloaded from here
     * http://download.geonames.org/export/dump/countryInfo.txt You'll need to
     * copy the page into a file or scrape it
     * @param geonamesAdmin1CodesASCII The lookup data for the province names for
     * each place found here:
     * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight the
     * table view, and copy results into a text file. Make sure the tab delimited
     * format is maintained.
     * @param usgsDataFile the actual USGS gazetteer downloaded from here:
     * http://geonames.usgs.gov/domestic/download_data.htm click on the
     * national_file####.zip link to get all the most recent features
     *
     * @param usgsGovUnitsFile go to here:
     * http://geonames.usgs.gov/domestic/download_data.htm in the section titled
     * "Topical Gazetteers -- File Format" click on the drop down list and select
     * "Government Units". The downloaded file is what you need for this param.
     * @param outputIndexDir where you want the final index. Must be an existing
     * directory, not an actual file.
     * @param outputCountryContextFile The output countrycontext file. This is a
     * very important file used inside the GeoEntityLinker to assist in toponym
     * resolution. It is overwritten, starting with a header line.
     * @param regionsFile this file contains a list of regions in the following
     * format: tab delimited text with index 0 as the name of the region, index 1
     * as the longitude, and index 2 as the latitude
     * @throws IllegalArgumentException if {@code outputIndexDir} is not an
     * existing directory
     * @throws FileNotFoundException if one of the input files does not exist
     * @throws Exception if writing the country context file, opening the index
     * or any of the processors fails
     */
    public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII, File usgsDataFile,
            File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile)
            throws Exception {
        // isDirectory() is false for a non-existent path too, so no separate exists() check is needed.
        if (!outputIndexDir.isDirectory()) {
            throw new IllegalArgumentException("outputIndexDir must be a directory.");
        }
        requireExists(geonamesData, "geonames data file does not exist");
        requireExists(geoNamesCountryInfo, "geoNamesCountryCodes data file does not exist");
        requireExists(geonamesAdmin1CodesASCII, "geonamesAdmin1CodesASCII data file does not exist");
        requireExists(usgsDataFile, "usgsDataFile data file does not exist");
        requireExists(usgsGovUnitsFile, "usgsGovUnitsFile data file does not exist");
        requireExists(regionsFile, "regionsFile data file does not exist");

        String indexloc = outputIndexDir.getPath() + INDEX_DIR_SUFFIX;

        IndexWriterConfig config = new IndexWriterConfig(createAnalyzer());
        // Closing the writer after a failure must not publish a half-built index;
        // only the explicit commit() below, reached after all processors succeeded,
        // does. (Commits issued by the processors themselves, if any, are unaffected.)
        config.setCommitOnClose(false);

        // try-with-resources releases the index write lock and the directory even
        // when one of the processors throws.
        try (Directory index = new MMapDirectory(Paths.get(indexloc));
                IndexWriter w = new IndexWriter(index, config)) {

            // Start the country context file from scratch with its column headers.
            try (FileWriter countryContextFileWriter = new FileWriter(outputCountryContextFile, false)) {
                countryContextFileWriter.write(COUNTRY_CONTEXT_HEADER);
            }

            USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);

            GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData,
                    outputCountryContextFile, w);

            RegionProcessor.process(regionsFile, outputCountryContextFile, w);
            w.commit();
        }
        System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '"
                + outputCountryContextFile.getPath() + "' to entitylinker.properties file");
    }

    /**
     * Fails with the given message when the file does not exist.
     */
    private static void requireExists(File file, String message) throws FileNotFoundException {
        if (!file.exists()) {
            throw new FileNotFoundException(message);
        }
    }

    /**
     * Creates the analyzer for the index: a {@link StandardAnalyzer} with an
     * empty stop word set by default, and a {@link KeywordAnalyzer} for each of
     * the {@link #KEYWORD_FIELDS}.
     */
    private static Analyzer createAnalyzer() {
        Analyzer defaultAnalyzer = new StandardAnalyzer(new CharArraySet(new ArrayList<String>(), true));
        Map<String, Analyzer> perField = new HashMap<>();
        for (String field : KEYWORD_FIELDS) {
            perField.put(field, new KeywordAnalyzer());
        }
        return new PerFieldAnalyzerWrapper(defaultAnalyzer, perField);
    }

}