opennlp.addons.geoentitylinker.indexing.RegionProcessor.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.addons.geoentitylinker.indexing.RegionProcessor.java

Source

/*
 * Copyright 2014 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.addons.geoentitylinker.indexing;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;

/**
 * Reads a tab-delimited regions gazetteer file, adds one Lucene document per
 * region to an (optional) index, and appends one line per region to the
 * country context file.
 */
public class RegionProcessor {

    /** Minimum number of tab-separated columns a data row needs: name, longitude, latitude. */
    private static final int MIN_COLUMNS = 3;

    /** Placeholder written into country context columns that have no value for a region. */
    private static final String NO_DATA = "NO_DATA_FOUND";

    /**
     * Runs the processor without an index writer, so only the country context
     * file is written.
     *
     * @param args optional; {@code args[0]} is the regions file and
     * {@code args[1]} the country context output file. Missing arguments fall
     * back to the original hard-coded paths under {@code C:\temp\gazetteers}.
     */
    public static void main(String[] args) {
        String regionsPath = (args != null && args.length > 0)
                ? args[0] : "C:\\temp\\gazetteers\\regions.txt";
        String contextPath = (args != null && args.length > 1)
                ? args[1] : "C:\\temp\\gazetteers\\testRegionContext.txt";
        RegionProcessor.process(new File(regionsPath), new File(contextPath), null);
    }

    /**
     * Processes the regions file, reporting any failure on standard error
     * instead of propagating it to the caller.
     *
     * @param regionsFile the file that stores Region references. The format of
     * this file is tab delimited text with index 0 as the name of the region,
     * index 1 as the longitude, and index 2 as the latitude. The first line is
     * skipped.
     * @param outputCountryContextfile the country context file shared by all
     * indexing processors; entries are appended to it
     * @param w the index writer that receives the region documents; may be
     * {@code null}, in which case nothing is indexed
     */
    public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
        try {
            readFile(regionsFile, outputCountryContextfile, w);
        } catch (Exception ex) {
            // Deliberately not rethrown: callers of process() do not expect an exception.
            ex.printStackTrace();
        }
    }

    /**
     * Reads every data row of the gazetteer file, indexes it when a writer is
     * supplied, commits the writer, and then appends the collected country
     * context lines to the output file.
     *
     * <p>The first line of the input is skipped. Rows with fewer than three
     * tab-separated columns are skipped with a warning on standard error rather
     * than aborting the whole run. Region ids are {@code "rg"} followed by the
     * zero-based line number of the row, so skipped rows do not renumber the
     * rows that follow. Both files are read and written with the platform
     * default charset.
     *
     * @param gazateerInputData the tab delimited regions file
     * @param outputCountryContextfile the country context file to append to
     * @param w the index writer, or {@code null} to skip indexing
     * @throws Exception if reading, indexing, committing or writing fails
     */
    public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w)
            throws Exception {
        List<String> ccfileentries = new ArrayList<>();
        System.out.println("reading gazetteer data from Regions file...........");

        // try-with-resources so the input handle is released even when a row fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
            int counter = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                // Line 0 is skipped; counter advances on every line so ids track line numbers.
                if (counter > 0) {
                    // split() drops trailing empty columns, so short rows must be checked for.
                    String[] values = line.split("\t");
                    if (values.length < MIN_COLUMNS) {
                        System.err.println("skipping malformed Regions line " + (counter + 1)
                                + ": expected at least " + MIN_COLUMNS
                                + " tab-separated columns but found " + values.length);
                    } else {
                        String placeName = values[0];
                        String lon = values[1];
                        String lat = values[2];
                        String id = "rg" + counter;

                        ccfileentries.add(buildCountryContextEntry(id, placeName));
                        if (w != null) {
                            w.addDocument(buildDocument(id, placeName, lat, lon));
                        }
                    }
                }
                counter++;
            }
        }

        if (w != null) {
            w.commit();
        }

        // Append mode: the context file is shared with the other indexing processors.
        try (FileWriter writer = new FileWriter(outputCountryContextfile, true)) {
            for (String entry : ccfileentries) {
                writer.write(entry);
            }
        }
        System.out.println("successfully wrote Region entries to country oontext file");
        System.out.println("Completed indexing regions!");
    }

    /** Builds the Lucene document for one region; the region id doubles as its country code. */
    private static Document buildDocument(String id, String placeName, String lat, String lon) {
        Document doc = new Document();
        doc.add(new TextField("hierarchy", placeName, Field.Store.YES));
        doc.add(new TextField("placename", placeName, Field.Store.YES));
        doc.add(new StringField("latitude", lat, Field.Store.YES));
        doc.add(new StringField("longitude", lon, Field.Store.YES));
        doc.add(new StringField("loctype", "region", Field.Store.YES));
        doc.add(new StringField("admincode", "", Field.Store.YES));
        doc.add(new StringField("countrycode", id, Field.Store.YES));
        doc.add(new StringField("countycode", "", Field.Store.YES));
        doc.add(new StringField("locid", id, Field.Store.YES));
        doc.add(new StringField("gazsource", "region", Field.Store.YES));
        return doc;
    }

    /**
     * Builds the newline-terminated country context line for one region.
     * countrycontext file format example:
     * US   KY   131   United States   Kentucky   Leslie
     */
    private static String buildCountryContextEntry(String id, String placeName) {
        return id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + NO_DATA + "\t"
                + NO_DATA + "\t" + "(" + placeName + ")" + "\t" + NO_DATA + "\t"
                + NO_DATA + "\n";
    }

}