com.berico.clavin.resolver.lucene.LuceneLocationResolver.java Source code

Introduction

Here is the source code for com.berico.clavin.resolver.lucene.LuceneLocationResolver.java
Source

package com.berico.clavin.resolver.lucene;

import static org.apache.lucene.queryparser.classic.QueryParserBase.escape;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import com.berico.clavin.extractor.LocationOccurrence;
import com.berico.clavin.gazetteer.CountryCode;
import com.berico.clavin.index.BinarySimilarity;
import com.berico.clavin.index.WhitespaceLowerCaseAnalyzer;
import com.berico.clavin.resolver.LocationResolver;
import com.berico.clavin.resolver.ResolvedLocation;
import com.berico.clavin.util.ListUtils;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*#####################################################################
 * 
 * CLAVIN (Cartographic Location And Vicinity INdexer)
 * ---------------------------------------------------
 * 
 * Copyright (C) 2012-2013 Berico Technologies
 * http://clavin.bericotechnologies.com
 * 
 * ====================================================================
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 * 
 * ====================================================================
 * 
 * LuceneLocationResolver.java
 * 
 *###################################################################*/

/**
 * Resolves location names into GeoName objects.
 * 
 * Takes location names extracted from unstructured text documents by
 * a {@code LocationExtractor} and resolves them into the appropriate
 * geographic entities (as intended by the document's author based on
 * context) by finding the best match in a gazetteer stored as a
 * Lucene index.
 * 
 * Each instance owns its own index directory, searcher and analyzer;
 * nothing but the immutable sort order and the logger is shared
 * between instances.
 */
public class LuceneLocationResolver implements LocationResolver {

    public static final Logger logger = LoggerFactory.getLogger(LuceneLocationResolver.class);

    // name of the index field that location names are matched against
    private static final String INDEX_NAME_FIELD = "indexName";

    // initial window of Lucene hits considered per location during
    // context-based heuristic matching -- a "magic number" of *3*
    // based on tests of the "Springfield Problem"
    private static final int INITIAL_CANDIDATE_DEPTH = 3;

    // custom Lucene sorting based on Lucene match score and then, in
    // descending order, the population of the GeoNames gazetteer
    // entry represented by the matched index document
    private static final Sort populationSort = new Sort(
            new SortField[] { SortField.FIELD_SCORE, new SortField("population", SortField.Type.LONG, true) });

    // Lucene index built from GeoNames gazetteer
    private final FSDirectory index;
    private final IndexSearcher indexSearcher;
    private final Analyzer indexAnalyzer;

    // maximum number of matches to be fetched from Lucene index
    // (i.e., search depth) -- use a value of 1 to simply retrieve the
    // matching geo entity having the highest population
    private final int maxHitDepth;

    // maximum number of adjacent location names to consider during
    // heuristic matching (i.e., search breadth) -- use a value of 1 to
    // turn off context-based heuristics
    private final int maxContextWindow;

    /**
     * Builds a {@link LuceneLocationResolver} by loading a pre-built Lucene
     * index from disk and setting configuration parameters for
     * resolving location names to GeoName objects.
     * 
     * If any step after opening the index directory fails, the
     * directory and reader opened so far are closed again before the
     * exception is propagated.
     * 
     * @param indexDir            Lucene index directory to be loaded
     * @param maxHitDepth         number of candidate matches to consider
     * @param maxContextWindow    how much context to consider when resolving
     * @throws IOException        if the index cannot be opened or searched
     * @throws ParseException     if the warm-up query cannot be parsed
     */
    public LuceneLocationResolver(File indexDir, int maxHitDepth, int maxContextWindow)
            throws IOException, ParseException {

        this.maxHitDepth = maxHitDepth;
        this.maxContextWindow = maxContextWindow;

        // load the Lucene index directory from disk
        index = FSDirectory.open(indexDir);

        DirectoryReader reader = null;
        boolean initialized = false;
        try {
            // index employs simple lower-casing & tokenizing on whitespace
            indexAnalyzer = new WhitespaceLowerCaseAnalyzer();

            reader = DirectoryReader.open(index);
            indexSearcher = new IndexSearcher(reader);

            // override default TF/IDF score to ignore multiple appearances
            indexSearcher.setSimilarity(new BinarySimilarity());

            // run an initial throw-away query just to "prime the pump" for
            // the cache, so we can accurately measure performance speed
            // per: http://wiki.apache.org/lucene-java/ImproveSearchingSpeed
            search("Reston");

            initialized = true;
        } finally {
            if (!initialized) {
                // don't leave the index files open when construction fails
                closeAfterFailedInit(reader, index);
            }
        }
    }

    /**
     * Closes the resources opened by a constructor call that did not
     * complete. Failures while closing are only logged so they cannot
     * replace the exception that caused the constructor to fail.
     * 
     * @param reader        index reader to close, may be null if it was never opened
     * @param directory     index directory to close
     */
    private static void closeAfterFailedInit(DirectoryReader reader, FSDirectory directory) {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                logger.warn("Unable to close index reader after failed initialization", e);
            }
        }
        try {
            directory.close();
        } catch (IOException e) {
            logger.warn("Unable to close index directory after failed initialization", e);
        }
    }

    /**
     * Parses the query text against the "indexName" field and runs it,
     * returning at most {@link LuceneLocationResolver#maxHitDepth} hits
     * sorted by match score and population.
     * 
     * A new parser is created for every query rather than shared.
     * 
     * @param queryText         query in Lucene classic query syntax
     * @return                  top hits for the query
     * @throws IOException      if the index cannot be searched
     * @throws ParseException   if the query text is not valid query syntax
     */
    private TopDocs search(String queryText) throws IOException, ParseException {
        Query query = new AnalyzingQueryParser(Version.LUCENE_40, INDEX_NAME_FIELD, indexAnalyzer)
                .parse(queryText);
        return indexSearcher.search(query, null, maxHitDepth, populationSort);
    }

    /**
     * Finds all matches (capped at {@link LuceneLocationResolver#maxHitDepth})
     * in the Lucene index for a given location name.
     * 
     * An exact phrase match is attempted first; only if that finds
     * nothing and fuzzy matching is enabled is a fuzzy query run. A
     * missing or blank location name yields an empty list without
     * querying the index.
     * 
     * @param locationName      name of the geographic location to be resolved
     * @param fuzzy             switch for turning on/off fuzzy matching
     * @return                  list of ResolvedLocation objects as potential matches, possibly empty
     * @throws IOException      if the index cannot be searched
     * @throws ParseException   if the sanitized name still cannot be parsed as a query
     */
    private List<ResolvedLocation> getCandidateMatches(LocationOccurrence locationName, boolean fuzzy)
            throws IOException, ParseException {

        // initialize the return object
        List<ResolvedLocation> candidateMatches = new ArrayList<ResolvedLocation>();

        // nothing to look up -- a blank name cannot match anything, and
        // the fuzzy fallback would hand the parser a bare "~"
        if (locationName == null || locationName.text == null || locationName.text.trim().isEmpty()) {
            logger.debug("No match found for: '{}'", locationName);
            return candidateMatches;
        }

        // sanitize the query input; lower-casing with Locale.ROOT keeps
        // the result independent of the JVM's default locale (e.g. a
        // Turkish default locale would turn "I" into a dotless "ı")
        String sanitizedLocationName = escape(locationName.text.toLowerCase(Locale.ROOT));

        try {
            // exact phrase match on the "indexName" field, collecting
            // hits up to maxHitDepth sorted by Lucene match score and
            // population for the associated GeoNames record
            TopDocs results = search("\"" + sanitizedLocationName + "\"");
            boolean fuzzyMatch = false;

            if (results.scoreDocs.length == 0 && fuzzy) {
                // no exact String matches found -- fallback to fuzzy search

                // Using the tilde "~" makes this a fuzzy search. I compared this to FuzzyQuery
                // with TopTermsBoostOnlyBooleanQueryRewrite, I like the output better this way.
                // With the other method, we failed to match things like "Straenhaus Airport"
                // as <Straenhaus>, and the match scores didn't make as much sense.
                results = search(sanitizedLocationName + "~");
                fuzzyMatch = true;
            }

            if (results.scoreDocs.length == 0) {
                // drats, foiled again! no matches found (and either no
                // fuzzy matches, or fuzzy matching is turned off), so
                // we'll return an empty list of candidate matches
                logger.debug("No match found for: '{}'", locationName);
                return candidateMatches;
            }

            // add each matching location to the list of candidates
            for (int i = 0; i < results.scoreDocs.length; i++) {
                ResolvedLocation location = new ResolvedLocation(indexSearcher.doc(results.scoreDocs[i].doc),
                        locationName, fuzzyMatch);
                if (fuzzyMatch) {
                    logger.debug("{}{fuzzy}", location);
                } else {
                    logger.debug("{}", location);
                }
                candidateMatches.add(location);
            }

            return candidateMatches;

        } catch (ParseException e) {
            logger.error(String.format("Error resolving location for : '%s'", locationName), e);
            throw e;
        } catch (IOException e) {
            logger.error(String.format("Error resolving location for : '%s'", locationName), e);
            throw e;
        }
    }

    /**
     * Uses heuristics to select the best match for each location name
     * extracted from a document, choosing from among a list of lists
     * of candidate matches.
     * 
     * Although not guaranteeing an optimal solution (enumerating &
     * evaluating each possible combination is too costly), it does a
     * decent job of cracking the "Springfield Problem" by selecting
     * candidates that would make sense to appear together based on
     * common country and admin1 codes (i.e., states or provinces).
     * 
     * For example, if we also see "Boston" mentioned in a document 
     * that contains "Springfield," we'd use this as a clue that we
     * ought to choose Springfield, MA over Springfield, IL or
     * Springfield, MO.
     * 
     * The search starts by looking at the top 3 candidates of every
     * location and goes one candidate deeper per round for as long as
     * a round produces a strictly better score than the round before.
     * 
     * TODO: consider lat/lon distance in addition to shared
     *        CountryCodes and Admin1Codes.
     * 
     * @param allCandidates   list of lists of candidate matches for locations names
     * @return                list of best matches for each location name, empty if there are no candidates
     */
    private List<ResolvedLocation> pickBestCandidates(List<List<ResolvedLocation>> allCandidates) {

        // initialize return object
        List<ResolvedLocation> bestCandidates = new ArrayList<ResolvedLocation>();

        // nothing to choose from (also avoids a 0/0 score below)
        if (allCandidates.isEmpty()) {
            return bestCandidates;
        }

        // best score seen so far, and best score before the current round
        float newMaxScore = 0;
        float oldMaxScore;

        // controls window of Lucene hits for each location considered
        int candidateDepth = INITIAL_CANDIDATE_DEPTH;

        // keep searching deeper & deeper for better combinations of
        // candidate matches, as long as the scores are improving
        do {
            // reset the threshold for the next round
            oldMaxScore = newMaxScore;

            // loop through all combinations up to the specified depth.
            // first recursive call for each depth starts at index 0
            for (List<ResolvedLocation> combo : generateAllCombos(allCandidates, 0, candidateDepth)) {
                // distinct country codes & admin1 codes used by this combination
                Set<CountryCode> countries = new HashSet<CountryCode>();
                Set<String> states = new HashSet<String>();
                for (ResolvedLocation location : combo) {
                    countries.add(location.geoname.primaryCountryCode);
                    states.add(location.geoname.admin1Code);
                }

                // calculate a score for this particular combination based on commonality
                // of country codes & admin1 codes, and the cost of searching this deep
                // TODO: tune this score calculation!
                float score = ((float) allCandidates.size() / (countries.size() + states.size()))
                        / candidateDepth;

                /* *********************************************************** 
                 * "So, at last we meet for the first time for the last time."
                 * 
                 * The fact that you're interested enough in CLAVIN to be
                 * reading this means we're interested in talking with you.
                 * 
                 * Are you looking for a job, or are you in need of a
                 * customized solution built around CLAVIN?
                 * 
                 * Drop us a line at clavin@bericotechnologies.com
                 * 
                 * "What's the matter, Colonel Sandurz? CHICKEN?"
                 * **********************************************************/

                // if this is the best we've seen so far, update the return value
                if (score > newMaxScore) {
                    newMaxScore = score;
                    bestCandidates = combo;
                }
            }

            // search one level deeper in the next loop
            candidateDepth++;

        } while (newMaxScore > oldMaxScore);
        // keep searching while the best score keeps strictly increasing

        return bestCandidates;
    }

    /**
     * Recursive helper function for
     * {@link LuceneLocationResolver#pickBestCandidates(List)}.
     * 
     * Generates all combinations of candidate matches from each
     * location, down to the specified depth through the lists. Each
     * combination holds exactly one candidate per location, in the
     * same order as the locations appear in {@code allCandidates}.
     * 
     * Adapted from:
     * http://www.daniweb.com/software-development/java/threads/177956/generating-all-possible-combinations-from-list-of-sublists#post882553
     * 
     * @param allCandidates   list of lists of candidate matches for all location names
     * @param index           keeps track of which location we're working on for recursive calls
     * @param depth           max depth into list we're searching during this recursion
     * @return                all combinations of candidate matches for each location, down to the specified depth
     */
    private List<List<ResolvedLocation>> generateAllCombos(List<List<ResolvedLocation>> allCandidates, int index,
            int depth) {

        // initialize return object
        List<List<ResolvedLocation>> result = new ArrayList<List<ResolvedLocation>>();

        // stopping condition: past the last location, the only
        // combination is the empty one
        if (index == allCandidates.size()) {
            result.add(new ArrayList<ResolvedLocation>());
            return result;
        }

        // combinations for all locations after this one
        List<List<ResolvedLocation>> tails = generateAllCombos(allCandidates, index + 1, depth);

        // for each candidate of this location, up to depth or list size
        List<ResolvedLocation> candidates = allCandidates.get(index);
        int limit = Math.min(candidates.size(), depth);
        for (int j = 0; j < limit; j++) {
            ResolvedLocation head = candidates.get(j);
            // prepend the candidate to every combination of the remaining locations
            for (List<ResolvedLocation> tail : tails) {
                List<ResolvedLocation> combo = new ArrayList<ResolvedLocation>(tail.size() + 1);
                combo.add(head);
                combo.addAll(tail);
                result.add(combo);
            }
        }

        return result;
    }

    /**
     * Resolves the supplied list of location names into
     * {@link ResolvedLocation}s containing {@code GeoName} objects.
     * 
     * Calls {@link LuceneLocationResolver#getCandidateMatches(LocationOccurrence, boolean)} on
     * each location name to find all possible matches. If more than
     * one hit per name is fetched (maxHitDepth &gt; 1), heuristics then
     * select the best match for each by calling
     * {@link LuceneLocationResolver#pickBestCandidates(List)} on chunks
     * of at most maxContextWindow names; otherwise the top-sorted hit
     * is used directly. Location names without any match are left out
     * of the result.
     * 
     * @param locations       list of location names to be resolved
     * @param fuzzy           switch for turning on/off fuzzy matching
     * @return                list of {@link ResolvedLocation} objects, empty if locations is null
     * @throws ParseException if a location name cannot be turned into a valid query
     * @throws IOException    if the index cannot be searched
     **/
    @Override
    public List<ResolvedLocation> resolveLocations(List<LocationOccurrence> locations, boolean fuzzy)
            throws IOException, ParseException {

        // forgetting something?
        if (locations == null) {
            return new ArrayList<ResolvedLocation>();
        }

        if (maxHitDepth > 1) { // perform context-based heuristic matching

            // stores all possible matches for each location name
            List<List<ResolvedLocation>> allCandidates = new ArrayList<List<ResolvedLocation>>();

            // loop through all the location names
            for (LocationOccurrence location : locations) {
                // get all possible matches
                List<ResolvedLocation> candidates = getCandidateMatches(location, fuzzy);

                // if we found some possible matches, save them
                if (!candidates.isEmpty()) {
                    allCandidates.add(candidates);
                }
            }

            // initialize return object
            List<ResolvedLocation> bestCandidates = new ArrayList<ResolvedLocation>();

            // split-up allCandidates into reasonably-sized chunks to
            // limit computational load when heuristically selecting
            // the best matches
            for (List<List<ResolvedLocation>> theseCandidates : ListUtils.chunkifyList(allCandidates,
                    maxContextWindow)) {
                // select the best match for each location name based
                // on heuristics
                bestCandidates.addAll(pickBestCandidates(theseCandidates));
            }

            return bestCandidates;
        }

        // use no heuristics, simply choose matching location with greatest population

        // initialize return object
        List<ResolvedLocation> resolvedLocations = new ArrayList<ResolvedLocation>();

        // loop through all the location names
        for (LocationOccurrence location : locations) {
            // choose the top-sorted candidate for each individual
            // location name
            List<ResolvedLocation> candidateLocations = getCandidateMatches(location, fuzzy);

            // if a match was found, add it to the return list
            if (!candidateLocations.isEmpty()) {
                resolvedLocations.add(candidateLocations.get(0));
            }
        }

        return resolvedLocations;
    }

}