com.browseengine.local.service.geocode.RangeSearcher.java Source code

Java tutorial

Introduction

Here is the source code for com.browseengine.local.service.geocode.RangeSearcher.java

Source

/**
 * Bobo Browse Engine - High performance faceted/parametric search implementation 
 * that handles various types of semi-structured data.  Written in Java.
 * 
 * Copyright (C) 2006  Spackle
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * To contact the project administrators for the bobo-browse project, 
 * please go to https://sourceforge.net/projects/bobo-browse/.
 */

package com.browseengine.local.service.geocode;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;

import com.browseengine.local.service.index.RangesFields;

/**
 * @author spackle
 *
 */
public class RangeSearcher {
    private static final Logger LOGGER = Logger.getLogger(RangeSearcher.class);

    private Searcher _searcher; // TLID,isLeft,isNumeric,from,to,zip5
    private IndexReader _reader;
    private int _maxDoc;
    private BitSet _isLeft;
    private BitSet _isNumeric;

    public RangeSearcher(File f) throws IOException {
        try {
            _reader = IndexReader.open(f);
            _searcher = new IndexSearcher(_reader);
            _maxDoc = _reader.maxDoc();
            _isLeft = loadBitField(RangesFields.LEFT.getField());
            _isNumeric = loadBitField(RangesFields.IS_NUMERIC.getField());
        } catch (IOException ioe) {
            try {
                close();
            } catch (IOException ioe2) {
                LOGGER.warn("trouble closing NameSearcher, after trouble opening it", ioe2);
            }
            throw ioe;
        }
    }

    public synchronized void close() throws IOException {
        try {
            if (_searcher != null) {
                _searcher.close();
            }
        } finally {
            try {
                if (_reader != null) {
                    _reader.close();
                }
            } finally {
                _searcher = null;
                _reader = null;
            }
        }
    }

    private BitSet loadBitField(String fieldName) throws IOException {
        Term term = new Term(fieldName, "1");
        TermDocs termDocs = null;
        try {
            BitSet bits = new BitSet(_maxDoc);
            termDocs = _reader.termDocs(term);
            while (termDocs.next()) {
                bits.set(termDocs.doc());
            }
            return bits;
        } finally {
            if (termDocs != null) {
                termDocs.close();
            }
        }
    }

    private int getZip5(Document doc) {
        if (doc != null) {
            String tmp = doc.get(RangesFields.ZIP5.getField());
            if (tmp != null) {
                Matcher m = IS_NUMERIC.matcher(tmp);
                if (m.matches()) {
                    return Integer.parseInt(tmp);
                }
            }
        }
        return -1;
    }

    private int getTLID(int docid) throws IOException {
        Document doc = _reader.document(docid);
        return Integer.parseInt(doc.get(RangesFields.TLID.getField()));
    }

    /**
     * assumes: matches.tlidSideMap.size() > 0.
     * 
     * @param matches
     * @param streetNumber
     * @param zip5
     * @return
     * @throws IOException
     */
    public RangeMatch narrowToRange(PlaceMatch matches, String streetNumber, int zip5)
            throws IOException, GeoCodingException {
        TermDocs termDocs = null;
        try {
            termDocs = _reader.termDocs();
            if (streetNumber == null || streetNumber.length() <= 0) {
                // no street number was specified as part of the input
                return takeAnyRange(termDocs, matches, zip5);
            } else {
                return findBestRange(termDocs, matches, streetNumber, zip5);
            }
        } finally {
            try {
                if (termDocs != null) {
                    termDocs.close();
                }
            } catch (IOException ioe) {
                LOGGER.warn("trouble closing termdocs at end of narrow to range: " + ioe, ioe);
            }
        }
    }

    /**
     * we don't support numeric addresses beyond 999999999.
     */
    public static final Pattern IS_NUMERIC = Pattern.compile("\\A(\\d{1,9})\\z");
    public static final Pattern HAS_NUMERIC = Pattern.compile("(\\d{1,9})");

    private RangeMatch findBestRange(TermDocs termDocs, PlaceMatch matches, String streetNumber, int zip5)
            throws IOException, GeoCodingException {
        boolean isNumeric = false;
        int target = -1;
        Matcher m = IS_NUMERIC.matcher(streetNumber);
        if (m.matches()) {
            isNumeric = true;
            target = Integer.parseInt(streetNumber);
        }
        if (isNumeric) {
            // try to do it numerically 
            Integer[] tlids = getSortedTlids(matches);
            for (Integer tlid : tlids) {
                Term term = new Term(RangesFields.TLID.getField(), tlid.toString());
                termDocs.seek(term);
                while (termDocs.next()) {
                    int docid = termDocs.doc();
                    if (_isNumeric.get(docid)) {
                        TermFreqVector frVec = _reader.getTermFreqVector(docid, RangesFields.FROM.getField());
                        TermFreqVector toVec = _reader.getTermFreqVector(docid, RangesFields.TO.getField());
                        if (frVec != null && frVec.size() > 0 && toVec != null && toVec.size() > 0) {
                            int fr = Integer.parseInt(frVec.getTerms()[0]);
                            int to = Integer.parseInt(toVec.getTerms()[0]);
                            if (fr < to) {
                                if (fr <= target && target <= to) {
                                    // hit!
                                    Document doc = _reader.document(docid);
                                    RangeMatch match = new RangeMatch();
                                    match.lrboth = (_isLeft.get(docid) ? PlaceMatch.LEFT : PlaceMatch.RIGHT);
                                    match.number = "" + target;
                                    match.tlid = tlid;
                                    match.toPercent = (fr < to ? ((float) target - fr) / ((float) to - fr) : 0f);
                                    match.zip5 = getZip5(doc);
                                    return match;
                                }
                            } else {
                                if (to <= target && target <= fr) {
                                    // hit!
                                    Document doc = _reader.document(docid);
                                    RangeMatch match = new RangeMatch();
                                    match.lrboth = (_isLeft.get(docid) ? PlaceMatch.LEFT : PlaceMatch.RIGHT);
                                    match.number = "" + target;
                                    match.tlid = tlid;
                                    match.toPercent = (fr > to ? ((float) target - to) / ((float) fr - to) : 0f);
                                    match.zip5 = getZip5(doc);
                                    return match;
                                }
                            }
                        }
                    }
                }
            }
        }
        // pure numeric search didn't work--try to match the numeric _portion_

        // TODO: what to do if we're about to return null?
        // for now, just return any range
        // TODO: in the future, keep track of which of the above was the 
        // closest numerically, and return that one instead.
        // TODO: we probably want an indicator to the client, that we are taking a range 
        // that we didn't necessarily find exactly
        return takeAnyRange(termDocs, matches, zip5);
    }

    private Integer[] merge(Integer[] tlids, PlaceMatch matches, Integer[] zipTlids, PlaceMatch zipMatches) {
        List<Integer> retList = new ArrayList<Integer>();
        int i = 0;
        int j = 0;
        while (i < tlids.length && j < zipTlids.length) {
            if (tlids[i] == zipTlids[j]) {
                short rlboth = matches.tlidSideMap.get(tlids[i]);
                short zipRlboth = zipMatches.tlidSideMap.get(zipTlids[j]);
                if (rlboth == PlaceMatch.BOTH || zipRlboth == PlaceMatch.BOTH || rlboth == zipRlboth) {
                    retList.add(tlids[i]);
                }
                i++;
                j++;
            } else if (tlids[i] < zipTlids[j]) {
                // advance i
                i++;
            } else {
                // advance j
                j++;
            }
        }
        if (retList.size() > 0) {
            Integer[] ret = new Integer[retList.size()];
            retList.toArray(ret);
            return ret;
        } else {
            return null;
        }
    }

    private Integer[] getSortedTlids(PlaceMatch matches) {
        Set<Integer> keys = matches.tlidSideMap.keySet();
        Integer[] tlids = new Integer[keys.size()];
        keys.toArray(tlids);
        Arrays.sort(tlids);
        return tlids;
    }

    /**
     * no range specified, take the lowest.
     * 
     * @param termDocs
     * @param matches
     * @param zip5
     * @return
     * @throws IOException
     * @throws GeoCodingException
     */
    private RangeMatch takeAnyRange(TermDocs termDocs, PlaceMatch matches, int zip5)
            throws IOException, GeoCodingException {
        Integer[] tlids = getSortedTlids(matches);

        // match with zip also if possible
        if (zip5 > 0 && zip5 <= 99999) {
            PlaceMatch zipMatches = lookupZip(zip5);
            if (zipMatches != null && zipMatches.tlidSideMap.size() > 0) {
                Integer[] zipTlids = getSortedTlids(zipMatches);

                Integer[] tmp = merge(tlids, matches, zipTlids, zipMatches);
                tlids = (tmp != null && tmp.length > 0 ? tmp : tlids);
            }
        }

        int minNumeric = Integer.MAX_VALUE;
        int minDocid = -1;
        double minLon = Double.MAX_VALUE;
        double minLat = Double.MAX_VALUE;
        boolean foundNumeric = false; // numeric is preferred
        boolean from = true;
        String minAlpha = new String("zzz");
        // the desired result is the lowest numeric address
        // failing that, the desired result is the lowest alpha address
        for (int i = 0; i < tlids.length; i++) {
            Term term = new Term(RangesFields.TLID.getField(), "" + tlids[i]);
            termDocs.seek(term);
            while (termDocs.next()) {
                int docid = termDocs.doc();
                TermFreqVector frVec = _reader.getTermFreqVector(docid, RangesFields.FROM.getField());
                TermFreqVector toVec = _reader.getTermFreqVector(docid, RangesFields.TO.getField());
                if (frVec != null && frVec.size() > 0 && toVec != null && toVec.size() > 0) {
                    if (_isNumeric.get(docid)) {
                        int fr = Integer.parseInt(frVec.getTerms()[0]);
                        int to = Integer.parseInt(toVec.getTerms()[0]);
                        if (fr < to) {
                            if (fr < minNumeric) {
                                minNumeric = fr;
                                minDocid = docid;
                                from = true;
                                foundNumeric = true;
                            }
                        } else {
                            if (to < minNumeric) {
                                minNumeric = to;
                                minDocid = docid;
                                from = false;
                                foundNumeric = true;
                            }
                        }
                    } else if (!foundNumeric) {
                        String fr = frVec.getTerms()[0];
                        String to = toVec.getTerms()[0];
                        if (fr.compareTo(to) < 0) {
                            if (fr.compareTo(minAlpha) < 0) {
                                minAlpha = fr;
                                minDocid = docid;
                                from = true;
                            }
                        } else {
                            if (to.compareTo(minAlpha) < 0) {
                                minAlpha = to;
                                minDocid = docid;
                                from = false;
                            }
                        }
                    }
                }
            }
            if (minDocid >= 0) {
                // we have a winner!
                Document doc = _reader.document(minDocid);
                RangeMatch match = new RangeMatch();
                match.tlid = tlids[i];
                match.lrboth = (_isLeft.get(minDocid) ? PlaceMatch.LEFT : PlaceMatch.RIGHT);//matches.tlidSideMap.get(match.tlid);
                match.number = (foundNumeric ? "" + minNumeric : minAlpha);
                // we will be interpolating, so we should go a tiny bit from the intersetion
                match.toPercent = (from ? 0.01f : 0.99f);
                match.zip5 = getZip5(doc);
                return match;
            }
        }

        // TODO: what to do if we're about to return null?
        // repeat without a zip5
        if (zip5 > 0) {
            return takeAnyRange(termDocs, matches, -1);
        } else {
            return null;
        }
    }

    public PlaceMatch lookupZip(int zip5) throws GeoCodingException, IOException {
        if (zip5 <= 0 || zip5 > 99999) {
            LOGGER.warn("can't lookup a zip code out of range: " + zip5 + "; returning empty results");
            return new PlaceMatch();
        }
        TermDocs termDocs = null;
        try {
            PlaceMatch match = new PlaceMatch();
            Term term = new Term(RangesFields.ZIP5.getField(), "" + zip5);
            termDocs = _reader.termDocs(term);
            while (termDocs.next()) {
                int docid = termDocs.doc();
                boolean isLeft = _isLeft.get(docid);
                int tlid = getTLID(docid);
                Short val = match.tlidSideMap.get(tlid);
                if (val != null) {
                    short lrboth = PlaceMatch.getLRBoth(val);
                    if (isLeft) {
                        if (lrboth == PlaceMatch.RIGHT) {
                            val = PlaceMatch.setLRBoth(val, PlaceMatch.BOTH);
                            match.tlidSideMap.put(tlid, val);
                        }
                    } else if (lrboth == PlaceMatch.LEFT) {
                        val = PlaceMatch.setLRBoth(val, PlaceMatch.BOTH);
                        match.tlidSideMap.put(tlid, val);
                    }
                } else {
                    val = PlaceMatch.setLRBoth((short) 0, (isLeft ? PlaceMatch.LEFT : PlaceMatch.RIGHT));
                    match.tlidSideMap.put(tlid, val);
                }
            }
            return match;
        } finally {
            try {
                if (termDocs != null) {
                    termDocs.close();
                }
            } catch (IOException ioe) {
                LOGGER.warn("trouble closing term docs during lookup zip: " + ioe, ioe);
            }
        }
    }
}