solr2155.lucene.spatial.geohash.GeoHashPrefixFilter.java Source code

Introduction

Here is the source code for solr2155.lucene.spatial.geohash.GeoHashPrefixFilter.java.
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package solr2155.lucene.spatial.geohash;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;
import solr2155.lucene.TermsEnumCompatibility;
import solr2155.lucene.spatial.geometry.shape.Geometry2D;
import solr2155.lucene.spatial.geometry.shape.IntersectCase;
import solr2155.lucene.spatial.geometry.shape.Point2D;

import java.io.IOException;
import java.util.LinkedList;

/**
 * Performs a spatial filter against a field indexed using Geohashes. Using the hierarchical grid nature of geohashes,
 * this filter recursively traverses each precision length and uses methods on {@link Geometry2D} to efficiently know
 * that all points at a geohash fit in the shape or not to either short-circuit unnecessary traversals or to efficiently
 * load all enclosed points.
 */
public class GeoHashPrefixFilter extends Filter {

    /**
     * A node whose geohash length is at most this many characters short of {@code gridReferenceSystem.maxLen} is
     * scanned term-by-term instead of being subdivided into sub-nodes. Must be &gt;= 1.
     */
    private static final int GRIDLEN_SCAN_THRESHOLD = 4;//>= 1
    private final String fieldName;//interned
    private final Geometry2D geoShape;
    private final GridNode.GridReferenceSystem gridReferenceSystem;

    /**
     * Creates a filter matching documents whose geohash terms in {@code fieldName} fall within {@code geoShape}.
     *
     * @param fieldName           name of the indexed geohash field; it is interned, so it must not be null
     * @param geoShape            the query shape that indexed points are tested against
     * @param gridReferenceSystem supplies the grid nodes to traverse, the maximum geohash length and point decoding
     * @throws NullPointerException if {@code fieldName} is null
     */
    public GeoHashPrefixFilter(String fieldName, Geometry2D geoShape,
            GridNode.GridReferenceSystem gridReferenceSystem) {
        this.fieldName = fieldName.intern();
        this.geoShape = geoShape;
        this.gridReferenceSystem = gridReferenceSystem;
    }

    /**
     * Builds the set of documents having at least one indexed geohash term that lies within the query shape.
     * <p>
     * The grid nodes covering the shape's bounding rectangle are visited depth-first. A node outside the shape is
     * skipped, a node fully contained by the shape contributes the documents of its own term, and any other
     * intersecting node is either subdivided or, once close enough to the maximum geohash length, has its leaf terms
     * tested against the shape one by one.
     *
     * @param reader the index reader to collect matching documents from
     * @return a bit set, sized to {@code reader.maxDoc()}, with a bit set for every matching document
     * @throws IOException if reading terms or postings from the index fails
     */
    @Override
    public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
        final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
        final TermsEnumCompatibility termsEnum = new TermsEnumCompatibility(reader, fieldName);//Lucene 4 compatibility wrapper
        Term term = termsEnum.term();//the most recent term examined via termsEnum.term()
        if (term == null) {
            return bits;
        }

        //TODO Add a precision short-circuit so that we are not accurate on the edge but we're faster.

        //Acquired only after the early return above, and always released in the finally block below, so that the
        // postings enumerator is never leaked -- neither on normal completion nor when an IOException propagates.
        final TermDocs termDocs = reader.termDocs();
        try {
            //TODO An array based nodes impl would be more efficient; or a stack of iterators.  LinkedList conveniently has bulk add to beginning.
            LinkedList<GridNode> nodes = new LinkedList<GridNode>(
                    gridReferenceSystem.getSubNodes(geoShape.boundingRectangle()));
            while (!nodes.isEmpty() && term != null) {
                final GridNode node = nodes.removeFirst();
                assert node.length() > 0;
                if (!node.contains(term.text()) && node.before(term.text())) {
                    continue;//short circuit, moving >= the next indexed term
                }
                IntersectCase intersection = geoShape.intersect(node.getRectangle());
                if (intersection == IntersectCase.OUTSIDE) {
                    continue;
                }
                TermsEnumCompatibility.SeekStatus seekStat = termsEnum.seek(node.getTermVal());
                //Refresh the current term even when the seek did not hit exactly; the short-circuit above relies on it.
                term = termsEnum.term();
                if (seekStat != TermsEnumCompatibility.SeekStatus.FOUND) {
                    continue;
                }
                if (intersection == IntersectCase.CONTAINS) {
                    //The whole node is inside the shape: every document indexed under the node's term matches.
                    termDocs.seek(term);
                    addDocs(termDocs, bits);
                    term = termsEnum.next();//move to next term
                } else {//any other intersection
                    //TODO is it worth it to optimize the shape (e.g. potentially simpler polygon)?
                    //GeoShape geoShape = this.geoShape.optimize(intersection);

                    //We either scan through the leaf node(s), or if there are many points then we divide & conquer.
                    boolean manyPoints = node.length() < gridReferenceSystem.maxLen - GRIDLEN_SCAN_THRESHOLD;

                    //TODO Try variable depth strategy:
                    //IF configured to do so, we could use term.freq() as an estimate on the number of places at this depth.  OR, perhaps
                    //  make estimates based on the total known term count at this level?  Or don't worry about it--use fixed depth.
                    //        if (manyPoints) {
                    //          //Make some estimations on how many points there are at this level and how few there would need to be to set
                    //          // manyPoints to false.
                    //
                    //          long termsThreshold = (long) estimateNumberIndexedTerms(node.length(),geoShape.getDocFreqExpenseThreshold(node));
                    //
                    //          long thisOrd = termsEnum.ord();
                    //          manyPoints = (termsEnum.seek(thisOrd+termsThreshold+1) != TermsEnum.SeekStatus.END
                    //                  && node.contains(termsEnum.term()));
                    //          termsEnum.seek(thisOrd);//return to last position
                    //        }

                    if (!manyPoints) {
                        //traverse all leaf terms within this node to see if they are within the geoShape, one by one.
                        for (; term != null && node.contains(term.text()); term = termsEnum.next()) {
                            if (term.text().length() < gridReferenceSystem.maxLen) {//not a leaf
                                continue;
                            }
                            final Point2D point = gridReferenceSystem.decodeXY(term.text());
                            //Filter those out of the shape.
                            if (!geoShape.contains(point)) {
                                continue;
                            }

                            //record
                            termDocs.seek(term);
                            addDocs(termDocs, bits);
                        }
                    } else {
                        //divide & conquer
                        nodes.addAll(0, node.getSubNodes());//add to beginning
                    }
                }
            } //node loop
        } finally {
            termDocs.close();
        }

        return bits;
    }

    //  double estimateNumberIndexedTerms(int nodeLen,double points) {
    //    return 1000;
    //    double levelProb = probabilityNumNodes[points];// [1,32)
    //    if (nodeLen < geohashLength)
    //      return levelProb + levelProb * estimateNumberIndexedTerms(nodeLen+1,points/levelProb);
    //    return levelProb;
    //  }

    /**
     * Sets the bit of every document remaining in {@code termDocs}.
     *
     * @param termDocs postings enumerator, already positioned on the term of interest by the caller
     * @param bits     the bit set receiving the document ids
     * @throws IOException if advancing the enumerator fails
     */
    private void addDocs(TermDocs termDocs, OpenBitSet bits) throws IOException {
        while (termDocs.next()) {
            bits.set(termDocs.doc());
        }
    }

    @Override
    public String toString() {
        return "GeoFilter{fieldName='" + fieldName + '\'' + ", shape=" + geoShape + '}';
    }

    /**
     * Two filters are equal when they are of the same class and have equal field names and equal shapes.
     * <p>
     * NOTE(review): {@code gridReferenceSystem} does not take part in equality or hashing -- confirm that filters
     * differing only in their grid reference system are meant to be interchangeable (e.g. as cache keys).
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        GeoHashPrefixFilter that = (GeoHashPrefixFilter) o;

        //fieldName is never null: the constructor dereferences it via intern().
        if (!fieldName.equals(that.fieldName)) {
            return false;
        }
        if (geoShape != null ? !geoShape.equals(that.geoShape) : that.geoShape != null) {
            return false;
        }

        return true;
    }

    /** Hash code consistent with {@link #equals(Object)}: combines the field name and the shape. */
    @Override
    public int hashCode() {
        int result = fieldName.hashCode();
        result = 31 * result + (geoShape != null ? geoShape.hashCode() : 0);
        return result;
    }

}