org.apache.lucene.search.spell.LuceneLevenshteinDistance.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.search.spell.LuceneLevenshteinDistance.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.spell;

import org.apache.lucene.util.IntsRef;

/**
 *  Damerau-Levenshtein (optimal string alignment) implemented in a consistent 
 *  way as Lucene's FuzzyTermsEnum with the transpositions option enabled.
 *  
 *  Notes:
 *  <ul>
 *    <li> This metric treats full unicode codepoints as characters
 *    <li> This metric scales raw edit distances into a floating point score
 *         based upon the shortest of the two terms
 *    <li> Transpositions of two adjacent codepoints are treated as primitive 
 *         edits.
 *    <li> Edits are applied in parallel: for example, "ab" and "bca" have 
 *         distance 3.
 *  </ul>
 *  
 *  NOTE: this class is not particularly efficient. It is only intended
 *  for merging results from multiple DirectSpellCheckers.
 */
public final class LuceneLevenshteinDistance implements StringDistance {

    /**
     * Creates a new comparator, mimicing the behavior of Lucene's internal
     * edit distance.
     */
    public LuceneLevenshteinDistance() {
    }

    @Override
    public float getDistance(String target, String other) {
        IntsRef targetPoints;
        IntsRef otherPoints;
        int n;
        int d[][]; // cost array

        // NOTE: if we cared, we could 3*m space instead of m*n space, similar to 
        // what LevenshteinDistance does, except cycling thru a ring of three 
        // horizontal cost arrays... but this comparator is never actually used by 
        // DirectSpellChecker, it's only used for merging results from multiple shards 
        // in "distributed spellcheck", and it's inefficient in other ways too...

        // cheaper to do this up front once
        targetPoints = toIntsRef(target);
        otherPoints = toIntsRef(other);
        n = targetPoints.length;
        final int m = otherPoints.length;
        d = new int[n + 1][m + 1];

        if (n == 0 || m == 0) {
            if (n == m) {
                return 0;
            } else {
                return Math.max(n, m);
            }
        }

        // indexes into strings s and t
        int i; // iterates through s
        int j; // iterates through t

        int t_j; // jth character of t

        int cost; // cost

        for (i = 0; i <= n; i++) {
            d[i][0] = i;
        }

        for (j = 0; j <= m; j++) {
            d[0][j] = j;
        }

        for (j = 1; j <= m; j++) {
            t_j = otherPoints.ints[j - 1];

            for (i = 1; i <= n; i++) {
                cost = targetPoints.ints[i - 1] == t_j ? 0 : 1;
                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
                d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
                // transposition
                if (i > 1 && j > 1 && targetPoints.ints[i - 1] == otherPoints.ints[j - 2]
                        && targetPoints.ints[i - 2] == otherPoints.ints[j - 1]) {
                    d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
                }
            }
        }

        return 1.0f - ((float) d[n][m] / Math.min(m, n));
    }

    private static IntsRef toIntsRef(String s) {
        IntsRef ref = new IntsRef(s.length()); // worst case
        int utf16Len = s.length();
        for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
            cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
        }
        return ref;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (null == obj)
            return false;
        return (getClass() == obj.getClass());
    }

}