Calculate similarity between two String - Java java.lang

Java examples for java.lang:String Distance

Description

Calculates a normalised similarity score (between 0.0 and 1.0) for two strings, based on the Needleman-Wunsch distance.

Demo Code

/**
 * Copyright 2010 Molindo GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
//package com.java2s;

/**
 * String similarity based on the Needleman-Wunsch distance.
 *
 * <p>Substituting one character for a different one costs 1, matching
 * characters cost 0, and inserting or deleting a character (a gap) costs
 * {@link #GAP_COST}. The distance is normalised into a similarity score in
 * the range 0.0 (nothing in common) to 1.0 (identical).
 */
public class Main {
    public static void main(String[] argv) {
        String string1 = "java2s.com";
        String string2 = "java2s.com";
        System.out.println(similarity(string1, string2));
    }

    /** Cost of inserting or deleting a single character (a gap). */
    private static final int GAP_COST = 2;

    /**
     * Calculates the similarity of two strings without a lower threshold.
     *
     * @param string1
     *            the first string, must not be {@code null}
     * @param string2
     *            the second string, must not be {@code null}
     * @return a value between 0.0 (completely different) and 1.0 (equal)
     */
    public static float similarity(final String string1,
            final String string2) {
        return similarity(string1, string2, 0.0f);
    }

    /**
     * Calculates the similarity of two strings, giving up early once it is
     * clear that the result would fall below {@code min}.
     *
     * @param string1
     *            the first string, must not be {@code null}
     * @param string2
     *            the second string, must not be {@code null}
     * @param min
     *            the minimum similarity of interest; any result below this
     *            value is reported as 0.0
     * @return 1.0 for equal strings, otherwise the normalised similarity, or
     *         0.0 if that similarity is lower than {@code min}
     */
    public static float similarity(final String string1,
            final String string2, final float min) {
        if (string1.equals(string2)) {
            return 1.0f;
        }

        final int l1 = string1.length();
        final int l2 = string2.length();

        // The largest possible distance: building the longer string from ""
        // using gaps only. Never 0 here, because two empty strings are equal
        // and have already returned above.
        final int maxValue = Math.max(l1, l2) * GAP_COST;

        // The largest distance that still yields a similarity >= min.
        final float maxGap = (1.0f - min) * maxValue;

        // The length difference must be bridged by gaps, so it is a lower
        // bound for the distance - abort if this alone already exceeds maxGap.
        if (Math.abs(l1 - l2) * GAP_COST > maxGap) {
            return 0.0f;
        }

        final int distance = unnormalisedSimilarity(string1, string2, maxGap);

        // actual / possible distance, inverted to get a 0-1 similarity
        final float normalised = 1.0f - distance / (float) maxValue;
        return normalised < min ? 0.0f : normalised;
    }

    /**
     * Calculates the Needleman-Wunsch distance of two strings without an
     * upper limit, i.e. the full matrix is always evaluated.
     *
     * @param string1
     *            the first string, must not be {@code null}
     * @param string2
     *            the second string, must not be {@code null}
     * @return the Needleman-Wunsch distance for the given strings
     */
    public static int unnormalisedSimilarity(final String string1,
            final String string2) {
        return unnormalisedSimilarity(string1, string2, Float.MAX_VALUE);
    }

    /**
     * Implements the Needleman-Wunsch distance function with an optional
     * early abort.
     *
     * <p>The smallest value of a matrix row is a lower bound for the final
     * distance. As soon as that lower bound exceeds {@code maxGap} the
     * calculation stops and the lower bound is returned instead of the exact
     * distance.
     *
     * @param string1
     *            the first string, must not be {@code null}
     * @param string2
     *            the second string, must not be {@code null}
     * @param maxGap
     *            the largest distance the caller is interested in
     * @return the Needleman-Wunsch distance for the given strings, or - if the
     *         distance is known to exceed {@code maxGap} - some value greater
     *         than {@code maxGap} that does not exceed the real distance
     */
    public static int unnormalisedSimilarity(final String string1,
            final String string2, final float maxGap) {
        final char[] s = string1.toCharArray();
        final char[] t = string2.toCharArray();
        final int n = s.length;
        final int m = t.length;

        // An empty string can only be aligned using gaps, each of which
        // costs GAP_COST.
        if (n == 0) {
            return m * GAP_COST;
        }
        if (m == 0) {
            return n * GAP_COST;
        }

        // matrix (n+1)x(m+1); d[i][j] is the distance between the first i
        // characters of s and the first j characters of t
        final int[][] d = new int[n + 1][m + 1];

        // first column / first row: aligning a prefix with "" takes one gap
        // per character
        for (int i = 0; i <= n; i++) {
            d[i][0] = i * GAP_COST;
        }
        for (int j = 0; j <= m; j++) {
            d[0][j] = j * GAP_COST;
        }

        // fill the rest of the table, taking the cheapest of the three
        // possible steps (gap in t, gap in s, substitution/match)
        for (int i = 1; i <= n; i++) {
            // Start from the row's first cell so that rowMin really is the
            // minimum of the whole row. Starting from 0 would make the abort
            // below unreachable, as no cell is ever negative.
            int rowMin = d[i][0];
            for (int j = 1; j <= m; j++) {
                final int substitution = d[i - 1][j - 1]
                        + cost(s, i - 1, t, j - 1);

                d[i][j] = min3(d[i - 1][j] + GAP_COST, d[i][j - 1]
                        + GAP_COST, substitution);

                if (d[i][j] < rowMin) {
                    rowMin = d[i][j];
                }
            }
            if (rowMin > maxGap) {
                // every alignment passes through this row, so the final
                // distance cannot be smaller than rowMin - give up
                return rowMin;
            }
        }

        // bottom right of the matrix holds the distance of the full strings
        return d[n][m];
    }

    /**
     * get cost between characters where d(i,j) = 1 if i does not equals j, 0 if
     * i equals j.
     * 
     * @param str1
     *            - the string1 to evaluate the cost
     * @param string1Index
     *            - the index within the string1 to test
     * @param str2
     *            - the string2 to evaluate the cost
     * @param string2Index
     *            - the index within the string2 to test
     * @return the cost of a given substitution d(i,j) where d(i,j) = 1 if i!=j,
     *         0 if i==j
     */
    public static final int cost(final char[] str1, final int string1Index,
            final char[] str2, final int string2Index) {
        return str1[string1Index] == str2[string2Index] ? 0 : 1;
    }

    /**
     * @return the smallest of the three given values
     */
    private static int min3(final int x, final int y, final int z) {
        return Math.min(x, Math.min(y, z));
    }
}

Related Tutorials