com.google.errorprone.names.TermEditDistance.java Source code

Java tutorial

Introduction

Here is the source code for com.google.errorprone.names.TermEditDistance.java

Source

/*
 * Copyright 2017 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.errorprone.names;

import blogspot.software_and_algorithms.stern_library.optimization.HungarianAlgorithm;
import com.google.common.collect.ImmutableList;
import java.util.function.BiFunction;
import java.util.stream.DoubleStream;

/**
 * A utility class for finding the distance between two identifiers. Each identifier is split into
 * its constituent terms (based on camel case or underscore naming conventions). Then the edit
 * distance between each term is computed and the minimum cost assignment is found.
 */
public class TermEditDistance {

    /** Computes the distance between a source term and a target term. */
    private final BiFunction<String, String, Double> editDistanceFn;

    /** Computes the worst case distance between two terms, given only their lengths. */
    private final BiFunction<Integer, Integer, Double> maxDistanceFn;

    /**
     * Creates a TermEditDistance Object
     *
     * @param editDistanceFn function to compute the distance between two terms
     * @param maxDistanceFn function to compute the worst case distance between two terms; it is
     *     called with the lengths of the two terms, and with a length of 0 to price a term that is
     *     left without a partner
     */
    public TermEditDistance(BiFunction<String, String, Double> editDistanceFn,
            BiFunction<Integer, Integer, Double> maxDistanceFn) {
        this.editDistanceFn = editDistanceFn;
        this.maxDistanceFn = maxDistanceFn;
    }

    /**
     * Creates a TermEditDistance that measures terms with {@link LevenshteinEditDistance}: the edit
     * distance is requested with {@code isCaseSensitive} set to {@code false}, and the worst case
     * comes from {@code LevenshteinEditDistance.getWorstCaseEditDistance}.
     */
    public TermEditDistance() {
        this((s, t) -> (double) LevenshteinEditDistance.getEditDistance(s, t, /*isCaseSensitive*/ false),
                (s, t) -> (double) LevenshteinEditDistance.getWorstCaseEditDistance(s, t));
    }

    /**
     * Computes the distance between two identifiers, normalized by the worst case cost.
     *
     * <p>Both identifiers are split into lowercase terms, the minimum cost assignment of source
     * terms to target terms is found, and the cost of that assignment is divided by the cost the
     * same assignment would have if every pairing were as bad as {@code maxDistanceFn} allows.
     *
     * @param source the identifier to measure from
     * @param target the identifier to measure to
     * @return the assignment cost divided by the worst case cost of the same assignment, or
     *     {@code 0.0} when the assignment cost is zero (including the case where the worst case
     *     cost is also zero, which would otherwise produce {@code NaN})
     */
    public double getNormalizedEditDistance(String source, String target) {

        ImmutableList<String> sourceTerms = NamingConventions.splitToLowercaseTerms(source);
        ImmutableList<String> targetTerms = NamingConventions.splitToLowercaseTerms(target);

        // costMatrix[s][t] is the edit distance between source term s and target term t
        double[][] costMatrix = sourceTerms.stream()
                .map(s -> targetTerms.stream().mapToDouble(t -> editDistanceFn.apply(s, t)).toArray())
                .toArray(double[][]::new);

        // worstCaseMatrix[s][t] is the worst case distance between source term s and target term t
        double[][] worstCaseMatrix = sourceTerms.stream().map(String::length).map(s -> targetTerms.stream()
                .map(String::length).mapToDouble(t -> maxDistanceFn.apply(s, t)).toArray())
                .toArray(double[][]::new);

        // cost of leaving a source term without a partner
        double[] sourceTermDeletionCosts = sourceTerms.stream().mapToDouble(s -> maxDistanceFn.apply(s.length(), 0))
                .toArray();

        // cost of leaving a target term without a partner
        double[] targetTermAdditionCosts = targetTerms.stream().mapToDouble(t -> maxDistanceFn.apply(0, t.length()))
                .toArray();

        // this is an array of assignments of source terms to target terms. If assignments[i] contains
        // the value j this means that source term i has been assigned to target term j
        // There will be one entry in cost for each source term:
        // - If there are more source terms than target terms then some will be unassigned - value -1
        // - If there are a fewer source terms than target terms then some target terms will not be
        //    referenced in the array
        // With no source terms there is nothing to assign, so the solver is not handed a matrix
        // with zero rows; the empty array still has one entry per source term.
        int[] assignments = sourceTerms.isEmpty()
                ? new int[0]
                : new HungarianAlgorithm(costMatrix).execute();
        double assignmentCost = computeCost(assignments, costMatrix, sourceTermDeletionCosts,
                targetTermAdditionCosts);

        double maxCost = computeCost(assignments, worstCaseMatrix, sourceTermDeletionCosts,
                targetTermAdditionCosts);

        // A zero assignment cost means the identifiers do not differ under editDistanceFn. Returning
        // early avoids 0.0 / 0.0 (NaN) when the worst case cost is zero as well.
        if (assignmentCost == 0.0) {
            return 0.0;
        }
        return assignmentCost / maxCost;
    }

    /**
     * Compute the total cost of this assignment including the costs of unassigned source and target
     * terms.
     *
     * @param assignments for each source term index, the assigned target term index or -1
     * @param costMatrix cost of pairing source term {@code [s]} with target term {@code [t]}
     * @param sourceTermDeletionCosts cost of leaving each source term unassigned
     * @param targetTermAdditionCosts cost of leaving each target term unassigned
     * @return the sum of the pairing costs and the costs of every unassigned term
     */
    private static double computeCost(int[] assignments, double[][] costMatrix, double[] sourceTermDeletionCosts,
            double[] targetTermAdditionCosts) {

        // We need to sum the costs of each assigned pair, each unassigned source term, and each
        // unassigned target term.

        // Start with the total cost of _not_ using all the target terms, then when we use one we'll
        // remove it from this total.
        double totalCost = DoubleStream.of(targetTermAdditionCosts).sum();
        for (int sourceTermIndex = 0; sourceTermIndex < assignments.length; sourceTermIndex++) {
            int targetTermIndex = assignments[sourceTermIndex];
            if (targetTermIndex == -1) {
                // not using this source term
                totalCost += sourceTermDeletionCosts[sourceTermIndex];
            } else {
                // add the cost of the assignments
                totalCost += costMatrix[sourceTermIndex][targetTermIndex];

                // we are using this target term and so we should remove the cost of leaving it unused
                totalCost -= targetTermAdditionCosts[targetTermIndex];
            }
        }
        return totalCost;
    }
}