de.cubeisland.engine.core.util.matcher.StringMatcher.java Source code

Java tutorial

Introduction

Here is the source code for de.cubeisland.engine.core.util.matcher.StringMatcher.java

Source

/**
 * This file is part of CubeEngine.
 * CubeEngine is licensed under the GNU General Public License Version 3.
 *
 * CubeEngine is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CubeEngine is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CubeEngine.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.cubeisland.engine.core.util.matcher;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import com.google.common.base.Functions;
import com.google.common.collect.Ordering;
import de.cubeisland.engine.core.CubeEngine;

/**
 * Fuzzy lookup of a search string inside a collection of candidate strings.
 * <p>
 * Candidates are rated by the edit distance computed by the
 * {@link DamerauLevenshteinAlgorithm} held in this instance and, for
 * {@link #matchString(String, Collection, boolean, int, double, int, double, int, double)},
 * additionally by substring position and by "percent correct" thresholds.
 */
public class StringMatcher {
    // NOTE(review): presumably the four arguments are the per-operation costs, all set to 1 — confirm
    // against DamerauLevenshteinAlgorithm.
    private final DamerauLevenshteinAlgorithm editDistance = new DamerauLevenshteinAlgorithm(1, 1, 1, 1);

    /**
     * Returns all matches with their editDistance, having an editDistance <= maxDistance
     * <p>
     * Candidates whose length differs from the length of {@code search} by more than
     * {@code maxDistance} are skipped without computing the edit distance.
     *
     * @param search the String to search for
     * @param in the Strings to match in
     * @param maxDistance the maximum editDistance; values lower than 1 log a warning and yield an empty map
     * @param ignoreCase whether the edit distance is computed ignoring case
     * @return a map of all matches sorted by their editDistance, ties broken by the natural order of the
     *         matched strings
     */
    public TreeMap<String, Integer> getMatches(String search, Collection<String> in, int maxDistance,
            boolean ignoreCase) {
        if (maxDistance < 1) {
            CubeEngine.getLog().warn(new Throwable(), "Checking EditDistance lower than 1!");
            return new TreeMap<>();
        }
        Map<String, Integer> matches = new HashMap<>();
        // The comparator looks the distance up in "matches". The default value keeps lookups with a
        // string that is not a match (get/containsKey on the returned map or its key set) from throwing:
        // unknown keys simply sort behind every real match.
        Ordering<String> comparator = Ordering.natural()
                .onResultOf(Functions.forMap(matches, Integer.MAX_VALUE))
                .compound(Ordering.natural());
        for (String target : in) {
            int lengthDifference = target.length() - search.length();
            if (lengthDifference > maxDistance || -lengthDifference > maxDistance) {
                continue; // too long/short to match
            }
            int distance;
            if (ignoreCase) {
                distance = this.editDistance.executeIgnoreCase(search, target);
            } else {
                distance = this.editDistance.execute(search, target);
            }
            if (distance <= maxDistance) {
                matches.put(target, distance);
            }
        }
        TreeMap<String, Integer> result = new TreeMap<>(comparator);
        result.putAll(matches);
        return result;
    }

    /**
     * Finds the best match for the search string using up to three checks, returning as soon as one
     * of them succeeds:
     * <ol>
     * <li>an exact (case-sensitive) occurrence of {@code search} in {@code in}</li>
     * <li>edit distance of the whole candidate, limited by {@code firstEditDistance} and
     * {@code firstMinPercentCorrect}</li>
     * <li>{@code search} occurring as a substring of a candidate, preferring the lowest index and then
     * the highest percentage of the candidate covered by {@code search}</li>
     * <li>edit distance between {@code search} and the first
     * {@code search.length() + secondEditDistance} characters of a candidate</li>
     * </ol>
     *
     * @param search the string to search for; {@code null} yields {@code null}
     * @param in the strings to match in; {@code null} or empty yields {@code null}
     * @param ignoreCase whether edit distance and substring checks ignore case
     * @param firstEditDistance the maximum edit distance for the whole-string check
     * @param firstMinPercentCorrect the minimum percentage of a candidate that has to be correct in the
     *        whole-string check
     * @param maxIndex the highest accepted index of {@code search} inside a candidate; a value lower
     *        than 0 disables the substring check
     * @param indexMinPercentCorrect the minimum percentage of a candidate that {@code search} has to
     *        cover in the substring check
     * @param secondEditDistance the maximum edit distance for the prefix check; a value lower than 1
     *        disables it
     * @param secondMinPercentCorrect the minimum percentage of a candidate that has to be correct in the
     *        prefix check
     * @return the matched string or {@code null} if nothing matched
     */
    public String matchString(String search, Collection<String> in, boolean ignoreCase, int firstEditDistance,
            double firstMinPercentCorrect, int maxIndex, double indexMinPercentCorrect, int secondEditDistance,
            double secondMinPercentCorrect) {
        if (search == null || in == null || in.isEmpty()) {
            return null;
        }
        if (in.contains(search)) {
            return search; // Direct Match
        }

        // 1st check: edit distance of the whole candidate, closest candidates first
        Map<String, Integer> firstEditDistanceCheck = this.getMatches(search, in, firstEditDistance, ignoreCase);
        int searchStringLength = search.length();
        for (Map.Entry<String, Integer> entry : firstEditDistanceCheck.entrySet()) {
            String candidate = entry.getKey();
            Integer distance = entry.getValue();
            double curPercentage = percentOf(candidate.length() - distance, candidate.length());
            if (curPercentage >= firstMinPercentCorrect) {
                CubeEngine.getLog().debug("1stDist FOUND: {} for {} (D:{}|{}%)", candidate, search, distance,
                        curPercentage);
                return candidate;
            }
            CubeEngine.getLog().debug("1stDist TO WEAK: {} for {} (D:{}|{}%)", candidate, search, distance,
                    curPercentage);
        }

        // Nothing found with EditDistance -> Check for starting strings
        String bestMatch = null;
        // Same locale as used for the candidates below, otherwise the comparison depends on the
        // default locale of the JVM.
        String needle = ignoreCase ? search.toLowerCase(Locale.ENGLISH) : search;

        if (maxIndex >= 0) { // Index lower than 0 -> NO CHECK
            int bestIndex = maxIndex;
            double bestPercentCorrect = indexMinPercentCorrect;
            for (String inList : in) {
                String haystack = ignoreCase ? inList.toLowerCase(Locale.ENGLISH) : inList;
                int currentIndex = haystack.indexOf(needle);
                if (currentIndex == -1) {
                    continue; // search is not contained in inList
                }
                double curPercentCorrect = percentOf(searchStringLength, inList.length());
                if (currentIndex < bestIndex && curPercentCorrect >= indexMinPercentCorrect) {
                    // found earlier in the candidate than the last match
                    CubeEngine.getLog().debug("Index: FOUND {} for {} (I:{}|{}%)", inList, needle, currentIndex,
                            curPercentCorrect);
                    bestIndex = currentIndex;
                    bestPercentCorrect = curPercentCorrect;
                    bestMatch = inList;
                } else if (currentIndex == bestIndex && curPercentCorrect >= bestPercentCorrect) {
                    // same position, but search covers at least as much of this candidate
                    CubeEngine.getLog().debug("Index: FOUND {} for {} (I:{}|{}%)", inList, needle, currentIndex,
                            curPercentCorrect);
                    bestPercentCorrect = curPercentCorrect;
                    bestMatch = inList;
                }
            }
        }
        if (bestMatch != null || secondEditDistance < 1 || searchStringLength < 3) {
            // found OR do not check secondEditDistance OR searchString shorter than 3
            return bestMatch;
        }

        // Still not found -> search for starting strings with typo
        int bestDistance = secondEditDistance + 1;
        double bestPercentCorrect = secondMinPercentCorrect;
        int prefixLength = searchStringLength + secondEditDistance;
        for (String inList : in) {
            if (inList.length() < prefixLength) {
                continue; // too short to take the prefix from
            }
            String subString = inList.substring(0, prefixLength);
            if (ignoreCase) {
                subString = subString.toLowerCase(Locale.ENGLISH);
            }
            int currentDistance = this.editDistance.execute(needle, subString);
            double curPercentCorrect = percentOf(searchStringLength - currentDistance, inList.length());
            if (currentDistance < bestDistance && curPercentCorrect >= secondMinPercentCorrect) {
                // Found light Typo at start of String
                CubeEngine.getLog().debug("2nDist: Found {}|{} for {} (D:{}|{}%)", inList, subString, needle,
                        currentDistance, (int) curPercentCorrect);
                bestDistance = currentDistance;
                bestMatch = inList;
                bestPercentCorrect = curPercentCorrect;
            } else if (currentDistance == bestDistance && bestPercentCorrect <= curPercentCorrect) {
                CubeEngine.getLog().debug("2nDist: Found {}|{} for {} (D:{}|{}%)", inList, subString, needle,
                        currentDistance, (int) curPercentCorrect);
                bestMatch = inList;
                bestPercentCorrect = curPercentCorrect;
            }
        }
        return bestMatch;
    }

    /**
     * Gets the best matches with given maxEditDistance ignoring case
     *
     * @param search the string to match
     * @param in the collection to match in
     * @param maxEditDistance the maximum editDistance
     * @return a sorted set of the best matches
     */
    public Set<String> getBestMatches(String search, Collection<String> in, int maxEditDistance) {
        return this.getMatches(search, in, maxEditDistance, true).keySet();
    }

    /**
     * Finds the best match ignoring case with the default limits: edit distance 1 with at least 40%
     * correct, substring index up to 3 with at least 10% coverage, prefix edit distance 1 with at
     * least 40% correct.
     *
     * @param search the string to search for
     * @param in the strings to match in
     * @return the matched string or {@code null} if nothing matched
     */
    public String matchString(String search, Collection<String> in) {
        return this.matchString(search, in, true, 1, 40, 3, 10, 1, 40);
    }

    /**
     * Finds the best match in the given strings using the same defaults as
     * {@link #matchString(String, Collection)}.
     *
     * @param search the string to search for
     * @param in the strings to match in; a {@code null} array yields {@code null}
     * @return the matched string or {@code null} if nothing matched
     */
    public String matchString(String search, String... in) {
        if (in == null) {
            return null; // same result as a null collection
        }
        return this.matchString(search, Arrays.asList(in));
    }

    /**
     * Computes {@code part} as a percentage of {@code whole} using floating point arithmetic.
     *
     * @param part the numerator
     * @param whole the denominator
     * @return the percentage, or 0 when {@code whole} is not positive (e.g. an empty candidate)
     */
    private static double percentOf(int part, int whole) {
        if (whole <= 0) {
            return 0.0;
        }
        return part * 100.0 / whole;
    }
}