Example usage for org.apache.commons.lang StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang StringUtils getLevenshteinDistance

Introduction

On this page you can find example usages of org.apache.commons.lang.StringUtils.getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(String s, String t) 

Source Link

Document

Find the Levenshtein distance between two Strings.

Usage

From source file:org.openmrs.module.muzima.handler.JsonRegistrationQueueDataHandler.java

/**
 * Searches {@code patients} for a record that appears to describe the same person as
 * {@code unsavedPatient}.
 *
 * <p>A candidate matches when all of the following hold:
 * <ul>
 *   <li>both patients have a non-blank full name,</li>
 *   <li>their genders are equal ignoring case,</li>
 *   <li>both have a birthdate and the birthdates fall on the same day,</li>
 *   <li>the lower-cased given names and the lower-cased family names are each fewer than
 *       3 edits apart (Levenshtein distance).</li>
 * </ul>
 *
 * <p>Candidates lacking a person name, a given name or a family name are skipped:
 * {@code StringUtils.getLevenshteinDistance} rejects {@code null} arguments with an
 * {@code IllegalArgumentException}, which would otherwise abort the whole search.
 *
 * @param patients       the saved patients to search
 * @param unsavedPatient the incoming patient to look for
 * @return the first matching saved patient, or {@code null} when none matches
 */
private Patient findPatient(final List<Patient> patients, final Patient unsavedPatient) {
    for (Patient patient : patients) {
        PersonName savedPersonName = patient.getPersonName();
        PersonName unsavedPersonName = unsavedPatient.getPersonName();
        // Without a name on both sides there is nothing to compare.
        if (savedPersonName == null || unsavedPersonName == null) {
            continue;
        }
        if (StringUtils.isBlank(savedPersonName.getFullName())
                || StringUtils.isBlank(unsavedPersonName.getFullName())) {
            continue;
        }
        if (!StringUtils.equalsIgnoreCase(patient.getGender(), unsavedPatient.getGender())) {
            continue;
        }
        if (patient.getBirthdate() == null || unsavedPatient.getBirthdate() == null
                || !DateUtils.isSameDay(patient.getBirthdate(), unsavedPatient.getBirthdate())) {
            continue;
        }

        String savedGivenName = savedPersonName.getGivenName();
        String unsavedGivenName = unsavedPersonName.getGivenName();
        String savedFamilyName = savedPersonName.getFamilyName();
        String unsavedFamilyName = unsavedPersonName.getFamilyName();
        // A missing name part cannot be fuzzy-matched; treat it as "no match" rather than
        // letting getLevenshteinDistance throw on null.
        if (savedGivenName == null || unsavedGivenName == null
                || savedFamilyName == null || unsavedFamilyName == null) {
            continue;
        }

        int givenNameEditDistance = StringUtils.getLevenshteinDistance(
                StringUtils.lowerCase(savedGivenName), StringUtils.lowerCase(unsavedGivenName));
        int familyNameEditDistance = StringUtils.getLevenshteinDistance(
                StringUtils.lowerCase(savedFamilyName), StringUtils.lowerCase(unsavedFamilyName));
        // Tolerate small spelling differences in both name parts.
        if (givenNameEditDistance < 3 && familyNameEditDistance < 3) {
            return patient;
        }
    }
    return null;
}

From source file:org.openmrs.module.muzimaregistration.handler.RegistrationQueueDataHandler.java

/**
 * Searches {@code patients} for a record that appears to describe the same person as
 * {@code unsavedPatient}.
 *
 * <p>Each candidate is checked in two stages:
 * <ol>
 *   <li>If both patients carry a non-blank identifier and the identifiers are equal after
 *       lower-casing, the candidate is returned immediately.</li>
 *   <li>Otherwise the candidate is returned when both have a non-blank full name, their
 *       genders are equal ignoring case, and the lower-cased given names and lower-cased
 *       family names are each fewer than 3 edits apart (Levenshtein distance).</li>
 * </ol>
 *
 * <p>A missing identifier, person name, given name or family name makes the corresponding
 * stage a non-match instead of raising an exception;
 * {@code StringUtils.getLevenshteinDistance} rejects {@code null} arguments.
 *
 * @param patients       the saved patients to search
 * @param unsavedPatient the incoming patient to look for
 * @return the first matching saved patient, or {@code null} when none matches
 */
private Patient findPatient(final List<Patient> patients, final Patient unsavedPatient) {
    for (Patient patient : patients) {
        PatientIdentifier savedIdentifier = patient.getPatientIdentifier();
        PatientIdentifier unsavedIdentifier = unsavedPatient.getPatientIdentifier();
        if (savedIdentifier != null && unsavedIdentifier != null
                && StringUtils.isNotBlank(savedIdentifier.getIdentifier())
                && StringUtils.isNotBlank(unsavedIdentifier.getIdentifier())) {
            // exact match on the patient identifier, they are the same patient.
            // (An edit distance of 0 is the same as equality, so compare directly.)
            String savedId = StringUtils.lowerCase(savedIdentifier.getIdentifier());
            String unsavedId = StringUtils.lowerCase(unsavedIdentifier.getIdentifier());
            if (savedId.equals(unsavedId)) {
                return patient;
            }
        }

        // Fall back to matching on person name and gender (birthdate is not considered here).
        PersonName savedPersonName = patient.getPersonName();
        PersonName unsavedPersonName = unsavedPatient.getPersonName();
        if (savedPersonName == null || unsavedPersonName == null) {
            continue;
        }
        if (StringUtils.isBlank(savedPersonName.getFullName())
                || StringUtils.isBlank(unsavedPersonName.getFullName())) {
            continue;
        }
        if (!StringUtils.equalsIgnoreCase(patient.getGender(), unsavedPatient.getGender())) {
            continue;
        }

        String savedGivenName = savedPersonName.getGivenName();
        String unsavedGivenName = unsavedPersonName.getGivenName();
        String savedFamilyName = savedPersonName.getFamilyName();
        String unsavedFamilyName = unsavedPersonName.getFamilyName();
        // A missing name part cannot be fuzzy-matched; treat it as "no match" rather than
        // letting getLevenshteinDistance throw on null.
        if (savedGivenName == null || unsavedGivenName == null
                || savedFamilyName == null || unsavedFamilyName == null) {
            continue;
        }

        int givenNameEditDistance = StringUtils.getLevenshteinDistance(
                StringUtils.lowerCase(savedGivenName), StringUtils.lowerCase(unsavedGivenName));
        int familyNameEditDistance = StringUtils.getLevenshteinDistance(
                StringUtils.lowerCase(savedFamilyName), StringUtils.lowerCase(unsavedFamilyName));
        // Tolerate small spelling differences in both name parts.
        if (givenNameEditDistance < 3 && familyNameEditDistance < 3) {
            return patient;
        }
    }
    return null;
}

From source file:org.opensextant.extractors.geo.PlaceCandidate.java

/**
 * Produces a name-match goodness score for a candidate place, nominally in the range 0 to 1.0.
 * The value is not clamped: the bonuses below can lift it above 1.0, and a large edit
 * distance or the diacritics penalty can push it below 0.
 *
 * Trivial examples of name matching:
 *
 * <pre>
 *  given some patterns, 'geo' match Text
 *
 *   case 1. 'Alberta' matches ALBERTA or alberta just fine.
 *   case 2. 'La' matches LA, however, knowing "LA" is an acronym/abbreviation
 *       adds to the score of any geo that actually is "LA"
 *   case 3. 'Afghanestan' matches Afghanistan, but decrement because it is not perfectly spelled.
 *
 * </pre>
 *
 * @param g the place whose normalized name is compared with this candidate's normalized text
 * @return (text length - edit distance + adjustments) divided by the text length
 */
protected double scoreName(Place g) {
    final String candidateText = getTextnorm();
    final int textLength = candidateText.length();
    final int edits = StringUtils.getLevenshteinDistance(candidateText, g.getNamenorm());

    // Net bonus/penalty applied on top of the raw (length - edits) score.
    int adjustment = 0;

    // Upper-case text that lines up with an abbreviation or an upper-case name earns a point.
    if (isUpper() && (g.isAbbreviation() || TextUtils.isUpper(g.getName()))) {
        adjustment++;
    }

    final boolean textIsAscii = isASCII();
    final boolean nameIsAscii = g.isASCIIName();
    if (textIsAscii != nameIsAscii) {
        // Mismatch in name diacritics downgrades name score here.
        adjustment--;
    } else if (textIsAscii) {
        // Both sides are plain ASCII.
        adjustment++;
    }

    final int total = textLength - edits + adjustment;
    return (float) total / textLength;
}

From source file:org.oscarehr.match.vacancy.VacancyTemplateData.java

/**
 * Scores how well {@code value} satisfies this template entry, returning a weight between
 * 0 (no match) and 100 (full match).
 *
 * <p>Evaluation order:
 * <ol>
 *   <li>A {@code weight} of 0 is normalised to 1 (side effect on this object).</li>
 *   <li>For the gender parameter, a non-null value containing any entry of
 *       {@code transaGender} (after lower-casing the value) scores 100.</li>
 *   <li>For range parameters: no configured ranges scores 100; a {@code null} value scores 0;
 *       a non-numeric value that contains, or is contained in, one of the range strings
 *       scores 100; a value that cannot be parsed as an integer scores 0; a parsed value
 *       inside any range scores 100. A parsed value outside every range falls through to
 *       the value comparison below.</li>
 *   <li>Otherwise a {@code null} value scores 0, an empty {@code values} list scores 100,
 *       and the result is the best per-entry weight, where an entry weighs
 *       {@code 100 / distance} (Levenshtein distance, case-insensitive; distance 0 counts
 *       as 1) and a {@code null} entry weighs {@code MAX_WEIGHT} (or 100 when it is the
 *       only entry).</li>
 * </ol>
 *
 * @param value the client value to test; may be {@code null}
 * @return the match weight
 */
public int matches(String value) {
    if (this.weight == 0) {
        this.weight = 1;
    }
    if (GENDER.equalsIgnoreCase(param)) {
        if (value != null) {
            // Lower-case once instead of once per configured gender.
            String loweredGenderValue = value.toLowerCase();
            for (String gender : transaGender) {
                if (loweredGenderValue.contains(gender)) {
                    return 100;
                }
            }
        }
    }
    if (this.range) {
        if (ranges.isEmpty()) {
            return 100;
        }
        // A null value can neither match a range string nor be parsed; previously this
        // path dereferenced value and threw a NullPointerException.
        if (value == null) {
            return 0;
        }
        if (!StringUtils.isNumeric(value)) {
            for (Range candidateRange : ranges) {
                for (String rangeString : candidateRange.rangeString) {
                    if (value.contains(rangeString) || rangeString.contains(value)) {
                        return 100;
                    }
                }
            }
        }
        Integer val = null;
        try {
            val = Integer.valueOf(value);
        } catch (NumberFormatException e) {
            logger.error("Error", e);
        }
        if (val == null) {
            return 0;
        }
        for (Range rangeVal : ranges) {
            if (rangeVal.isInRange(val)) {
                return 100;
            }
        }
    }
    if (value == null) {
        return 0;
    }
    if (values.isEmpty()) {
        return 100;
    }
    String loweredValue = value.toLowerCase();
    // values is non-empty here, so the loop always replaces this initial value.
    int bestWeight = Integer.MIN_VALUE;
    for (String val : values) {
        if (val == null) {
            if (values.size() == 1) {
                return 100;
            }
            bestWeight = Math.max(bestWeight, MAX_WEIGHT);
        } else {
            int distance = StringUtils.getLevenshteinDistance(val.toLowerCase(), loweredValue);
            // Identical strings (distance 0) get the full 100; avoid dividing by zero.
            int calculatedWeight = 100 / (distance == 0 ? 1 : distance);
            bestWeight = Math.max(bestWeight, calculatedWeight);
        }
    }
    return bestWeight;
}

From source file:org.pentaho.di.core.row.ValueDataUtil.java

/**
 * Levenshtein distance (LD) is a measure of the similarity between two strings, which we will refer to as the source
 * string (s) and the target string (t). The distance is the number of deletions, insertions, or substitutions
 * required to transform s into t.
 *
 * <p>Both values are compared through their {@code toString()} representation.
 *
 * @param metaA metadata of the first value (not consulted by this method)
 * @param dataA the first value; may be {@code null}
 * @param metaB metadata of the second value (not consulted by this method)
 * @param dataB the second value; may be {@code null}
 * @return the Levenshtein distance between the two values, or {@code null} when either value is {@code null}
 */
public static Long getLevenshtein_Distance(ValueMetaInterface metaA, Object dataA, ValueMetaInterface metaB,
        Object dataB) {
    if (dataA == null || dataB == null) {
        return null;
    }
    // Long.valueOf replaces the deprecated Long(long) constructor and can reuse cached instances.
    return Long.valueOf((long) StringUtils.getLevenshteinDistance(dataA.toString(), dataB.toString()));
}

From source file:org.pentaho.di.trans.steps.fuzzymatch.FuzzyMatch.java

/**
 * Compares the main field of {@code row} with the key of every cached lookup row and
 * collects the keys whose distance lies between {@code data.minimalDistance} and
 * {@code data.maximalDistance} (both inclusive).
 *
 * <p>The distance algorithm is chosen by {@code meta.getAlgorithmType()}: Damerau-Levenshtein,
 * Needleman-Wunsch (absolute value of the score), or plain Levenshtein by default. Unless the
 * step is case sensitive, both strings are lower-cased before comparison.
 *
 * <p>When {@code meta.isGetCloserValue()} is set, only the closest match is kept (the first
 * one wins on ties) together with, optionally, its distance and the additional cached
 * fields. Otherwise all matching keys are concatenated into the first output slot, separated
 * by {@code data.valueSeparator}.
 *
 * @param row the input row being looked up
 * @return the output row; it is left as built by {@code buildEmptyRow()} when nothing matches
 * @throws KettleValueException if the main field cannot be read as a string
 */
private Object[] doDistance(Object[] row) throws KettleValueException {
    // Output row that matches are written into
    Object[] result = buildEmptyRow();

    Iterator<Object[]> cacheIterator = data.look.iterator();

    // Smallest distance accepted so far; -1 means nothing has been accepted yet
    long bestDistance = -1;

    String lookupvalue = getInputRowMeta().getString(row, data.indexOfMainField);

    while (cacheIterator.hasNext()) {
        Object[] cachedRow = cacheIterator.next();
        // The key is stored in the first slot of every cached row
        String cachedKey = (String) cachedRow[0];

        boolean ignoreCase = !meta.isCaseSensitive();
        String comparableKey = ignoreCase ? cachedKey.toLowerCase() : cachedKey;
        String comparableLookup = ignoreCase ? lookupvalue.toLowerCase() : lookupvalue;

        int current;
        switch (meta.getAlgorithmType()) {
        case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN:
            current = Utils.getDamerauLevenshteinDistance(comparableKey, comparableLookup);
            break;
        case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH:
            current = Math.abs((int) new NeedlemanWunsch().score(comparableKey, comparableLookup));
            break;
        default:
            current = StringUtils.getLevenshteinDistance(comparableKey, comparableLookup);
            break;
        }

        boolean withinBounds = data.minimalDistance <= current && current <= data.maximalDistance;
        if (!withinBounds) {
            continue;
        }

        if (!meta.isGetCloserValue()) {
            // Collect every matching key, joined by the configured separator
            result[0] = result[0] == null
                    ? cachedKey
                    : (String) result[0] + data.valueSeparator + cachedKey;
            continue;
        }

        // Closest-value mode: ignore candidates that do not beat the current best
        if (bestDistance != -1 && current >= bestDistance) {
            continue;
        }

        bestDistance = current;
        int slot = 0;
        result[slot++] = cachedKey;
        // Optionally expose the metric value
        if (data.addValueFieldName) {
            result[slot++] = bestDistance;
        }
        // Optionally copy the additional return values that follow the key in the cached row
        if (data.addAdditionalFields) {
            for (int i = 0; i < meta.getValue().length; i++) {
                result[slot + i] = cachedRow[i + 1];
            }
        }
    }

    return result;
}

From source file:org.sonar.java.checks.JunitMethodDeclarationCheck.java

/**
 * Tells whether two strings are close enough to be considered near-identical.
 *
 * @param expected the reference string
 * @param actual   the string to compare against the reference
 * @return {@code true} when the lengths differ by at most {@code MAX_STRING_DISTANCE} and the
 *         Levenshtein distance is strictly below {@code MAX_STRING_DISTANCE}
 */
@VisibleForTesting
protected boolean areVerySimilarStrings(String expected, String actual) {
    int lengthGap = Math.abs(expected.length() - actual.length());
    if (lengthGap > MAX_STRING_DISTANCE) {
        // Cheap rejection: skips the more expensive edit-distance computation.
        return false;
    }
    int editDistance = StringUtils.getLevenshteinDistance(expected, actual);
    return editDistance < MAX_STRING_DISTANCE;
}

From source file:org.talend.dataprep.transformation.actions.text.FuzzyMatching.java

/**
 * Checks whether {@code value} is within {@code sensitivity} edits of {@code reference}.
 *
 * @param value       the string under test
 * @param reference   the string to compare against
 * @param sensitivity the largest Levenshtein distance (inclusive) still counted as a match
 * @return {@code true} when the Levenshtein distance does not exceed {@code sensitivity}
 */
private boolean fuzzyMatches(String value, String reference, int sensitivity) {
    return StringUtils.getLevenshteinDistance(value, reference) <= sensitivity;
}

From source file:org.talend.dataquality.record.linkage.attribute.LevenshteinMatcher.java

/**
 * Computes a similarity weight from the Levenshtein distance of two strings, scaled by the
 * length of the longer one.
 *
 * @param str1 the first string
 * @param str2 the second string
 * @return {@code 1.0} when both strings are empty, otherwise
 *         {@code 1.0 - distance / max(length)}: 1.0 for identical strings, 0.0 when the
 *         distance equals the longer length
 */
public double getWeight(String str1, String str2) {
    // The longer length is the largest distance the two strings can have.
    final int longest = Math.max(str1.length(), str2.length());
    if (longest == 0) {
        // Both strings are empty and therefore identical.
        return 1.0;
    }
    final double edits = StringUtils.getLevenshteinDistance(str1, str2);
    // Turn the distance ratio into a similarity.
    return 1.0 - (edits / longest);
}

From source file:org.vivoweb.harvester.score.algorithm.NormalizedLevenshteinDifference.java

/**
 * Computes a similarity from the Levenshtein distance of two character sequences, scaled by
 * the length of the longer one.
 *
 * @param itemX the first sequence; must not be {@code null}
 * @param itemY the second sequence; must not be {@code null}
 * @return {@code (maxLength - distance) / maxLength}, or {@code 0} when both sequences are empty
 * @throws IllegalArgumentException if either argument is {@code null}
 */
@Override
public float calculate(CharSequence itemX, CharSequence itemY) {
    if (itemX == null) {
        throw new IllegalArgumentException("x cannot be null");
    }
    if (itemY == null) {
        throw new IllegalArgumentException("y cannot be null");
    }

    // Kept as a float so the arithmetic below is floating point.
    final float longest = Math.max(itemX.length(), itemY.length());
    if (longest == 0f) {
        return 0f;
    }

    final float edits = StringUtils.getLevenshteinDistance(itemX.toString(), itemY.toString());
    return (longest - edits) / longest;
}