List of usage examples for org.apache.commons.lang.StringUtils.getLevenshteinDistance
public static int getLevenshteinDistance(String s, String t)
Find the Levenshtein distance between two Strings.
From source file:org.openmrs.module.muzima.handler.JsonRegistrationQueueDataHandler.java
private Patient findPatient(final List<Patient> patients, final Patient unsavedPatient) { for (Patient patient : patients) { // match it using the person name and gender, what about the dob? PersonName savedPersonName = patient.getPersonName(); PersonName unsavedPersonName = unsavedPatient.getPersonName(); if (StringUtils.isNotBlank(savedPersonName.getFullName()) && StringUtils.isNotBlank(unsavedPersonName.getFullName())) { if (StringUtils.equalsIgnoreCase(patient.getGender(), unsavedPatient.getGender())) { if (patient.getBirthdate() != null && unsavedPatient.getBirthdate() != null && DateUtils.isSameDay(patient.getBirthdate(), unsavedPatient.getBirthdate())) { String savedGivenName = savedPersonName.getGivenName(); String unsavedGivenName = unsavedPersonName.getGivenName(); int givenNameEditDistance = StringUtils.getLevenshteinDistance( StringUtils.lowerCase(savedGivenName), StringUtils.lowerCase(unsavedGivenName)); String savedFamilyName = savedPersonName.getFamilyName(); String unsavedFamilyName = unsavedPersonName.getFamilyName(); int familyNameEditDistance = StringUtils.getLevenshteinDistance( StringUtils.lowerCase(savedFamilyName), StringUtils.lowerCase(unsavedFamilyName)); if (givenNameEditDistance < 3 && familyNameEditDistance < 3) { return patient; }/*ww w. j a va2s . com*/ } } } } return null; }
From source file:org.openmrs.module.muzimaregistration.handler.RegistrationQueueDataHandler.java
private Patient findPatient(final List<Patient> patients, final Patient unsavedPatient) { for (Patient patient : patients) { PatientIdentifier savedIdentifier = patient.getPatientIdentifier(); PatientIdentifier unsavedIdentifier = unsavedPatient.getPatientIdentifier(); if (StringUtils.isNotBlank(savedIdentifier.getIdentifier()) && StringUtils.isNotBlank(unsavedIdentifier.getIdentifier())) { int editDistance = StringUtils.getLevenshteinDistance( StringUtils.lowerCase(savedIdentifier.getIdentifier()), StringUtils.lowerCase(unsavedIdentifier.getIdentifier())); // exact match on the patient identifier, they are the same patient. if (editDistance == 0) { return patient; }//w w w.j av a 2 s . c o m } // match it using the person name and gender, what about the dob? PersonName savedPersonName = patient.getPersonName(); PersonName unsavedPersonName = unsavedPatient.getPersonName(); if (StringUtils.isNotBlank(savedPersonName.getFullName()) && StringUtils.isNotBlank(unsavedPersonName.getFullName())) { if (StringUtils.equalsIgnoreCase(patient.getGender(), unsavedPatient.getGender())) { String savedGivenName = savedPersonName.getGivenName(); String unsavedGivenName = unsavedPersonName.getGivenName(); int givenNameEditDistance = StringUtils.getLevenshteinDistance( StringUtils.lowerCase(savedGivenName), StringUtils.lowerCase(unsavedGivenName)); String savedFamilyName = savedPersonName.getFamilyName(); String unsavedFamilyName = unsavedPersonName.getFamilyName(); int familyNameEditDistance = StringUtils.getLevenshteinDistance( StringUtils.lowerCase(savedFamilyName), StringUtils.lowerCase(unsavedFamilyName)); if (givenNameEditDistance < 3 && familyNameEditDistance < 3) { return patient; } } } } return null; }
From source file:org.opensextant.extractors.geo.PlaceCandidate.java
/** * Produce a goodness score in the range 0 to 1.0 * //from ww w. j a va 2 s . c om * Trivial examples of name matching: * * <pre> * given some patterns, 'geo' match Text * * case 1. 'Alberta' matches ALBERTA or alberta just fine. * case 2. 'La' matches LA, however, knowing "LA" is a acronym/abbreviation * adds to the score of any geo that actually is "LA" * case 3. 'Afghanestan' matches Afghanistan, but decrement because it is not perfectly spelled. * * </pre> * * @param g * @return */ protected double scoreName(Place g) { int startingScore = getTextnorm().length(); int editDist = StringUtils.getLevenshteinDistance(getTextnorm(), g.getNamenorm()); int score = startingScore - editDist; if (isUpper() && (g.isAbbreviation() || TextUtils.isUpper(g.getName()))) { ++score; } // Mismatch in name diacritics downgrades name score here. if ((isASCII() && !g.isASCIIName()) || (!isASCII() && g.isASCIIName())) { --score; } if (isASCII() && g.isASCIIName()) { ++score; } return (float) score / startingScore; }
From source file:org.oscarehr.match.vacancy.VacancyTemplateData.java
public int matches(String value) { if (this.weight == 0) { this.weight = 1; }/*from ww w. j a va 2 s. c o m*/ if (GENDER.equalsIgnoreCase(param)) { if (value != null) { for (String gender : transaGender) { if (value.toLowerCase().contains(gender)) { return 100; } } } } if (this.range) { if (ranges.isEmpty()) { return 100; } if (!StringUtils.isNumeric(value)) { for (Range range : ranges) { for (String rangeString : range.rangeString) { if (value.contains(rangeString) || rangeString.contains(value)) { return 100; } } } } Integer val = null; try { val = Integer.valueOf(value); } catch (Exception e) { logger.error("Error", e); } if (val != null) { for (Range rangeVal : ranges) { if (rangeVal.isInRange(val)) { return 100; } } } else { return 0; } } String valueToMatch = value; if (valueToMatch == null) { return 0; } if (values.isEmpty()) { return 100; } TreeSet<Integer> weights = new TreeSet<Integer>(); for (String val : values) { if (val == null) { if (values.size() == 1) { return 100; } weights.add(MAX_WEIGHT); } else { int distance = StringUtils.getLevenshteinDistance(val.toLowerCase(), valueToMatch.toLowerCase()); int calculatedWeight = 100 / (distance == 0 ? 1 : distance); weights.add(calculatedWeight); } } return weights.last(); }
From source file:org.pentaho.di.core.row.ValueDataUtil.java
/** * Levenshtein distance (LD) is a measure of the similarity between two strings, which we will refer to as the source * string (s) and the target string (t). The distance is the number of deletions, insertions, or substitutions * required to transform s into t.//from www . ja v a2 s.co m */ public static Long getLevenshtein_Distance(ValueMetaInterface metaA, Object dataA, ValueMetaInterface metaB, Object dataB) { if (dataA == null || dataB == null) { return null; } return new Long(StringUtils.getLevenshteinDistance(dataA.toString(), dataB.toString())); }
From source file:org.pentaho.di.trans.steps.fuzzymatch.FuzzyMatch.java
private Object[] doDistance(Object[] row) throws KettleValueException { // Reserve room Object[] rowData = buildEmptyRow(); Iterator<Object[]> it = data.look.iterator(); long distance = -1; // Object o=row[data.indexOfMainField]; String lookupvalue = getInputRowMeta().getString(row, data.indexOfMainField); while (it.hasNext()) { // Get cached row data Object[] cachedData = it.next(); // Key value is the first value String cacheValue = (String) cachedData[0]; int cdistance = -1; String usecacheValue = cacheValue; String uselookupvalue = lookupvalue; if (!meta.isCaseSensitive()) { usecacheValue = cacheValue.toLowerCase(); uselookupvalue = lookupvalue.toLowerCase(); }/* w w w .j av a 2 s .c om*/ switch (meta.getAlgorithmType()) { case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN: cdistance = Utils.getDamerauLevenshteinDistance(usecacheValue, uselookupvalue); break; case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH: cdistance = Math.abs((int) new NeedlemanWunsch().score(usecacheValue, uselookupvalue)); break; default: cdistance = StringUtils.getLevenshteinDistance(usecacheValue, uselookupvalue); break; } if (data.minimalDistance <= cdistance && cdistance <= data.maximalDistance) { if (meta.isGetCloserValue()) { if (cdistance < distance || distance == -1) { // Get closer value // minimal distance distance = cdistance; int index = 0; rowData[index++] = cacheValue; // Add metric value? if (data.addValueFieldName) { rowData[index++] = distance; } // Add additional return values? if (data.addAdditionalFields) { for (int i = 0; i < meta.getValue().length; i++) { int nr = i + 1; int nf = i + index; rowData[nf] = cachedData[nr]; } } } } else { // get all values separated by values separator if (rowData[0] == null) { rowData[0] = cacheValue; } else { rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue; } } } } return rowData; }
From source file:org.sonar.java.checks.JunitMethodDeclarationCheck.java
@VisibleForTesting protected boolean areVerySimilarStrings(String expected, String actual) { // cut complexity when the strings length difference is bigger than the accepted threshold return (Math.abs(expected.length() - actual.length()) <= MAX_STRING_DISTANCE) && StringUtils.getLevenshteinDistance(expected, actual) < MAX_STRING_DISTANCE; }
From source file:org.talend.dataprep.transformation.actions.text.FuzzyMatching.java
private boolean fuzzyMatches(String value, String reference, int sensitivity) { int levenshteinDistance = StringUtils.getLevenshteinDistance(value, reference); return levenshteinDistance <= sensitivity; }
From source file:org.talend.dataquality.record.linkage.attribute.LevenshteinMatcher.java
public double getWeight(String str1, String str2) { // get the max possible levenstein distance score for string int maxLen = Math.max(str1.length(), str2.length()); // check for 0 maxLen if (maxLen == 0) { return 1.0; // as both strings identically zero length } else {/* w ww .j a v a2 s . c o m*/ final int levenshteinDistance = StringUtils.getLevenshteinDistance(str1, str2); // return actual / possible levenstein distance to get 0-1 range return 1.0 - ((double) levenshteinDistance / maxLen); } }
From source file:org.vivoweb.harvester.score.algorithm.NormalizedLevenshteinDifference.java
@Override public float calculate(CharSequence itemX, CharSequence itemY) { if (itemX == null) { throw new IllegalArgumentException("x cannot be null"); }// w w w. j a v a 2 s . c o m if (itemY == null) { throw new IllegalArgumentException("y cannot be null"); } float maxSize = Math.max(itemX.length(), itemY.length()) / 1f; if (maxSize == 0f) { return 0f; } return ((maxSize - StringUtils.getLevenshteinDistance(itemX.toString(), itemY.toString())) / maxSize); }