Example usage for org.apache.commons.lang StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang StringUtils getLevenshteinDistance

Introduction

This page collects example usages of org.apache.commons.lang.StringUtils.getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(String s, String t) 

Source Link

Document

Find the Levenshtein distance between two Strings.

Usage

From source file:edu.ku.brc.specify.toycode.mexconabio.MexConvToSQL.java

/**
 * Rates how well the space-separated tokens of the shorter input are matched by the tokens
 * of the longer input.
 *
 * <p>Both inputs are passed through {@code clean(...)} and then split on spaces. A short token
 * counts as matched by a long token when the two are equal, or when the short token has more
 * than two characters and their Levenshtein distance is below 3. Every matching pair adds one
 * point, so a short token that matches several long tokens is counted several times and the
 * result can exceed 100.
 *
 * @param str1 first string to compare; may be {@code null}
 * @param str2 second string to compare; may be {@code null}
 * @param usePeriods forwarded unchanged to {@code clean(...)}
 * @return matched pairs divided by the number of short tokens, times 100; {@code 0.0} when
 *         either input is {@code null} or either side yields no tokens
 */
private double check(final String str1, final String str2, final boolean usePeriods) {
    if (str1 == null || str2 == null) {
        return 0.0;
    }

    // Strict '>': when both lengths are equal, str2 plays the role of the longer string.
    final boolean firstIsLonger = str1.length() > str2.length();
    final String longStr = clean(firstIsLonger ? str1 : str2, usePeriods);
    final String shortStr = clean(firstIsLonger ? str2 : str1, usePeriods);

    final String[] longToks = StringUtils.split(longStr, " ");
    final String[] shortToks = StringUtils.split(shortStr, " ");

    // Without this guard an empty token list yields 0 / 0.0 = NaN, and a null array
    // (StringUtils.split returns null for null input) would throw in the loops below.
    if (longToks == null || shortToks == null || shortToks.length == 0) {
        return 0.0;
    }

    int score = 0;
    for (String sStr : shortToks) {
        for (String lStr : longToks) {
            if (lStr.equals(sStr) || (sStr.length() > 2 && StringUtils.getLevenshteinDistance(lStr, sStr) < 3)) {
                score += 1;
            }
        }
    }

    return (double) score / (double) shortToks.length * 100.0;
}

From source file:di.uniba.it.wsd.RevisedLesk.java

/**
 * Converts the Levenshtein distance between two strings into a similarity value:
 * {@code 1 - distance / max(length of s1, length of s2)}.
 *
 * @param s1 first string; must not be {@code null}
 * @param s2 second string; must not be {@code null}
 * @return {@code 1} for equal strings (including two empty strings), decreasing towards
 *         {@code 0} as the edit distance approaches the length of the longer string
 */
private float computeLDscore(String s1, String s2) {
    int maxLength = Math.max(s1.length(), s2.length());
    if (maxLength == 0) {
        // Both strings are empty, hence identical; dividing would give 0f / 0f = NaN.
        return 1f;
    }
    float ld = (float) StringUtils.getLevenshteinDistance(s1, s2);
    return 1 - ld / maxLength;
}

From source file:edu.ku.brc.specify.toycode.mexconabio.AnalysisBase.java

/**
 * Scores how closely a candidate row matches a reference row by adding points for every
 * field that agrees exactly or approximately.
 *
 * <p>Fields are read from both rows by the {@code *_INX} index constants and cast to
 * {@code String}. As a side effect, entries of {@code tdColorCodes} are set for the fields
 * that matched; later checks may overwrite the codes chosen by earlier ones.
 *
 * <p>Points awarded:
 * <ul>
 * <li>date: whatever {@code compareDate} returns</li>
 * <li>locality and collector (each, by {@code longStringCompare} rating):
 *     above 50 gives 10, above 30 gives 6, above 0 gives 3</li>
 * <li>genus: exact 15, Levenshtein distance below 3 gives 7</li>
 * <li>species and subspecies (each): exact 20, distance below 3 gives 10</li>
 * <li>country and state (each, compared in lower case): exact 10, distance below 3 gives 5</li>
 * <li>collector number: exact 10</li>
 * </ul>
 *
 * @param refRow the reference row
 * @param compareRow the row being compared against the reference
 * @return the accumulated score
 */
public int score(final Object[] refRow, final Object[] compareRow) {
    //Calendar cal = Calendar.getInstance();

    // Compute the maximum score on first use (0 is treated as "not computed yet").
    if (maxScore == 0) {
        calcMaxScore();
    }

    // Fields of the reference row.
    //String refCatNum       = (String)refRow[CATNUM_INX];
    String refCollectorNum = (String) refRow[COLNUM_INX];
    String refGenus = (String) refRow[GENUS_INX];
    String refSpecies = (String) refRow[SPECIES_INX];
    String refSubSpecies = (String) refRow[SUBSPECIES_INX];
    String refCollector = (String) refRow[COLLECTOR_INX];
    String refLocality = (String) refRow[LOCALITY_INX];
    //String refLatitude     = (String)refRow[LATITUDE_INX];
    //String refLongitude    = (String)refRow[LONGITUDE_INX];
    String refYear = (String) refRow[YEAR_INX];
    String refMon = (String) refRow[MON_INX];
    String refDay = (String) refRow[DAY_INX];
    String refCountry = (String) refRow[COUNTRY_INX];
    String refState = (String) refRow[STATE_INX];

    // The same fields of the row being compared.
    //String catNum       = (String)compareRow[CATNUM_INX];
    String collectorNum = (String) compareRow[COLNUM_INX];
    String genus = (String) compareRow[GENUS_INX];
    String species = (String) compareRow[SPECIES_INX];
    String subSpecies = (String) compareRow[SUBSPECIES_INX];
    String collector = (String) compareRow[COLLECTOR_INX];
    String locality = (String) compareRow[LOCALITY_INX];
    //String latitude     = (String)compareRow[LATITUDE_INX];
    //String longitude    = (String)compareRow[LONGITUDE_INX];
    String year = (String) compareRow[YEAR_INX];
    String mon = (String) compareRow[MON_INX];
    String day = (String) compareRow[DAY_INX];
    String country = (String) compareRow[COUNTRY_INX];
    String state = (String) compareRow[STATE_INX];

    // Normalize the unaccented spelling of the country so both rows use the same value.
    // NOTE(review): "Mxico" looks like "México" with the accented character lost somewhere
    // along the way - confirm the intended literal against the original source encoding.
    if (country != null && country.toLowerCase().equals("mexico")) {
        country = "Mxico";
    }

    /*
    String refCollector = null;
    switch (numFieldForName)
    {
    case 1 :
        refLastNameF  = getStr(gRS, inx++);
        refCollector  = refLastNameF;
        break;
                
    case 2 :
        refLastNameF  = getStr(gRS, inx++);
        refFirstName  = getStr(gRS, inx++);
        refCollector = String.format("%s %s", (StringUtils.isNotEmpty(refLastNameF) ? refLastNameF : ""), (StringUtils.isNotEmpty(refFirstName) ? refFirstName : "")).trim();
        break;
                
    case 3 :
        refLastNameF  = getStr(gRS, inx++);
        refLastNameM  = getStr(gRS, inx++);
        refFirstName  = getStr(gRS, inx++);
        refCollector = String.format("%s %s %s", (StringUtils.isNotEmpty(refLastNameF) ? refLastNameF : ""), (StringUtils.isNotEmpty(refLastNameM) ? refLastNameM : ""), (StringUtils.isNotEmpty(refFirstName) ? refFirstName : "")).trim();
        break;
    }
    */

    // Same country normalization for the reference row.
    if (refCountry != null && refCountry.toLowerCase().equals("mexico")) {
        refCountry = "Mxico";
    }

    // Replace commas in collector names with spaces before they are compared.
    if (StringUtils.isNotEmpty(refCollector)) {
        refCollector = StringUtils.replaceChars(refCollector, ',', ' ');
    }
    if (StringUtils.isNotEmpty(collector)) {
        collector = StringUtils.replaceChars(collector, ',', ' ');
    }

    int score = 0;

    score += compareDate(year, mon, day, refYear, refMon, refDay);

    // Lower-case the free-text fields; a missing value becomes the empty string.
    String refCollectorStr = refCollector != null ? refCollector.toLowerCase() : "";
    String collectorStr = collector != null ? collector.toLowerCase() : "";
    String refLocStr = refLocality != null ? refLocality.toLowerCase() : "";
    String locStr = locality != null ? locality.toLowerCase() : "";

    double ratingLoc = longStringCompare(refLocStr, locStr, false, true);
    double ratingColtr = longStringCompare(refCollectorStr, collectorStr, false, true);

    // Locality: three tiers by rating; a rating of 0 (or less) adds nothing.
    if (ratingLoc > 50.0) {
        score += 10;
        tdColorCodes[LOCALITY_INX] = BGR;

    } else if (ratingLoc > 30.0) {
        score += 6;
        tdColorCodes[LOCALITY_INX] = GR;

    } else if (ratingLoc > 0.0) {
        score += 3;
        tdColorCodes[LOCALITY_INX] = YW;
    }

    // Collector: same three tiers as locality.
    if (ratingColtr > 50.0) {
        score += 10;
        tdColorCodes[COLLECTOR_INX] = BGR;

    } else if (ratingColtr > 30.0) {
        score += 6;
        tdColorCodes[COLLECTOR_INX] = GR;

    } else if (ratingColtr > 0.0) {
        score += 3;
        tdColorCodes[COLLECTOR_INX] = YW;
    }

    // Genus: exact (case-sensitive) match or a near match by edit distance.
    boolean genusMatches = false;
    if (refGenus != null && genus != null) {
        if (refGenus.equals(genus)) {
            score += 15;
            genusMatches = true;
            tdColorCodes[GENUS_INX] = GR;

        } else if (StringUtils.getLevenshteinDistance(genus, refGenus) < 3) {
            score += 7;
            tdColorCodes[GENUS_INX] = YW;
        }
    }

    // Species: an exact match together with an exact genus match upgrades both color codes.
    boolean speciesMatches = false;
    if (refSpecies != null && species != null) {
        if (refSpecies.equals(species)) {
            score += 20;
            speciesMatches = true;
            if (genusMatches) {
                tdColorCodes[GENUS_INX] = BGR;
                tdColorCodes[SPECIES_INX] = BGR;
            } else {
                tdColorCodes[SPECIES_INX] = GR;
            }
        } else if (StringUtils.getLevenshteinDistance(species, refSpecies) < 3) {
            score += 10;
            tdColorCodes[SPECIES_INX] = YW;
        }
    }

    // Subspecies: the color codes chosen depend on which of genus/species also matched exactly.
    if (refSubSpecies != null && subSpecies != null) {
        if (refSubSpecies.equals(subSpecies)) {
            score += 20;
            if (genusMatches && speciesMatches) {
                tdColorCodes[GENUS_INX] = BGR;
                tdColorCodes[SPECIES_INX] = BGR;
                tdColorCodes[SUBSPECIES_INX] = BGR;

            } else if (speciesMatches) {
                tdColorCodes[SPECIES_INX] = BGR;
                tdColorCodes[SUBSPECIES_INX] = BGR;
            } else {
                tdColorCodes[SUBSPECIES_INX] = GR;
            }

        } else if (StringUtils.getLevenshteinDistance(subSpecies, refSubSpecies) < 3) {
            score += 10;
            tdColorCodes[SUBSPECIES_INX] = YW;
        }
    }

    // Country: compared in lower case.
    if (refCountry != null && country != null) {
        refCountry = refCountry.toLowerCase();
        country = country.toLowerCase();

        if (refCountry.equals(country)) {
            score += 10;
            tdColorCodes[COUNTRY_INX] = BGR;

        } else if (StringUtils.getLevenshteinDistance(country, refCountry) < 3) {
            score += 5;
            tdColorCodes[COUNTRY_INX] = YW;
        }
    }

    // State: compared in lower case.
    if (refState != null && state != null) {
        refState = refState.toLowerCase();
        state = state.toLowerCase();

        if (refState.equals(state)) {
            score += 10;
            tdColorCodes[STATE_INX] = BGR;

        } else if (StringUtils.getLevenshteinDistance(state, refState) < 3) {
            score += 5;
            tdColorCodes[STATE_INX] = YW;
        }
    }

    /*if (refGenus != null && species != null && refGenus.equals(species))     
    {
    cCls[3] = DF;
    cCls[2] = DF;                        
    score += 10;
    }
            
    if (refSpecies != null && genus != null && refSpecies.equals(genus))
    {
    cCls[2] = DF;
    cCls[3] = DF;
    score += 15;
    }*/

    // Collector number: exact match only, no fuzzy comparison.
    if (collectorNum != null && refCollectorNum != null && collectorNum.equals(refCollectorNum)) {
        score += 10;
        tdColorCodes[COLNUM_INX] = BGR;
    }

    return score;
}

From source file:edu.ku.brc.specify.toycode.mexconabio.AnalysisBase.java

/**
 * Rates how well the space-separated tokens of the shorter input are matched by the tokens
 * of the longer input.
 *
 * <p>Both inputs are passed through {@code cleanString(...)} and then split on spaces. A short
 * token counts as matched by a long token when the two are equal, or when the short token has
 * more than two characters and their Levenshtein distance is below 3. Every matching pair adds
 * one point, so a short token that matches several long tokens is counted several times and
 * the result can exceed 100.
 *
 * @param str1 first string to compare; may be {@code null}
 * @param str2 second string to compare; may be {@code null}
 * @param usePeriods forwarded unchanged to {@code cleanString(...)}
 * @param replaceWithSpaces forwarded unchanged to {@code cleanString(...)}
 * @return matched pairs divided by the number of short tokens, times 100; {@code 0.0} when
 *         either input is {@code null} or either side yields no tokens
 */
protected double longStringCompare(final String str1, final String str2, final boolean usePeriods,
        final boolean replaceWithSpaces) {
    if (str1 == null || str2 == null) {
        return 0.0;
    }

    // '>=': when both lengths are equal, str1 plays the role of the longer string.
    final boolean firstIsLonger = str1.length() >= str2.length();
    final String longStr = cleanString(firstIsLonger ? str1 : str2, usePeriods, replaceWithSpaces);
    final String shortStr = cleanString(firstIsLonger ? str2 : str1, usePeriods, replaceWithSpaces);

    final String[] longToks = StringUtils.split(longStr, " ");
    final String[] shortToks = StringUtils.split(shortStr, " ");

    // Without this guard an empty token list yields 0 / 0.0 = NaN, and a null array
    // (StringUtils.split returns null for null input) would throw in the loops below.
    if (longToks == null || shortToks == null || shortToks.length == 0) {
        return 0.0;
    }

    int score = 0;
    for (String sStr : shortToks) {
        for (String lStr : longToks) {
            if (lStr.equals(sStr) || (sStr.length() > 2 && StringUtils.getLevenshteinDistance(lStr, sStr) < 3)) {
                score += 1;
            }
        }
    }

    return (double) score / (double) shortToks.length * 100.0;
}

From source file:com.ntua.cosmos.hackathonplanneraas.Planner.java

/**
 * Returns the stored POST body of the case-base solution whose parameter values are most
 * similar to the values of the given request object.
 *
 * <p>Every key of {@code obj} that starts with {@code "has"} becomes a query parameter. The
 * case base at {@code StoredPaths.casebasepath} is loaded into an ontology model and queried
 * for events carrying all of those parameters. For each result, the parameter values are
 * concatenated and compared with the concatenated request values using a similarity of
 * {@code (longerLength - levenshteinDistance) / longerLength}; the body of the result with the
 * highest similarity is returned.
 *
 * @param obj request object whose {@code has*} entries describe the case to look up
 * @return the best-matching stored body, or {@code "{}"} when nothing matches or the search fails
 * @throws IllegalArgumentException if the case-base file cannot be opened
 */
@Deprecated
private String getPOSTBody(JSONObject obj) {
    String body = "{}";
    StoredPaths pan = new StoredPaths();

    // Collect the "has*" keys and the concatenation of their values, in key-set order.
    ArrayList<String> names = new ArrayList<>();
    StringBuilder originalBuilder = new StringBuilder();
    for (Object key : obj.keySet()) {
        String name = String.valueOf(key);
        if (name.startsWith("has")) {
            names.add(name);
            originalBuilder.append(String.valueOf(obj.get(name)));
        }
    }
    String originalString = originalBuilder.toString();

    //Begin the initialisation process.
    OntModelSpec s = new OntModelSpec(PelletReasonerFactory.THE_SPEC);
    OntDocumentManager dm = OntDocumentManager.getInstance();
    dm.setFileManager(FileManager.get());
    s.setDocumentManager(dm);
    OntModel m = ModelFactory.createOntologyModel(s, null);
    try {
        InputStream in = FileManager.get().open(StoredPaths.casebasepath);
        if (in == null) {
            throw new IllegalArgumentException("File: " + StoredPaths.casebasepath + " not found");
        }
        try {
            // read the file
            m.read(in, null);
        } finally {
            try {
                in.close();
            } catch (java.io.IOException ignored) {
                // The model has already been read; a failure to close the stream is not actionable.
            }
        }

        //begin building query string.
        StringBuilder queryString = new StringBuilder(
                pan.prefixrdf + pan.prefixowl + pan.prefixxsd + pan.prefixrdfs + pan.prefixCasePath);
        queryString.append("\nSELECT DISTINCT ");
        for (int i = 0; i < names.size(); i++) {
            queryString.append("?param").append(i).append(" ");
        }
        queryString.append("?message ?handle ?URI ?body WHERE {");
        for (int i = 0; i < names.size(); i++) {
            queryString.append("?event base:").append(names.get(i)).append(" ?param").append(i).append(" . ");
        }
        queryString.append(
                "?event base:isSolvedBy ?solution . ?solution base:exposesMessage ?message . ?solution base:eventHandledBy ?handle . ?solution base:URI ?URI . ?solution base:hasPOSTBody ?body}");

        double similarity = 0.0;
        try {
            Query query = QueryFactory.create(queryString.toString());
            QueryExecution qe = QueryExecutionFactory.create(query, m);
            try {
                ResultSet results = qe.execSelect();
                while (results.hasNext()) {
                    QuerySolution soln = results.nextSolution();

                    // Build the comparison string for THIS solution only; carrying text over
                    // from earlier solutions would distort every later similarity value.
                    StringBuilder candidate = new StringBuilder();
                    for (int i = 0; i < names.size(); i++) {
                        String rendered = String.valueOf(soln.getLiteral("param" + i));
                        // Drop the "^^datatype" suffix when present; keep the text as-is otherwise.
                        int typeMarker = rendered.indexOf("^^");
                        candidate.append(typeMarker >= 0 ? rendered.substring(0, typeMarker) : rendered);
                    }
                    String testString = candidate.toString();

                    String longer = testString, shorter = originalString;
                    if (testString.length() < originalString.length()) { // longer should always have greater length
                        longer = originalString;
                        shorter = testString;
                    }
                    int longerLength = longer.length();
                    if (longerLength == 0) {
                        // Nothing to compare; dividing would give NaN, which can never win anyway.
                        continue;
                    }
                    double current = (longerLength - StringUtils.getLevenshteinDistance(longer, shorter))
                            / (double) longerLength;
                    if (similarity < current) {
                        similarity = current;
                        body = soln.getLiteral("body").getString();
                    }
                }
            } finally {
                qe.close();
            }
        } catch (Exception e) {
            // Best effort: keep whatever body was found so far, but report the cause.
            System.out.println("Search is interrupted by an error: " + e);
        }
    } finally {
        m.close();
    }
    return body;
}

From source file:au.org.ala.names.search.ALANameSearcher.java

/**
 * Uses the distance between 2 strings to determine whether or not the
 * 2 strings are a close match./*from  ww  w . ja  v  a  2 s. c om*/
 *
 * @param s1
 * @param s2
 * @param maxLengthDif The maximum differences in length that the 2 strings can  be
 * @param maxDist      The maximum distance between the 2 strings
 * @return
 */
private boolean isCloseMatch(String s1, String s2, int maxLengthDif, int maxDist) {
    if (s1 != null && s2 != null && Math.abs(s1.length() - s2.length()) <= maxLengthDif) {
        //if the difference in the length of the 2 strings is at the most maxLengthDif characters compare the L distance
        //log.debug("Difference ("+s1 + ", " + s2+") : " + StringUtils.getLevenshteinDistance(s1, s2));
        return StringUtils.getLevenshteinDistance(s1, s2) <= maxDist;

    }
    return false;
}

From source file:edu.ku.brc.specify.Specify.java

/**
 * Logs pairs of localities that belong to a geography with the same full name and whose
 * locality names have a Levenshtein distance below 6, together with their coordinates.
 *
 * <p>For every locality, a second query fetches the other localities whose geography full
 * name is the same. SQL errors are logged and end the scan; nothing is returned.
 */
protected void localities() {
    Connection connection = DBConnection.getInstance().getConnection();
    String sql = "SELECT l.LocalityID, l.LocalityName, g.FullName, l.Latitude1, l.Longitude1 FROM locality l INNER JOIN geography g ON l.GeographyID = g.GeographyID";

    Statement stmt = null;
    // Fully qualified so the block does not depend on an import this file may not have.
    java.sql.PreparedStatement siblingStmt = null;
    ResultSet rs = null;
    try {
        stmt = connection.createStatement();
        // Bind the geography name as a parameter instead of splicing it into the SQL text:
        // a name containing a quote would otherwise break (or alter) the statement.
        siblingStmt = connection.prepareStatement(sql + " WHERE g.FullName = ? AND l.LocalityID <> ?");

        rs = stmt.executeQuery(sql);
        while (rs.next()) {
            String currLocalityName = rs.getString(2);
            String geographyName = rs.getString(3);
            if (geographyName == null) {
                // An equality test against NULL matches no rows, so there is nothing to look up.
                continue;
            }

            siblingStmt.setString(1, geographyName);
            siblingStmt.setInt(2, rs.getInt(1));
            ResultSet rs1 = siblingStmt.executeQuery();
            try {
                while (rs1.next()) {
                    String localityName = rs1.getString(2);
                    if (currLocalityName == null || localityName == null) {
                        // getLevenshteinDistance rejects null arguments with an unchecked exception.
                        continue;
                    }
                    int distance = StringUtils.getLevenshteinDistance(currLocalityName, localityName);
                    //log.error(rs.getInt(1) + "  "+ rs1.getInt(1) + "  "+ distance);
                    if (distance < 6) {
                        log.error("----- " + distance + "\n" + currLocalityName + "\n" + localityName);
                        log.error(rs.getBigDecimal(4) + "," + rs.getBigDecimal(5) + "\n" + rs1.getBigDecimal(4)
                                + "," + rs1.getBigDecimal(5));
                    }
                }
            } finally {
                rs1.close();
            }
        }

    } catch (SQLException ex) {
        log.error("Comparing locality names failed", ex);
    } finally {
        // Close each resource independently so one failing close() does not leak the others.
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException ex) {
                log.error("Could not close the locality result set", ex);
            }
        }
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException ex) {
                log.error("Could not close the locality statement", ex);
            }
        }
        if (siblingStmt != null) {
            try {
                siblingStmt.close();
            } catch (SQLException ex) {
                log.error("Could not close the sibling-locality statement", ex);
            }
        }
    }
}

From source file:net.sourceforge.subsonic.controller.MainController.java

/**
 * Tells whether the given media files come from more than one artist, tolerating small
 * spelling differences between artist names.
 *
 * @param children media files to inspect; files without an artist are ignored
 * @return {@code true} when some lower-cased artist name has a Levenshtein distance greater
 *         than 3 from the first collected name; {@code false} otherwise, including when
 *         fewer than two distinct names exist
 */
private boolean isMultipleArtists(List<MediaFile> children) {
    // Gather the distinct artist names, ignoring case.
    Set<String> distinctArtists = new HashSet<String>();
    for (MediaFile mediaFile : children) {
        String artistName = mediaFile.getArtist();
        if (artistName != null) {
            distinctArtists.add(artistName.toLowerCase());
        }
    }

    // With at most one distinct name there cannot be multiple artists.
    if (distinctArtists.size() <= 1) {
        return false;
    }

    // Compare every name with the first one; a large edit distance means a different artist.
    List<String> names = new ArrayList<String>(distinctArtists);
    String reference = names.get(0);
    for (String candidate : names) {
        int editDistance = StringUtils.getLevenshteinDistance(candidate, reference);
        if (editDistance > 3) {
            return true;
        }
    }
    return false;
}

From source file:org.apache.nutch.metadata.SpellCheckedMetadata.java

/**
 * Get the normalized name of metadata attribute name. This method tries to
 * find a well-known metadata name (one of the metadata names defined in this
 * class) that matches the specified name. The matching is error tolerent. For
 * instance,/*from   w w w.j a v  a 2  s  .  com*/
 * <ul>
 * <li>content-type gives Content-Type</li>
 * <li>CoNtEntType gives Content-Type</li>
 * <li>ConTnTtYpe gives Content-Type</li>
 * </ul>
 * If no matching with a well-known metadata name is found, then the original
 * name is returned.
 * 
 * @param name
 *          Name to normalize
 * @return normalized name
 */
public static String getNormalizedName(final String name) {
    String searched = normalize(name);
    String value = NAMES_IDX.get(searched);

    if ((value == null) && (normalized != null)) {
        int threshold = Math.min(3, searched.length() / TRESHOLD_DIVIDER);
        for (int i = 0; i < normalized.length && value == null; i++) {
            if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
                value = NAMES_IDX.get(normalized[i]);
            }
        }
    }
    return (value != null) ? value : name;
}

From source file:org.apache.tika.parser.geo.topic.GeoNameResolver.java

/**
 * Chooses, for every extracted location name, the candidate whose resolved name (the first
 * element of the candidate) has the smallest Levenshtein distance to the extracted name.
 * On a tie the earliest candidate wins. Names without any candidate are left out of the
 * result.
 *
 * @param resolvedEntities
 *            output map that receives the chosen candidate for each extracted name
 * @param allCandidates
 *            for each extracted name, the list of candidate matches to choose from
 */
private void pickBestCandidates(HashMap<String, ArrayList<String>> resolvedEntities,
        HashMap<String, ArrayList<ArrayList<String>>> allCandidates) {

    for (String extractedName : allCandidates.keySet()) {
        ArrayList<String> best = null;
        int bestDistance = Integer.MAX_VALUE;

        for (ArrayList<String> candidate : allCandidates.get(extractedName)) {
            // The resolved entry's name is stored first in each candidate.
            int distance = StringUtils.getLevenshteinDistance(extractedName, candidate.get(0));
            if (distance < bestDistance) {
                bestDistance = distance;
                best = candidate;
            }
        }

        if (best != null) {
            resolvedEntities.put(extractedName, best);
        }
    }
}