List of usage examples for org.apache.commons.lang StringUtils getLevenshteinDistance
public static int getLevenshteinDistance(String s, String t)
Find the Levenshtein distance between two Strings.
From source file:edu.ku.brc.specify.toycode.mexconabio.MexConvToSQL.java
/** * @param str1//from w w w .j a v a 2s. c o m * @param str2 * @param usePeriods * @return */ private double check(final String str1, final String str2, final boolean usePeriods) { if (str1 == null || str2 == null) return 0.0; String longStr; String shortStr; if (str1.length() > str2.length()) { longStr = str1; shortStr = str2; } else { longStr = str2; shortStr = str1; } longStr = clean(longStr, usePeriods); shortStr = clean(shortStr, usePeriods); int score = 0; String[] longToks = StringUtils.split(longStr, " "); String[] shortToks = StringUtils.split(shortStr, " "); for (String sStr : shortToks) { for (String lStr : longToks) { if (lStr.equals(sStr) || (sStr.length() > 2 && StringUtils.getLevenshteinDistance(lStr, sStr) < 3)) score += 1; } } double rating = (double) score / (double) shortToks.length * 100.0; //System.out.println(" L: "+shortToks.length+" Sc: "+score+" Rating: "+String.format("%5.2f", rating)); return rating; }
From source file:di.uniba.it.wsd.RevisedLesk.java
/**
 * Computes a similarity score from the Levenshtein distance of two strings.
 *
 * @param s1 first string, must not be {@code null}
 * @param s2 second string, must not be {@code null}
 * @return {@code 1 - distance / max(length)}: 1.0 for identical strings,
 *         0.0 when every position differs
 */
private float computeLDscore(String s1, String s2) {
    final int maxLength = Math.max(s1.length(), s2.length());
    if (maxLength == 0) {
        // Two empty strings are identical; dividing 0 by 0 would return NaN.
        return 1.0f;
    }
    final float ld = StringUtils.getLevenshteinDistance(s1, s2);
    return 1 - ld / maxLength;
}
From source file:edu.ku.brc.specify.toycode.mexconabio.AnalysisBase.java
/** * @param refRow/*from w w w . j a v a 2s. c o m*/ * @param compareRow * @return */ public int score(final Object[] refRow, final Object[] compareRow) { //Calendar cal = Calendar.getInstance(); if (maxScore == 0) { calcMaxScore(); } //String refCatNum = (String)refRow[CATNUM_INX]; String refCollectorNum = (String) refRow[COLNUM_INX]; String refGenus = (String) refRow[GENUS_INX]; String refSpecies = (String) refRow[SPECIES_INX]; String refSubSpecies = (String) refRow[SUBSPECIES_INX]; String refCollector = (String) refRow[COLLECTOR_INX]; String refLocality = (String) refRow[LOCALITY_INX]; //String refLatitude = (String)refRow[LATITUDE_INX]; //String refLongitude = (String)refRow[LONGITUDE_INX]; String refYear = (String) refRow[YEAR_INX]; String refMon = (String) refRow[MON_INX]; String refDay = (String) refRow[DAY_INX]; String refCountry = (String) refRow[COUNTRY_INX]; String refState = (String) refRow[STATE_INX]; //String catNum = (String)compareRow[CATNUM_INX]; String collectorNum = (String) compareRow[COLNUM_INX]; String genus = (String) compareRow[GENUS_INX]; String species = (String) compareRow[SPECIES_INX]; String subSpecies = (String) compareRow[SUBSPECIES_INX]; String collector = (String) compareRow[COLLECTOR_INX]; String locality = (String) compareRow[LOCALITY_INX]; //String latitude = (String)compareRow[LATITUDE_INX]; //String longitude = (String)compareRow[LONGITUDE_INX]; String year = (String) compareRow[YEAR_INX]; String mon = (String) compareRow[MON_INX]; String day = (String) compareRow[DAY_INX]; String country = (String) compareRow[COUNTRY_INX]; String state = (String) compareRow[STATE_INX]; if (country != null && country.toLowerCase().equals("mexico")) { country = "Mxico"; } /* String refCollector = null; switch (numFieldForName) { case 1 : refLastNameF = getStr(gRS, inx++); refCollector = refLastNameF; break; case 2 : refLastNameF = getStr(gRS, inx++); refFirstName = getStr(gRS, inx++); refCollector = String.format("%s %s", 
(StringUtils.isNotEmpty(refLastNameF) ? refLastNameF : ""), (StringUtils.isNotEmpty(refFirstName) ? refFirstName : "")).trim(); break; case 3 : refLastNameF = getStr(gRS, inx++); refLastNameM = getStr(gRS, inx++); refFirstName = getStr(gRS, inx++); refCollector = String.format("%s %s %s", (StringUtils.isNotEmpty(refLastNameF) ? refLastNameF : ""), (StringUtils.isNotEmpty(refLastNameM) ? refLastNameM : ""), (StringUtils.isNotEmpty(refFirstName) ? refFirstName : "")).trim(); break; } */ if (refCountry != null && refCountry.toLowerCase().equals("mexico")) { refCountry = "Mxico"; } if (StringUtils.isNotEmpty(refCollector)) { refCollector = StringUtils.replaceChars(refCollector, ',', ' '); } if (StringUtils.isNotEmpty(collector)) { collector = StringUtils.replaceChars(collector, ',', ' '); } int score = 0; score += compareDate(year, mon, day, refYear, refMon, refDay); String refCollectorStr = refCollector != null ? refCollector.toLowerCase() : ""; String collectorStr = collector != null ? collector.toLowerCase() : ""; String refLocStr = refLocality != null ? refLocality.toLowerCase() : ""; String locStr = locality != null ? 
locality.toLowerCase() : ""; double ratingLoc = longStringCompare(refLocStr, locStr, false, true); double ratingColtr = longStringCompare(refCollectorStr, collectorStr, false, true); if (ratingLoc > 50.0) { score += 10; tdColorCodes[LOCALITY_INX] = BGR; } else if (ratingLoc > 30.0) { score += 6; tdColorCodes[LOCALITY_INX] = GR; } else if (ratingLoc > 0.0) { score += 3; tdColorCodes[LOCALITY_INX] = YW; } if (ratingColtr > 50.0) { score += 10; tdColorCodes[COLLECTOR_INX] = BGR; } else if (ratingColtr > 30.0) { score += 6; tdColorCodes[COLLECTOR_INX] = GR; } else if (ratingColtr > 0.0) { score += 3; tdColorCodes[COLLECTOR_INX] = YW; } boolean genusMatches = false; if (refGenus != null && genus != null) { if (refGenus.equals(genus)) { score += 15; genusMatches = true; tdColorCodes[GENUS_INX] = GR; } else if (StringUtils.getLevenshteinDistance(genus, refGenus) < 3) { score += 7; tdColorCodes[GENUS_INX] = YW; } } boolean speciesMatches = false; if (refSpecies != null && species != null) { if (refSpecies.equals(species)) { score += 20; speciesMatches = true; if (genusMatches) { tdColorCodes[GENUS_INX] = BGR; tdColorCodes[SPECIES_INX] = BGR; } else { tdColorCodes[SPECIES_INX] = GR; } } else if (StringUtils.getLevenshteinDistance(species, refSpecies) < 3) { score += 10; tdColorCodes[SPECIES_INX] = YW; } } if (refSubSpecies != null && subSpecies != null) { if (refSubSpecies.equals(subSpecies)) { score += 20; if (genusMatches && speciesMatches) { tdColorCodes[GENUS_INX] = BGR; tdColorCodes[SPECIES_INX] = BGR; tdColorCodes[SUBSPECIES_INX] = BGR; } else if (speciesMatches) { tdColorCodes[SPECIES_INX] = BGR; tdColorCodes[SUBSPECIES_INX] = BGR; } else { tdColorCodes[SUBSPECIES_INX] = GR; } } else if (StringUtils.getLevenshteinDistance(subSpecies, refSubSpecies) < 3) { score += 10; tdColorCodes[SUBSPECIES_INX] = YW; } } if (refCountry != null && country != null) { refCountry = refCountry.toLowerCase(); country = country.toLowerCase(); if (refCountry.equals(country)) { score += 10; 
tdColorCodes[COUNTRY_INX] = BGR; } else if (StringUtils.getLevenshteinDistance(country, refCountry) < 3) { score += 5; tdColorCodes[COUNTRY_INX] = YW; } } if (refState != null && state != null) { refState = refState.toLowerCase(); state = state.toLowerCase(); if (refState.equals(state)) { score += 10; tdColorCodes[STATE_INX] = BGR; } else if (StringUtils.getLevenshteinDistance(state, refState) < 3) { score += 5; tdColorCodes[STATE_INX] = YW; } } /*if (refGenus != null && species != null && refGenus.equals(species)) { cCls[3] = DF; cCls[2] = DF; score += 10; } if (refSpecies != null && genus != null && refSpecies.equals(genus)) { cCls[2] = DF; cCls[3] = DF; score += 15; }*/ if (collectorNum != null && refCollectorNum != null && collectorNum.equals(refCollectorNum)) { score += 10; tdColorCodes[COLNUM_INX] = BGR; } return score; }
From source file:edu.ku.brc.specify.toycode.mexconabio.AnalysisBase.java
/** * @param str1//from w w w . j av a 2 s .c o m * @param str2 * @param usePeriods * @return */ protected double longStringCompare(final String str1, final String str2, final boolean usePeriods, final boolean replaceWithSpaces) { if (str1 == null || str2 == null) return 0.0; String longStr; String shortStr; if (str1.length() >= str2.length()) { longStr = str1; shortStr = str2; } else { longStr = str2; shortStr = str1; } longStr = cleanString(longStr, usePeriods, replaceWithSpaces); shortStr = cleanString(shortStr, usePeriods, replaceWithSpaces); int score = 0; String[] longToks = StringUtils.split(longStr, " "); String[] shortToks = StringUtils.split(shortStr, " "); for (String sStr : shortToks) { for (String lStr : longToks) { if (lStr.equals(sStr) || (sStr.length() > 2 && StringUtils.getLevenshteinDistance(lStr, sStr) < 3)) score += 1; } } double rating = (double) score / (double) shortToks.length * 100.0; //System.out.println(" L: "+shortToks.length+" Sc: "+score+" Rating: "+String.format("%5.2f", rating)); return rating; }
From source file:com.ntua.cosmos.hackathonplanneraas.Planner.java
@Deprecated private String getPOSTBody(JSONObject obj) { String body = "{}"; StoredPaths pan = new StoredPaths(); JSONObject returned = new JSONObject(); ArrayList<String> names = new ArrayList<>(); ArrayList<String> values = new ArrayList<>(); String originalString = ""; double similarity = 0.0, current = 0.0; if (!obj.isEmpty()) { Set keys = obj.keySet();// w w w. j a v a 2s. c om Iterator iter = keys.iterator(); for (; iter.hasNext();) { String temporary = String.valueOf(iter.next()); if (temporary.startsWith("has")) names.add(temporary); } names.stream().forEach((name) -> { values.add(String.valueOf(obj.get(name))); }); } originalString = values.stream().map((value) -> value).reduce(originalString, String::concat); //Begin the initialisation process. OntModelSpec s = new OntModelSpec(PelletReasonerFactory.THE_SPEC); //s.setReasoner(PelletReasonerFactory.theInstance().create()); OntDocumentManager dm = OntDocumentManager.getInstance(); dm.setFileManager(FileManager.get()); s.setDocumentManager(dm); OntModel m = ModelFactory.createOntologyModel(s, null); //OntModel m = ModelFactory.createOntologyModel( PelletReasonerFactory.THE_SPEC ); InputStream in = FileManager.get().open(StoredPaths.casebasepath); if (in == null) { throw new IllegalArgumentException("File: " + StoredPaths.casebasepath + " not found"); } // read the file m.read(in, null); //begin building query string. String queryString = pan.prefixrdf + pan.prefixowl + pan.prefixxsd + pan.prefixrdfs + pan.prefixCasePath; queryString += "\nSELECT DISTINCT "; for (int i = 0; i < names.size(); i++) { queryString += "?param" + i + " "; } queryString += "?message ?handle ?URI ?body WHERE {"; for (int i = 0; i < names.size(); i++) { queryString += "?event base:" + names.get(i) + " ?param" + i + " . "; } queryString += "?event base:isSolvedBy ?solution . ?solution base:exposesMessage ?message . ?solution base:eventHandledBy ?handle . ?solution base:URI ?URI . 
?solution base:hasPOSTBody ?body}"; try { String testString = ""; Query query = QueryFactory.create(queryString); //System.out.println(String.valueOf(query)); QueryExecution qe = QueryExecutionFactory.create(query, m); ResultSet results = qe.execSelect(); for (; results.hasNext();) { QuerySolution soln = results.nextSolution(); // Access variables: soln.get("x"); Literal lit; for (int i = 0; i < names.size(); i++) { lit = soln.getLiteral("param" + i);// Get a result variable by name. String temporary = String.valueOf(lit).substring(0, String.valueOf(lit).indexOf("^^")); testString += temporary; } String longer = testString, shorter = originalString; if (testString.length() < originalString.length()) { // longer should always have greater length longer = originalString; shorter = testString; } int longerLength = longer.length(); current = (longerLength - StringUtils.getLevenshteinDistance(longer, shorter)) / (double) longerLength; if (similarity < current) { similarity = current; returned.clear(); returned.put("message", soln.getLiteral("message").getString()); returned.put("URI", soln.getLiteral("URI").getString()); returned.put("handle", soln.getLiteral("handle").getString()); body = soln.getLiteral("body").getString(); } } } catch (Exception e) { System.out.println("Search is interrupted by an error."); } m.close(); return body; }
From source file:au.org.ala.names.search.ALANameSearcher.java
/** * Uses the distance between 2 strings to determine whether or not the * 2 strings are a close match./*from ww w . ja v a 2 s. c om*/ * * @param s1 * @param s2 * @param maxLengthDif The maximum differences in length that the 2 strings can be * @param maxDist The maximum distance between the 2 strings * @return */ private boolean isCloseMatch(String s1, String s2, int maxLengthDif, int maxDist) { if (s1 != null && s2 != null && Math.abs(s1.length() - s2.length()) <= maxLengthDif) { //if the difference in the length of the 2 strings is at the most maxLengthDif characters compare the L distance //log.debug("Difference ("+s1 + ", " + s2+") : " + StringUtils.getLevenshteinDistance(s1, s2)); return StringUtils.getLevenshteinDistance(s1, s2) <= maxDist; } return false; }
From source file:edu.ku.brc.specify.Specify.java
/** * //from w w w. j av a 2s . com */ protected void localities() { Connection connection = DBConnection.getInstance().getConnection(); Statement stmt = null; Statement stmt2 = null; try { stmt = connection.createStatement(); stmt2 = connection.createStatement(); String sql = "SELECT l.LocalityID, l.LocalityName, g.FullName, l.Latitude1, l.Longitude1 FROM locality l INNER JOIN geography g ON l.GeographyID = g.GeographyID"; ResultSet rs = stmt.executeQuery(sql); while (rs.next()) { String currLocalityName = rs.getString(2); ResultSet rs1 = stmt2.executeQuery(sql + " WHERE g.FullName = \"" + rs.getString(3) + "\" AND l.LocalityID <> " + rs.getInt(1)); while (rs1.next()) { String localityName = rs1.getString(2); int distance = StringUtils.getLevenshteinDistance(currLocalityName, localityName); //log.error(rs.getInt(1) + " "+ rs1.getInt(1) + " "+ distance); if (distance < 6) { log.error("----- " + distance + "\n" + currLocalityName + "\n" + localityName); log.error(rs.getBigDecimal(4) + "," + rs.getBigDecimal(5) + "\n" + rs1.getBigDecimal(4) + "," + rs1.getBigDecimal(5)); } } rs1.close(); } rs.close(); } catch (SQLException ex) { ex.printStackTrace(); } finally { try { if (stmt != null) { stmt.close(); } if (stmt2 != null) { stmt2.close(); } } catch (SQLException ex) { ex.printStackTrace(); } } }
From source file:net.sourceforge.subsonic.controller.MainController.java
private boolean isMultipleArtists(List<MediaFile> children) { // Collect unique artist names. Set<String> artists = new HashSet<String>(); for (MediaFile child : children) { if (child.getArtist() != null) { artists.add(child.getArtist().toLowerCase()); }//from ww w.j a va2s .c om } // If zero or one artist, it is definitely not multiple artists. if (artists.size() < 2) { return false; } // Fuzzily compare artist names, allowing for some differences in spelling, whitespace etc. List<String> artistList = new ArrayList<String>(artists); for (String artist : artistList) { if (StringUtils.getLevenshteinDistance(artist, artistList.get(0)) > 3) { return true; } } return false; }
From source file:org.apache.nutch.metadata.SpellCheckedMetadata.java
/** * Get the normalized name of metadata attribute name. This method tries to * find a well-known metadata name (one of the metadata names defined in this * class) that matches the specified name. The matching is error tolerent. For * instance,/*from w w w.j a v a 2 s . com*/ * <ul> * <li>content-type gives Content-Type</li> * <li>CoNtEntType gives Content-Type</li> * <li>ConTnTtYpe gives Content-Type</li> * </ul> * If no matching with a well-known metadata name is found, then the original * name is returned. * * @param name * Name to normalize * @return normalized name */ public static String getNormalizedName(final String name) { String searched = normalize(name); String value = NAMES_IDX.get(searched); if ((value == null) && (normalized != null)) { int threshold = Math.min(3, searched.length() / TRESHOLD_DIVIDER); for (int i = 0; i < normalized.length && value == null; i++) { if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) { value = NAMES_IDX.get(normalized[i]); } } } return (value != null) ? value : name; }
From source file:org.apache.tika.parser.geo.topic.GeoNameResolver.java
/** * Select the best match for each location name extracted from a document, * choosing from among a list of lists of candidate matches. Filter uses the * following features: 1) edit distance between name and the resolved name, * choose smallest one 2) content (haven't implemented) * //from ww w . jav a 2 s. c om * @param resolvedEntities * final result for the input stream * @param allCandidates * each location name may hits several documents, this is the * collection for all hitted documents * @throws IOException * @throws RuntimeException */ private void pickBestCandidates(HashMap<String, ArrayList<String>> resolvedEntities, HashMap<String, ArrayList<ArrayList<String>>> allCandidates) { for (String extractedName : allCandidates.keySet()) { ArrayList<ArrayList<String>> cur = allCandidates.get(extractedName); int minDistance = Integer.MAX_VALUE, minIndex = -1; for (int i = 0; i < cur.size(); ++i) { String resolvedName = cur.get(i).get(0);// get cur's ith // resolved entry's name int distance = StringUtils.getLevenshteinDistance(extractedName, resolvedName); if (distance < minDistance) { minDistance = distance; minIndex = i; } } if (minIndex == -1) continue; resolvedEntities.put(extractedName, cur.get(minIndex)); } }