List of usage examples for org.apache.commons.lang.StringUtils.getLevenshteinDistance
public static int getLevenshteinDistance(String s, String t)
Find the Levenshtein distance between two Strings.
From source file:pl.edu.icm.coansys.commons.stringsimilarity.EditDistanceSimilarity.java
@Override protected float doCalculate(String s1, String s2) { int levenshteinDistance = StringUtils.getLevenshteinDistance(s1, s2); int maxLength = Math.max(s1.length(), s2.length()); int normalizedLength; if (maxNormalizedStringLength > 0) { double factor = 2.0 * maxNormalizedStringLength / Math.PI; normalizedLength = (int) Math.round(factor * Math.atan((double) maxLength / factor)); } else {//from w ww. ja va 2s. c om normalizedLength = maxLength; } if (maxLength == 0) { return 1.0f; } else if (levenshteinDistance > disapproveLevel * normalizedLength) { return 0.0f; } else if (levenshteinDistance < approveLevel * normalizedLength) { return 1.0f; } else { return (disapproveLevel * normalizedLength - levenshteinDistance) / ((disapproveLevel - approveLevel) * normalizedLength); } }
From source file:pl.polzone.classifier.Classifier.java
/**
 * Looks up the stem for a word, falling back to a fuzzy match against the known stems.
 *
 * <p>If {@code word} is already a key of {@code stems}, its mapped value is returned.
 * Otherwise the keys are scanned and the first key is returned for which either the key
 * itself or its mapped value has a Levenshtein distance to {@code word} below
 * {@code STEM_THRESHOLD}. If nothing is close enough, the word is registered as its own
 * stem and returned unchanged.
 *
 * @param word the word to stem
 * @return the mapped stem, a sufficiently similar known key, or {@code word} itself
 */
private String stem(String word) {
    if (stems.containsKey(word)) {
        return stems.get(word);
    }

    for (String knownWord : stems.keySet()) {
        if (StringUtils.getLevenshteinDistance(knownWord, word) < STEM_THRESHOLD) {
            return knownWord;
        }
        // Only evaluated when the key itself was not close enough.
        if (StringUtils.getLevenshteinDistance(stems.get(knownWord), word) < STEM_THRESHOLD) {
            return knownWord;
        }
    }

    stems.put(word, word);
    return word;
}
From source file:SkypeBot.QA_System.java
private String getAnswer(String userQuestion) { // set all record distance maximum 999 by default for (int i = 0; i < QA_Record.size(); i++) QA_Record.get(i).setDistance(999); // match user qustion with DB question, if find any question match then find minimum distance question // and return answer of minimum distance question. for (int i = 0; i < QA_Record.size(); i++) { Matcher m = QA_Record.get(i).getQuestion().matcher(userQuestion); // find all matched question records if (m.find()) { // set minimum distance of each matched record. QA_Record.get(i).setDistance(StringUtils.getLevenshteinDistance(userQuestion, m.group())); // System.out.println(QA_Record.get(i).getDistance()+" "+QA_Record.get(i).getAnswer()); }// www.ja v a2 s . c o m } // return valid answer if distance >=0 and distance < 999 if ((getSmallestDistance(QA_Record).getDistance() >= 0) && (getSmallestDistance(QA_Record).getDistance() < 999)) { // return smallest distance object answer. return getSmallestDistance(QA_Record).getAnswer(); } return ""; }
From source file:uk.ac.ebi.apps.benchmark.ChemicalNameSearch.java
private SummaryStatistics getHitIndices(Multimap<String, Set<String>> results, FingerprintEncoder encoder) { SummaryStatistics summaryStatistics = new SummaryStatistics(); QUERY: for (String name : results.keySet()) { String normName = encoder.encode(name); StringBuffer buffer = new StringBuffer(); List<Set<String>> hits = new ArrayList<Set<String>>(results.get(name)); for (int i = 0; i < hits.size(); i++) { Set<String> hitNames = hits.get(i); for (String hit : hitNames) { String normHit = encoder.encode(hit); buffer.append("\t").append(hit); buffer.append("\t").append(normHit); buffer.append("\t").append(StringUtils.getLevenshteinDistance(normName, normHit)); buffer.append("\n"); if (normName.equals(normHit)) { summaryStatistics.addValue(i + 1); continue QUERY; }//from ww w. j a v a2s . c o m } } } return summaryStatistics; }
From source file:uk.ac.ebi.apps.benchmark.ChemicalNameSearch.java
private int getRealScore(Multimap<String, Set<String>> results, FingerprintEncoder encoder, File file) { int hitCount = 0; FileWriter writer = null;//from ww w . j a v a2s . c o m try { if (file != null) writer = new FileWriter(file); } catch (IOException e) { System.err.println(e.getMessage()); } QUERY: for (String name : results.keySet()) { String normName = encoder.encode(name); StringBuffer buffer = new StringBuffer(); for (Set<String> hitNames : results.get(name)) { for (String hit : hitNames) { String normHit = encoder.encode(hit); buffer.append("\t").append(hit); buffer.append("\t").append(normHit); buffer.append("\t").append(StringUtils.getLevenshteinDistance(normName, normHit)); buffer.append("\n"); if (normName.equals(normHit)) { hitCount++; continue QUERY; } } } try { if (writer != null) { writer.write(name + "\n"); writer.write(buffer.toString() + "\n"); } } catch (IOException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } } try { if (writer != null) writer.close(); } catch (IOException e) { e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } System.out.println("Written unmatched results to :" + file); return hitCount; }
From source file:uk.ac.ebi.intact.dbupdate.prot.listener.ReportWriterListener.java
@Override public void onProteinSequenceChanged(ProteinSequenceChangeEvent evt) throws ProcessorException { final Protein protein = evt.getProtein(); try {//from w w w . j a va 2 s. c om final ReportWriter writer = reportHandler.getSequenceChangedWriter(); String primaryId = evt.getUniprotIdentity() != null ? evt.getUniprotIdentity() : getPrimaryIdString(protein); if (evt.getOldSequence() != null) { writer.writeLine(">" + protein.getAc() + "|OLD|" + protein.getShortLabel() + "|" + primaryId + "|CRC:" + Crc64.getCrc64(evt.getOldSequence()) + "|Length:" + evt.getOldSequence().length()); writer.writeLine(insertNewLinesIfNecessary(evt.getOldSequence(), 80)); } String state; int seqDiff; int levenshtein; double conservation; if (evt.getOldSequence() != null) { state = "UPDATE"; seqDiff = evt.getNewSequence().length() - evt.getOldSequence().length(); levenshtein = StringUtils.getLevenshteinDistance(evt.getNewSequence(), evt.getOldSequence()); conservation = ProteinTools.calculateSequenceConservation(evt.getOldSequence(), evt.getNewSequence()); } else { state = "NEW"; seqDiff = evt.getNewSequence().length(); levenshtein = seqDiff; conservation = 0; } int sequenceLength = evt.getNewSequence().length(); writer.writeLine(">" + protein.getAc() + "|" + state + "|" + protein.getShortLabel() + "|" + getPrimaryIdString(protein) + "|CRC:" + evt.getUniprotCrc64() + "|Length:" + Integer.toString(sequenceLength) + "|Diff:" + seqDiff + "|Levenshtein:" + levenshtein + "|Conservation:" + conservation); writer.writeLine(insertNewLinesIfNecessary(evt.getNewSequence(), 80)); writer.flush(); } catch (Exception e) { log.fatal("Problem writing to sequence changed writer", e); } }
From source file:uk.ac.ebi.mdk.tool.resolve.AbstractCandidateFactory.java
/**
 * Computes the Levenshtein distance between an already encoded query and a subject
 * that is encoded here with the {@code encoder} field.
 *
 * @param encodedQuery query string that has already been encoded
 * @param subject      raw subject string; it is passed through {@code encoder.encode}
 *                     before the comparison
 * @return the Levenshtein distance between {@code encodedQuery} and the encoded subject
 */
public Integer calculateDistance(String encodedQuery, String subject) {
    final String encodedSubject = encoder.encode(subject);
    final int distance = StringUtils.getLevenshteinDistance(encodedQuery, encodedSubject);
    return Integer.valueOf(distance);
}
From source file:uk.ac.ebi.mdk.tool.resolve.PseudoFingerprintChemicalNames.java
/**
 * Computes the Levenshtein distance between the comparison keys of two strings.
 *
 * @param a first string; converted with {@code key} before the comparison
 * @param b second string; converted with {@code key} before the comparison
 * @return the Levenshtein distance between {@code key(a)} and {@code key(b)}
 */
public int lenvenshteinComparisonKeyed(String a, String b) {
    return StringUtils.getLevenshteinDistance(this.key(a), this.key(b));
}
From source file:uk.ac.ebi.mdk.ui.component.MetaboliteMatchIndication.java
public void setSubject(Metabolite subject) { this.subject = subject; name.setRight(subject.getName());//from w w w. jav a2 s . c om Integer nameDiff = StringUtils.getLevenshteinDistance(name.getLeft().toLowerCase(), subject.getName().toLowerCase()); name.setDifference(nameDiff.toString()); name.setQuality(nameDiff <= 2 ? MatchIndication.Quality.Good : nameDiff <= 5 ? MatchIndication.Quality.Okay : MatchIndication.Quality.Bad); double queryCharge = query.getCharge(); double subjectCharge = subject.getCharge(); charge.setRight(Double.toString(subjectCharge)); double chargeDiff = Math.abs(queryCharge - subjectCharge); charge.setQuality(chargeDiff < 1 ? MatchIndication.Quality.Good : chargeDiff < 2 ? MatchIndication.Quality.Okay : MatchIndication.Quality.Bad); setFormulaQuality(); }
From source file:uk.ac.ebi.metabolomes.webservices.util.EntryDecider.java
/** * /*from w ww . j a va 2 s. co m*/ * @param query * @param candidates * @return * @deprecated {@see getOrderedCandidates(String, Map<String , String> candidates)} This uses a list and thus does not colapse entries with duplicate */ @Deprecated public Set<CandidateEntry> decideBestCandidate(String query, Map<String, String> candidates) { Set<CandidateEntry> orderedCand = new TreeSet<CandidateEntry>(); for (String identifier : candidates.keySet()) { CandidateEntry candidateT = new CandidateEntry(); candidateT.setId(identifier); candidateT.setDesc(candidates.get(identifier)); String prefix = this.identiferPrefix(identifier); if (prefix != null && candidateT.getDesc().contains(prefix)) { candidateT.setDesc(candidateT.getDesc().replace(prefix, "")); } candidateT.setDistance(StringUtils.getLevenshteinDistance(query.toLowerCase().trim(), candidateT.getDesc().toLowerCase().trim())); orderedCand.add(candidateT); } return orderedCand; }