Example usage for org.apache.commons.lang StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang StringUtils getLevenshteinDistance

Introduction

On this page you can find usage examples for the getLevenshteinDistance method of org.apache.commons.lang.StringUtils.

Prototype

public static int getLevenshteinDistance(String s, String t) 

Source Link

Document

Find the Levenshtein distance between two Strings.

Usage

From source file:pl.edu.icm.coansys.commons.stringsimilarity.EditDistanceSimilarity.java

/**
 * Converts the Levenshtein distance between two strings into a similarity score.
 *
 * <p>The distance is judged relative to the length of the longer string. When
 * {@code maxNormalizedStringLength} is positive, that length is first squashed with an
 * arctangent curve so that it approaches {@code maxNormalizedStringLength} asymptotically.
 *
 * @param s1 first string
 * @param s2 second string
 * @return {@code 1.0f} when both strings are empty or the distance is below
 *         {@code approveLevel * length}; {@code 0.0f} when the distance is above
 *         {@code disapproveLevel * length}; otherwise a linear interpolation between the two
 */
@Override
protected float doCalculate(String s1, String s2) {
    final int distance = StringUtils.getLevenshteinDistance(s1, s2);
    final int longest = Math.max(s1.length(), s2.length());

    // Two empty strings are identical by definition.
    if (longest == 0) {
        return 1.0f;
    }

    final int effectiveLength;
    if (maxNormalizedStringLength > 0) {
        // scale * atan(x / scale) tends towards maxNormalizedStringLength as x grows.
        final double scale = 2.0 * maxNormalizedStringLength / Math.PI;
        effectiveLength = (int) Math.round(scale * Math.atan((double) longest / scale));
    } else {
        effectiveLength = longest;
    }

    if (distance > disapproveLevel * effectiveLength) {
        return 0.0f;
    }
    if (distance < approveLevel * effectiveLength) {
        return 1.0f;
    }
    // In between the two thresholds: fall off linearly from 1 (approve) to 0 (disapprove).
    return (disapproveLevel * effectiveLength - distance)
            / ((disapproveLevel - approveLevel) * effectiveLength);
}

From source file:pl.polzone.classifier.Classifier.java

/**
 * Resolves a word to a stem using the {@code stems} map.
 *
 * <p>An exact key match returns the mapped value. Otherwise the first key for which either
 * the key itself or its mapped value is fewer than {@code STEM_THRESHOLD} edits away from
 * the word is returned. If nothing is close enough, the word is registered as its own stem.
 *
 * @param word the word to stem
 * @return the stem chosen for {@code word}
 */
private String stem(String word) {
    if (stems.containsKey(word)) {
        return stems.get(word);
    }

    for (String candidate : stems.keySet()) {
        boolean keyIsClose = StringUtils.getLevenshteinDistance(candidate, word) < STEM_THRESHOLD;
        // The mapped value is only compared when the key itself is not close enough.
        if (keyIsClose || StringUtils.getLevenshteinDistance(stems.get(candidate), word) < STEM_THRESHOLD) {
            return candidate;
        }
    }

    // Nothing similar is known yet: remember the word as its own stem.
    stems.put(word, word);
    return word;
}

From source file:SkypeBot.QA_System.java

/**
 * Looks up the answer of the stored question that best matches the user's question.
 *
 * <p>Every record's distance is first reset to the sentinel 999. Each record whose question
 * pattern is found in {@code userQuestion} then gets the Levenshtein distance between the
 * full user question and the matched text. The answer of the record reported by
 * {@code getSmallestDistance} is returned if its distance lies in the range [0, 999).
 *
 * @param userQuestion the question typed by the user
 * @return the best matching answer, or an empty string when no record matched
 */
private String getAnswer(String userQuestion) {
    // Reset pass: 999 marks a record as "not matched".
    int index = 0;
    while (index < QA_Record.size()) {
        QA_Record.get(index).setDistance(999);
        index++;
    }

    // Scoring pass: only records whose pattern occurs in the user question get a real distance.
    for (int pos = 0; pos < QA_Record.size(); pos++) {
        Matcher questionMatcher = QA_Record.get(pos).getQuestion().matcher(userQuestion);
        if (!questionMatcher.find()) {
            continue;
        }
        int distance = StringUtils.getLevenshteinDistance(userQuestion, questionMatcher.group());
        QA_Record.get(pos).setDistance(distance);
    }

    // A distance still at the sentinel (or negative) means there is nothing valid to return.
    boolean hasValidMatch = getSmallestDistance(QA_Record).getDistance() >= 0
            && getSmallestDistance(QA_Record).getDistance() < 999;
    return hasValidMatch ? getSmallestDistance(QA_Record).getAnswer() : "";
}

From source file:uk.ac.ebi.apps.benchmark.ChemicalNameSearch.java

/**
 * Collects the rank at which each query name is first matched exactly.
 *
 * <p>For every query name the hit sets are scanned in the iteration order of
 * {@code results.get(name)}. The 1-based position of the first set containing a hit whose
 * encoded form equals the encoded query name is added to the statistics. Queries without
 * such a hit contribute no value.
 *
 * <p>The previous implementation also assembled a per-query text report (including a
 * Levenshtein distance per hit) into a local buffer that was never read; that dead work
 * has been removed.
 *
 * @param results query name mapped to its hit sets
 * @param encoder encoder applied to both query and hit names before comparison
 * @return statistics over the 1-based ranks of the first exact match per query
 */
private SummaryStatistics getHitIndices(Multimap<String, Set<String>> results, FingerprintEncoder encoder) {

    SummaryStatistics summaryStatistics = new SummaryStatistics();

    QUERY: for (String name : results.keySet()) {

        String normName = encoder.encode(name);

        int rank = 0;
        for (Set<String> hitNames : results.get(name)) {
            rank++;

            for (String hit : hitNames) {
                if (normName.equals(encoder.encode(hit))) {
                    summaryStatistics.addValue(rank);
                    // Only the first exact match per query counts.
                    continue QUERY;
                }
            }
        }
    }

    return summaryStatistics;

}

From source file:uk.ac.ebi.apps.benchmark.ChemicalNameSearch.java

/**
 * Counts the queries that have at least one exact (encoded) name match among their hits and,
 * optionally, writes a report of the queries that have none.
 *
 * <p>For each unmatched query the report contains the query name followed by one
 * tab-separated line per hit: hit name, encoded hit name, and the Levenshtein distance
 * between the encoded query and the encoded hit.
 *
 * @param results query name mapped to its hit sets
 * @param encoder encoder applied to both query and hit names before comparison
 * @param file    report destination; when {@code null} no report is written
 * @return the number of queries with an exact match
 */
private int getRealScore(Multimap<String, Set<String>> results, FingerprintEncoder encoder, File file) {

    int hitCount = 0;

    FileWriter writer = null;
    try {
        if (file != null) {
            writer = new FileWriter(file);
        }
    } catch (IOException e) {
        // Best effort: scoring still proceeds, just without a report.
        System.err.println(e.getMessage());
    }

    try {
        QUERY: for (String name : results.keySet()) {

            String normName = encoder.encode(name);

            StringBuilder buffer = new StringBuilder();

            for (Set<String> hitNames : results.get(name)) {

                for (String hit : hitNames) {

                    String normHit = encoder.encode(hit);

                    buffer.append("\t").append(hit);
                    buffer.append("\t").append(normHit);
                    buffer.append("\t").append(StringUtils.getLevenshteinDistance(normName, normHit));
                    buffer.append("\n");

                    if (normName.equals(normHit)) {
                        hitCount++;
                        // Matched queries are not reported; skip straight to the next query.
                        continue QUERY;
                    }
                }
            }

            // Only reached for queries without an exact match.
            try {
                if (writer != null) {
                    writer.write(name + "\n");
                    writer.write(buffer.toString() + "\n");
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    } finally {
        // Close in finally so the file handle is released even if scoring throws.
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    // Announce the report only when one was actually opened.
    if (writer != null) {
        System.out.println("Written unmatched results to :" + file);
    }

    return hitCount;

}

From source file:uk.ac.ebi.intact.dbupdate.prot.listener.ReportWriterListener.java

/**
 * Writes a FASTA-like report entry when a protein's sequence has changed.
 *
 * <p>If the event carries an old sequence, an {@code OLD} header and the old sequence are
 * written first. A header for the new sequence follows, with state {@code UPDATE} or
 * {@code NEW}, the length difference, the Levenshtein distance and the sequence conservation,
 * and then the new sequence itself. Sequences are passed through
 * {@code insertNewLinesIfNecessary} with a width of 80.
 *
 * <p>Any exception raised while writing is logged and not propagated.
 *
 * @param evt the sequence change event
 * @throws ProcessorException declared by the listener contract
 */
@Override
public void onProteinSequenceChanged(ProteinSequenceChangeEvent evt) throws ProcessorException {
    final Protein protein = evt.getProtein();
    try {
        final ReportWriter writer = reportHandler.getSequenceChangedWriter();

        // Prefer the UniProt identity from the event; fall back to the protein's own primary id.
        String primaryId = evt.getUniprotIdentity();
        if (primaryId == null) {
            primaryId = getPrimaryIdString(protein);
        }

        final String oldSequence = evt.getOldSequence();
        final String newSequence = evt.getNewSequence();

        final String state;
        final int seqDiff;
        final int levenshtein;
        final double conservation;

        if (oldSequence != null) {
            writer.writeLine(">" + protein.getAc() + "|OLD|" + protein.getShortLabel() + "|" + primaryId
                    + "|CRC:" + Crc64.getCrc64(oldSequence) + "|Length:" + oldSequence.length());
            writer.writeLine(insertNewLinesIfNecessary(oldSequence, 80));

            state = "UPDATE";
            seqDiff = newSequence.length() - oldSequence.length();
            levenshtein = StringUtils.getLevenshteinDistance(newSequence, oldSequence);
            conservation = ProteinTools.calculateSequenceConservation(oldSequence, newSequence);
        } else {
            // No previous sequence: the whole new sequence counts as the difference.
            state = "NEW";
            seqDiff = newSequence.length();
            levenshtein = seqDiff;
            conservation = 0;
        }

        final int sequenceLength = newSequence.length();
        // NOTE(review): the OLD header uses the UniProt identity when available, while this
        // header always uses getPrimaryIdString(protein) - confirm that this is intentional.
        writer.writeLine(">" + protein.getAc() + "|" + state + "|" + protein.getShortLabel() + "|"
                + getPrimaryIdString(protein) + "|CRC:" + evt.getUniprotCrc64() + "|Length:"
                + sequenceLength + "|Diff:" + seqDiff + "|Levenshtein:" + levenshtein
                + "|Conservation:" + conservation);
        writer.writeLine(insertNewLinesIfNecessary(newSequence, 80));
        writer.flush();
    } catch (Exception e) {
        log.fatal("Problem writing to sequence changed writer", e);
    }
}

From source file:uk.ac.ebi.mdk.tool.resolve.AbstractCandidateFactory.java

/**
 * Computes the Levenshtein distance between an already encoded query and a subject,
 * after running the subject through this factory's {@code encoder}.
 *
 * @param encodedQuery the query, already encoded by the caller
 * @param subject      the raw subject string; it is encoded here before comparison
 *
 * @return the Levenshtein distance between {@code encodedQuery} and the encoded subject
 */
public Integer calculateDistance(String encodedQuery, String subject) {
    String encodedSubject = encoder.encode(subject);
    return StringUtils.getLevenshteinDistance(encodedQuery, encodedSubject);
}

From source file:uk.ac.ebi.mdk.tool.resolve.PseudoFingerprintChemicalNames.java

/**
 * Computes the Levenshtein distance between the comparison keys of two strings,
 * where each key is obtained from {@code key(String)}.
 *
 * @param a first string
 * @param b second string
 * @return the Levenshtein distance between {@code key(a)} and {@code key(b)}
 */
public int lenvenshteinComparisonKeyed(String a, String b) {
    return StringUtils.getLevenshteinDistance(this.key(a), this.key(b));
}

From source file:uk.ac.ebi.mdk.ui.component.MetaboliteMatchIndication.java

/**
 * Sets the metabolite to compare against the query and refreshes the name, charge and
 * formula match indications.
 *
 * <p>Name quality is based on the case-insensitive Levenshtein distance between the two
 * names: at most 2 is Good, at most 5 is Okay, anything larger is Bad. Charge quality is
 * based on the absolute charge difference: below 1 is Good, below 2 is Okay, otherwise Bad.
 *
 * @param subject the metabolite shown on the right-hand side of the comparison
 */
public void setSubject(Metabolite subject) {
    this.subject = subject;

    name.setRight(subject.getName());

    // Lower-case with a fixed locale so the distance does not depend on the JVM's default
    // locale (e.g. under a Turkish locale "I" would not lower-case to "i").
    int nameDiff = StringUtils.getLevenshteinDistance(name.getLeft().toLowerCase(java.util.Locale.ROOT),
            subject.getName().toLowerCase(java.util.Locale.ROOT));
    name.setDifference(Integer.toString(nameDiff));
    name.setQuality(nameDiff <= 2 ? MatchIndication.Quality.Good
            : nameDiff <= 5 ? MatchIndication.Quality.Okay : MatchIndication.Quality.Bad);

    double queryCharge = query.getCharge();
    double subjectCharge = subject.getCharge();

    charge.setRight(Double.toString(subjectCharge));

    double chargeDiff = Math.abs(queryCharge - subjectCharge);

    charge.setQuality(chargeDiff < 1 ? MatchIndication.Quality.Good
            : chargeDiff < 2 ? MatchIndication.Quality.Okay : MatchIndication.Quality.Bad);

    setFormulaQuality();

}

From source file:uk.ac.ebi.metabolomes.webservices.util.EntryDecider.java

/**
 * /*from   w  ww .  j a va  2 s.  co  m*/
 * @param query
 * @param candidates
 * @return
 * @deprecated {@see getOrderedCandidates(String, Map<String , String> candidates)} This uses a list and thus does not colapse entries with duplicate
 */
@Deprecated
public Set<CandidateEntry> decideBestCandidate(String query, Map<String, String> candidates) {

    Set<CandidateEntry> orderedCand = new TreeSet<CandidateEntry>();
    for (String identifier : candidates.keySet()) {
        CandidateEntry candidateT = new CandidateEntry();
        candidateT.setId(identifier);
        candidateT.setDesc(candidates.get(identifier));
        String prefix = this.identiferPrefix(identifier);
        if (prefix != null && candidateT.getDesc().contains(prefix)) {
            candidateT.setDesc(candidateT.getDesc().replace(prefix, ""));
        }
        candidateT.setDistance(StringUtils.getLevenshteinDistance(query.toLowerCase().trim(),
                candidateT.getDesc().toLowerCase().trim()));

        orderedCand.add(candidateT);
    }

    return orderedCand;
}