Example usage for org.apache.commons.lang3 StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance

Introduction

This page collects usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(CharSequence s, CharSequence t, final int threshold) 

Source Link

Document

<p>Find the Levenshtein distance between two Strings if it's less than or equal to a given threshold.</p> <p>This is the number of changes needed to change one String into another, where each change is a single character modification (deletion, insertion or substitution).</p> <p>This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance algorithm from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> <pre> StringUtils.getLevenshteinDistance(null, *, *) = IllegalArgumentException StringUtils.getLevenshteinDistance(*, null, *) = IllegalArgumentException StringUtils.getLevenshteinDistance(*, *, -1) = IllegalArgumentException StringUtils.getLevenshteinDistance("","", 0) = 0 StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7 StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7 StringUtils.getLevenshteinDistance("aaapppp", "", 6) = -1 StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7 StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1 StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7 StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1 </pre>

Usage

From source file:gov.nyc.doitt.gis.geoclient.service.search.policy.DefaultSimilarNamesPolicy.java

/**
 * Tells whether the proposed name is within the configured edit distance of the original name.
 *
 * Both names are passed through {@code clean} before being compared. The thresholded Levenshtein
 * call yields a negative value when the distance exceeds {@code similarNamesDistance}.
 *
 * @param original the reference name
 * @param proposed the candidate name
 * @return {@code true} when the cleaned names are within the threshold
 */
@Override
public boolean isSimilarName(String original, String proposed) {
    final String cleanedOriginal = clean(original);
    final String cleanedProposed = clean(proposed);
    final int distance = StringUtils.getLevenshteinDistance(cleanedOriginal, cleanedProposed,
            similarNamesDistance);
    // A negative result signals "further apart than the threshold".
    return distance >= 0;
}

From source file:kenh.expl.functions.GetLevenshteinDistance.java

/**
 * Computes the thresholded Levenshtein distance between two strings.
 *
 * @param str1      the first string
 * @param str2      the second string
 * @param threshold the maximum distance of interest; negative values are treated as 0
 * @return the value returned by {@code StringUtils.getLevenshteinDistance}, or 0 if that call
 *         throws any exception
 */
public int process(String str1, String str2, int threshold) {
    // The library rejects negative thresholds, so clamp them to zero first.
    final int effectiveThreshold = Math.max(threshold, 0);

    int distance;
    try {
        distance = StringUtils.getLevenshteinDistance(str1, str2, effectiveThreshold);
    } catch (Exception ignored) {
        // Deliberate best-effort fallback: any failure is reported as 0.
        distance = 0;
    }
    return distance;
}

From source file:de.knowwe.compile.correction.IncrementalTermReferenceCorrectionProvider.java

/**
 * Proposes replacement terms for a term reference that does not resolve.
 *
 * Every known term definition is tested in two ways. If the last path element of the term is
 * within {@code threshold} Levenshtein edits of the section text, it is suggested with the edit
 * distance as score. Otherwise, if the words of the section text occur in order somewhere inside
 * the term name (infix match), it is suggested with the length difference as score.
 *
 * @param compiler  the compiler requesting suggestions (not used by this implementation)
 * @param section   the section holding the possibly misspelled reference
 * @param threshold the maximum Levenshtein distance for a term to be suggested
 * @return the suggestions found; empty if the section is not a {@code SimpleReference} or the
 *         reference is already valid
 */
@Override
public List<Suggestion> getSuggestions(TermCompiler compiler, Section<?> section, int threshold) {
    List<Suggestion> suggestions = new LinkedList<>();
    if (!(section.get() instanceof SimpleReference)) {
        return suggestions;
    }

    ReferenceManager terminology = IncrementalCompiler.getInstance().getTerminology();

    String originalText = section.getText();
    if (terminology.isValid(new Identifier(originalText))) {
        // if reference is valid, no correction is proposed
        return suggestions;
    }

    // Build the infix pattern once, outside the loop. Each space-separated word is quoted so
    // that regex metacharacters in the reference text (e.g. "(" or "+") are matched literally
    // instead of raising a PatternSyntaxException; spaces still act as ".*" wildcards.
    StringBuilder infixRegex = new StringBuilder(".*");
    String[] words = originalText.split(" ", -1);
    for (int i = 0; i < words.length; i++) {
        if (i > 0) {
            infixRegex.append(".*");
        }
        infixRegex.append(java.util.regex.Pattern.quote(words[i]));
    }
    infixRegex.append(".*");
    java.util.regex.Pattern infixPattern = java.util.regex.Pattern.compile(infixRegex.toString());

    Collection<Section<? extends SimpleDefinition>> defs = terminology.getAllTermDefinitions();

    for (Section<? extends SimpleDefinition> def : defs) {
        Identifier termIdentifier = KnowWEUtils.getTermIdentifier(def);
        String termIdentifierElement = termIdentifier.getLastPathElement();

        /* levenshtein test: a negative result means "further away than threshold" */
        int distance = StringUtils.getLevenshteinDistance(originalText, termIdentifierElement, threshold);
        if (distance >= 0) {
            suggestions.add(new DefaultSuggestion(termIdentifierElement, distance));
        }
        /* infix test */
        else if (infixPattern.matcher(termIdentifierElement).matches()) {
            int infixScore = termIdentifierElement.length() - originalText.length();
            suggestions.add(new DefaultSuggestion(termIdentifierElement, infixScore));
        }
    }

    return suggestions;
}

From source file:knowledgeMiner.mining.wikipedia.CategoryMembershipMiner.java

/**
 * Mines the categories of the article referenced by {@code info}.
 *
 * A category is skipped when its title (with the word "stub(s)" removed) equals the article
 * title, when {@code parseSpecial} handles it, or when its title is within 3 Levenshtein edits
 * of the article title. Every remaining category title is turned into a sentence and handed to
 * the sentence miner.
 */
@Override
protected void mineArticleInternal(MinedInformation info, int informationRequested, WMISocket wmi,
        OntologySocket ontology) throws Exception {
    final int articleId = info.getArticle();
    final String articleTitle = wmi.getPageTitle(articleId, true);

    for (Integer categoryId : wmi.getArticleCategories(articleId)) {
        // Drop the word 'stub(s)' from the category title
        final String title = wmi.getPageTitle(categoryId, true).replaceAll(" stubs?", "");

        // Same title as the article, or a special category that was parsed separately.
        // Short-circuit evaluation keeps parseSpecial from running for identical titles.
        if (title.equals(articleTitle) || parseSpecial(title, info, ontology, wmi)) {
            continue;
        }

        // Titles that are nearly identical to the article title are skipped
        final boolean nearlySameTitle = StringUtils.getLevenshteinDistance(articleTitle, title, 3) != -1;
        if (nearlySameTitle) {
            continue;
        }

        // Everything else is parsed as a chunk of text
        miner_.mineSentence(SentenceParserHeuristic.SENTENCE_PREFIX + title + ".", false, info, this,
                ontology, wmi);
    }
}

From source file:com.stratio.crossdata.core.utils.ParserUtils.java

/**
 * Get the best matches for a string {@code str} given a set of words to compare against and a maximum Levenshtein
 * distance.
 *
 * Only the words that share the smallest distance to {@code str} are returned. Each distance is computed once,
 * in a single pass over {@code words}.
 *
 * NOTE(review): as in the previous implementation, a word is only accepted when its distance is strictly
 * smaller than {@code maxDistance}; words at exactly {@code maxDistance} are not returned. Confirm whether
 * the bound was meant to be inclusive.
 *
 * @param str         The word to get the matches for.
 * @param words       The set of candidate words.
 * @param maxDistance The maximum Levenshtein distance.
 * @return The set of closest words within distance; empty if there is none or {@code maxDistance < 1}.
 */
public static Set<String> getBestMatches(String str, Set<String> words, int maxDistance) {
    Set<String> result = new HashSet<>();
    if (maxDistance < 1) {
        // No distance can be strictly below the bound, so nothing can match.
        return result;
    }

    // Smallest distance seen so far; maxDistance itself means "nothing accepted yet".
    int bestDistance = maxDistance;
    for (String word : words) {
        int distance = StringUtils.getLevenshteinDistance(str, word, maxDistance);
        if (distance < 0 || distance >= maxDistance) {
            // -1: beyond the threshold; == maxDistance: excluded by the strict bound.
            continue;
        }
        if (distance < bestDistance) {
            // Found a closer word: everything collected so far is no longer "best".
            result.clear();
            bestDistance = distance;
            result.add(word);
        } else if (distance == bestDistance) {
            result.add(word);
        }
    }

    return result;
}

From source file:at.jps.sanction.core.util.TokenTool.java

/**
 * Compares a single token with another token; the tokens themselves are not modified.
 *
 * The tokens are only compared when both are at least {@code minlen} characters long and their lengths differ
 * by at most {@code minlen}. Otherwise the result is 0.
 *
 * @param text1
 *            first token
 * @param text2
 *            second token
 * @param fuzzy
 *            if true the Levenshtein distance is used, otherwise a case-insensitive equality check
 * @param minlen
 *            minimum token length, also used as the maximum allowed length difference
 * @param fuzzyValue
 *            factor applied to the shorter token length to derive the Levenshtein threshold ( 20 - 80 % )
 * @return percentage of equality ( 0 - 100 % ); 0 when the reference length is 0
 */
public static int compareCheck(final String text1, final String text2, final boolean fuzzy, final int minlen,
        final double fuzzyValue) {

    // Defaults describe "nothing in common": delta equals the reference length, giving 0 %.
    int deltaValue = text1.length();
    int minWordLen = deltaValue;

    // only compare meaningful pairs
    // TODO: calc length to fuzzyness
    final boolean comparable = (text1.length() >= minlen) && (text2.length() >= minlen)
            && (Math.abs(text1.length() - text2.length()) <= minlen);

    if (comparable) {
        if (fuzzy) {
            minWordLen = Math.min(text1.length(), text2.length());
            final int threshold = (int) (minWordLen * fuzzyValue) + 1;

            deltaValue = StringUtils.getLevenshteinDistance(text1, text2, threshold);

            if (deltaValue == -1) { // threshold cutoff: treat as completely different
                deltaValue = minWordLen;
            }
        } else if (text1.equalsIgnoreCase(text2)) { // TODO: ignoreCase ?
            deltaValue = 0;
        }
    }

    if (minWordLen == 0) {
        // Avoid dividing by zero: the ratio would be NaN or infinite and the cast could
        // produce Integer.MIN_VALUE instead of a percentage.
        return 0;
    }

    final int percentHitrate = (int) ((1 - ((double) deltaValue / minWordLen)) * 100);

    // A distance larger than the shorter token would give a negative value; keep the result in 0..100.
    return Math.max(0, percentHitrate);
}

From source file:org.easotope.client.rawdata.batchimport.ThreadedFileReader.java

/**
 * Reads and parses one raw data file and hands every acquisition it contains to the batch import composite.
 *
 * For each acquisition an {@code ImportedFile} is built. Its identifiers are compared against the names in
 * {@code globalSourceList} using a thresholded Levenshtein distance, and the closest entries (at most
 * {@code MAX_COMBO_SIZE}) become the source list offered for that file. The best entry, if any, also
 * provides the sample and standard ids.
 *
 * @param file         the file to read
 * @param fileWasFound when {@code false}, a parsing failure is shown in an error dialog; when {@code true},
 *                     a parsing failure silently aborts the processing of this file
 */
private void processFile(File file, boolean fileWasFound) {
    byte[] fileBytes = null;

    try {
        fileBytes = FileReader.getBytesFromFile(file.getCanonicalPath());
    } catch (Exception e) {
        // Reading failed: report on the UI thread (if the composite still exists) and give up.
        batchImportComposite.getDisplay().asyncExec(() -> {
            if (!batchImportComposite.isDisposed()) {
                Shell shell = batchImportComposite.getParent().getShell();
                MessageDialog.openError(shell, Messages.threadedFileReader_fileAddErrorTitle, e.getMessage());
            }
        });

        return;
    }

    RawFile rawFile = new RawFile();
    rawFile.setOriginalName(file.getName());

    ComputeAcquisitionParsed computeAcquisitionParsed = null;

    try {
        computeAcquisitionParsed = new ComputeAcquisitionParsed(rawFile, fileBytes, false, null);
    } catch (RuntimeException e) {
        // Parsing failures are only reported when fileWasFound is false.
        if (!fileWasFound) {
            batchImportComposite.getDisplay().asyncExec(() -> {
                if (!batchImportComposite.isDisposed()) {
                    Shell shell = batchImportComposite.getParent().getShell();
                    MessageDialog.openError(shell, Messages.threadedFileReader_fileAddErrorTitle,
                            e.getMessage());
                }
            });
        }

        return;
    }

    // Remember the first assumed time zone encountered.
    if (assumedTimeZone == null) {
        assumedTimeZone = computeAcquisitionParsed.getAssumedTimeZone();
    }

    int acquisitionNumber = 0;
    for (AcquisitionParsedV2 acquisitionParsed : computeAcquisitionParsed.getMaps()) {
        ImportedFile importedFile = new ImportedFile();
        importedFile.setAssumedTimeZone(computeAcquisitionParsed.getAssumedTimeZone() != null);
        importedFile.setTimestamp(acquisitionParsed.getDate());
        importedFile.setAcquisitionNumber(acquisitionNumber++);
        importedFile.setFileName(file.getName());

        String id1 = (String) acquisitionParsed.getMisc().get(InputParameter.Identifier_1);
        String id2 = (String) acquisitionParsed.getMisc().get(InputParameter.Identifier_2);
        String sampleName = (String) acquisitionParsed.getMisc().get(InputParameter.Sample_Name);

        // Fall back to the sample name only when neither identifier is present.
        if (id1 != null || id2 != null) {
            importedFile.setIdentifier1(id1);
            importedFile.setIdentifier2(id2);
        } else {
            importedFile.setIdentifier1(sampleName);
        }

        // Upper-cased identifiers are compared against the upper-cased source names.
        // The factors default to 1 when the corresponding identifier is missing or empty.
        String firstString = importedFile.getIdentifier1();
        int firstStringFactor = 1;

        if (firstString != null && firstString.length() != 0) {
            firstString = firstString.toUpperCase();
            firstStringFactor = firstString.length();
        }

        String secondString = importedFile.getIdentifier2();
        int secondStringFactor = 1;

        if (secondString != null && secondString.length() != 0) {
            secondString = secondString.toUpperCase();
            secondStringFactor = secondString.length();
        }

        ArrayList<SourceListItem> sourceList = null;

        if (globalSourceList != null) {
            ArrayList<SourceListItemSorter> comboItems = new ArrayList<SourceListItemSorter>();

            for (SourceListItem sourceListItem : globalSourceList) {
                // Best (smallest) weighted distance of either identifier to this source; -1 = no match.
                int levenshteinDistance = -1;
                SourceListItemSorter sorter = null;

                if (firstString != null && firstString.length() != 0) {
                    // The distance is weighted by the length of the OTHER identifier.
                    // NOTE(review): presumably this makes the two distances comparable - confirm.
                    // A "not within threshold" result (-1) stays negative after the multiplication.
                    int distance = StringUtils.getLevenshteinDistance(firstString,
                            sourceListItem.getSourceNameToUpper(), MAX_LEVENSHTEIN_DISTANCE)
                            * secondStringFactor;

                    if (distance >= 0 && (levenshteinDistance == -1 || distance < levenshteinDistance)) {
                        levenshteinDistance = distance;
                        sorter = new SourceListItemSorter(levenshteinDistance, sourceListItem);
                    }
                }

                if (secondString != null && secondString.length() != 0) {
                    int distance = StringUtils.getLevenshteinDistance(secondString,
                            sourceListItem.getSourceNameToUpper(), MAX_LEVENSHTEIN_DISTANCE)
                            * firstStringFactor;

                    if (distance >= 0 && (levenshteinDistance == -1 || distance < levenshteinDistance)) {
                        levenshteinDistance = distance;
                        sorter = new SourceListItemSorter(levenshteinDistance, sourceListItem);
                    }
                }

                if (levenshteinDistance >= 0) {
                    comboItems.add(sorter);
                }
            }

            Collections.sort(comboItems);
            sourceList = new ArrayList<SourceListItem>();

            // Keep only the first MAX_COMBO_SIZE entries of the sorted matches.
            for (SourceListItemSorter sourceListItemSorter : comboItems) {
                sourceList.add(sourceListItemSorter.getSourceListItem());

                if (sourceList.size() == MAX_COMBO_SIZE) {
                    break;
                }
            }
        }

        importedFile.setSourceList(sourceList);

        // sourceList is still null when there is no global source list; guard against that
        // instead of failing with a NullPointerException.
        if (sourceList != null && !sourceList.isEmpty()) {
            importedFile.setSampleId(sourceList.get(0).getSampleId());
            importedFile.setStandardId(sourceList.get(0).getStandardId());
        } else {
            importedFile.setSampleId(DatabaseConstants.EMPTY_DB_ID);
            importedFile.setStandardId(DatabaseConstants.EMPTY_DB_ID);
        }

        // Hand the result to the UI thread; skip if the composite was disposed in the meantime.
        batchImportComposite.getDisplay().asyncExec(() -> {
            if (!batchImportComposite.isDisposed()) {
                batchImportComposite.addImportedFile(importedFile);
            }
        });
    }
}

From source file:org.jahia.tools.maven.plugins.LegalArtifactAggregator.java

/**
 * Looks up the known license whose text is closest to the given license file, measured by the
 * Levenshtein edit distance between the two texts. Text variants further away than
 * EDIT_DISTANCE_THRESHOLD are ignored, so it is possible that nothing matches, which is the
 * desired outcome when the file is not actually one of the known licenses.
 *
 * @param licenseFile the license we want to match against the known licenses.
 * @return the known license owning the closest text variant, or {@code null} if no variant is
 *         within the threshold
 */
public KnownLicense findClosestMatchingKnownLicense(LicenseFile licenseFile) {
    KnownLicense bestMatch = null;
    int bestDistance = Integer.MAX_VALUE;

    for (KnownLicense candidate : knownLicenses.getLicenses().values()) {
        for (TextVariant variant : candidate.getTextVariants()) {
            final int distance = StringUtils.getLevenshteinDistance(variant.getText(),
                    licenseFile.getText(), EDIT_DISTANCE_THRESHOLD);

            // Negative: beyond the threshold. Not strictly smaller: the earlier match wins.
            if (distance < 0 || distance >= bestDistance) {
                continue;
            }

            bestDistance = distance;
            bestMatch = candidate;
        }
    }

    return bestMatch;
}

From source file:qa.experiment.ProcessFeatureVector.java

/**
 * Searches {@code str1} for a name that fuzzily matches any name in {@code str2}.
 *
 * Two names match when their trimmed forms are within 3 Levenshtein edits of each other and the
 * name from {@code str2} starts with the first (up to) 3 characters of the name from
 * {@code str1}. Names in {@code str1} that are empty after trimming never match.
 *
 * @param str1 the names to look up
 * @param str2 the names to compare against
 * @return the index in {@code str1} of the first matching name, or -1 if there is none
 */
public int isNameFuzzyMatch(String[] str1, String[] str2) {
    for (int i = 0; i < str1.length; i++) {
        String candidate = str1[i].trim();
        if (candidate.isEmpty()) {
            // An empty prefix would "match" every short name; skip it.
            continue;
        }

        // Take the prefix from the trimmed name and never read past its end; the untrimmed
        // substring(0, 3) used before threw for names shorter than three characters.
        String prefix = candidate.substring(0, Math.min(3, candidate.length()));

        for (int j = 0; j < str2.length; j++) {
            String other = str2[j].trim();
            int dist = StringUtils.getLevenshteinDistance(candidate, other, 3);
            if (dist != -1 && other.startsWith(prefix)) {
                return i;
            }
        }
    }

    return -1;
}