Example usage for org.apache.commons.lang StringUtils getLevenshteinDistance

List of usage examples for org.apache.commons.lang StringUtils getLevenshteinDistance

Introduction

In this page you can find the example usage for org.apache.commons.lang StringUtils getLevenshteinDistance.

Prototype

public static int getLevenshteinDistance(String s, String t) 

Source Link

Document

Find the Levenshtein distance between two Strings.

Usage

From source file:io.insideout.stanbol.enhancer.nlp.freeling.Freeling.java

/**
 * Creates the Freeling component: scans the configuration directory for
 * language configurations and for the language identification configuration,
 * creates the thread pool used for initialising Freeling components and
 * finally sets up one {@link ResourcePool} per supported language plus
 * (if configured) the pool for the language identifier.
 *
 * @param configurationPath
 *            the directory containing the Freeling configuration files
 * @param configurationFilenameSuffix
 *            only files matching this suffix are read as configuration files
 * @param freelingSharePath
 *            replaces a leading "$FREELING" segment of the "LangIdentFile"
 *            property and is passed on to the {@link AnalyzerFactory}
 * @param freelingLibPath
 *            passed on to the {@link AnalyzerFactory} and {@link LangIdFactory}
 * @param locale
 *            passed on to the {@link AnalyzerFactory} and {@link LangIdFactory}
 * @param maxInitThreads
 *            size of the initialisation thread pool; values &lt;= 0 select
 *            {@code DEFAULT_CONCURRENT_THREADS}
 * @param poolSize
 *            passed as first argument to every created {@link ResourcePool}
 * @param minQueueSize
 *            passed as second argument to every created {@link ResourcePool}
 * @throws IllegalArgumentException
 *             if {@code configurationPath} is not a directory
 */
@SuppressWarnings("unchecked")
public Freeling(final String configurationPath, final String configurationFilenameSuffix,
        final String freelingSharePath, final String freelingLibPath, final String locale,
        final int maxInitThreads, final int poolSize, final int minQueueSize) {
    //determine the supported languages
    File configDir = new File(configurationPath);
    if (!configDir.isDirectory()) {
        throw new IllegalArgumentException(
                "The parsed configDirectory '" + configDir + "' is not a directory!");
    }
    log.info("Reading Freeling Configuration from Directory: {}", configDir);
    //language -> configuration file selected for that language
    Map<String, File> supportedLanguages = new HashMap<String, File>();
    String langIdConfigFile = null;
    //NOTE(review): this check is redundant, a non-directory was already rejected above
    if (configDir.isDirectory()) {
        //the cast is the reason for the @SuppressWarnings("unchecked") on this constructor
        for (File confFile : (Collection<File>) FileUtils.listFiles(configDir,
                new SuffixFileFilter(configurationFilenameSuffix), null)) {
            Properties prop = new Properties();
            InputStream in = null;
            try {
                in = new FileInputStream(confFile);
                prop.load(in);
                String lang = prop.getProperty("Lang");
                String langIdentFileName = prop.getProperty("LangIdentFile");
                if (lang != null) { //the file declares a 'Lang': register it as language configuration
                    File existing = supportedLanguages.get(lang);
                    if (existing == null) {
                        log.info(" ... adding language '{}' with config {}", lang, confFile);
                        supportedLanguages.put(lang, confFile);
                    } else { //two configs for the same language
                        //take the one that is more similar to the language name
                        //(smaller edit distance between language and file base name;
                        //on a tie the already registered file is kept)
                        int eld = StringUtils.getLevenshteinDistance(lang,
                                FilenameUtils.getBaseName(existing.getName()));
                        int cld = StringUtils.getLevenshteinDistance(lang,
                                FilenameUtils.getBaseName(confFile.getName()));
                        if (cld < eld) {
                            log.info(" ... setting language '{}' to config {}", lang, confFile);
                            supportedLanguages.put(lang, confFile);
                        }
                    }
                } else if (langIdentFileName != null) {
                    //replace everything up to the first separator with the share path
                    //NOTE(review): relies on File.separatorChar; if the property uses '/' on a
                    //platform with a different separator, indexOf returns -1 and the whole
                    //value (including "$FREELING") is appended - confirm
                    if (langIdentFileName.startsWith("$FREELING")) {
                        langIdentFileName = FilenameUtils.concat(freelingSharePath,
                                langIdentFileName.substring(langIdentFileName.indexOf(File.separatorChar) + 1));
                    }
                    //the first language identification config found wins
                    if (langIdConfigFile != null) {
                        log.warn(
                                "Multiple LanguageIdentification configuration files. "
                                        + "Keep using '{}' and ignore '{}'!",
                                langIdConfigFile, langIdentFileName);
                    } else {
                        log.info(" ... setting language identification config to '{}'", langIdentFileName);
                        langIdConfigFile = langIdentFileName;
                    }
                }
            } catch (IOException e) {
                //an unreadable file is logged and skipped; the remaining files are still processed
                log.error("Unable to read configuration file " + confFile, e);
            } finally {
                IOUtils.closeQuietly(in);
            }
        }
    }
    //init the ThreadPool used to create Freeling components
    //this is mainly needed for being able to ensure that only one Freeling
    //component is created at a time. This may be necessary in some
    //environment to avoid random crashes.
    freelingInitThreadPool = Executors
            .newFixedThreadPool(maxInitThreads <= 0 ? DEFAULT_CONCURRENT_THREADS : maxInitThreads);
    //Init the Analyzers
    if (supportedLanguages.isEmpty()) {
        log.warn("The parsed configDirectory '{}' does not contain any valid "
                + "language configuration (*.{}) files!", configDir, configurationFilenameSuffix);
    } else {
        AnalyzerFactory analyzerFactory = new AnalyzerFactory(freelingLibPath, freelingSharePath, locale,
                freelingInitThreadPool);
        //now init the ResourcePool(s)
        log.info("init ResourcePools (size: " + poolSize + ")");
        for (Entry<String, File> supported : supportedLanguages.entrySet()) {
            //the context tells the factory which language / config file to use
            Map<String, Object> context = new HashMap<String, Object>();
            context.put(AnalyzerFactory.PROPERTY_LANGUAGE, supported.getKey());
            context.put(AnalyzerFactory.PROPERTY_CONFIG_FILE, supported.getValue());
            log.debug(" ... create ResourcePool for {}", context);
            analyzerPools.put(supported.getKey(),
                    new ResourcePool<Analyzer>(poolSize, minQueueSize, analyzerFactory, context));
        }
    }
    if (langIdConfigFile == null) {
        log.warn(
                "The parsed configDirectory '{}' does not contain the "
                        + "Language Identification Component configuration (a *.{}) file! "
                        + "Language Identification Service will not ba available.",
                configDir, configurationFilenameSuffix);
        langIdPool = null;
    } else {
        LangIdFactory langIdFactory = new LangIdFactory(freelingLibPath, langIdConfigFile, locale,
                freelingInitThreadPool);
        //Finally init the language identifier resource pool
        langIdPool = new ResourcePool<LanguageIdentifier>(poolSize, minQueueSize, langIdFactory, null);
    }
}

From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java

/**
 * Attempts to produce a list of names that are close to the given name. At most 5 proposals
 * are generated. The returned proposals are made in order of "pronunciation distance" which is
 * obtained by taking the Levenshtein distance between the Double Monophone encodings of
 * candidate and given name. Candidates are selected as the names with shortest Levenshtein distance
 * and names that are Monophonically equal, or starts or ends monophonically.
 * /*from  ww  w  .  ja  v  a  2 s .  c  om*/
 * @param currentName
 *            the name for which proposals are to be generated
 * @param descs
 *            the descriptors of available named values
 * @param searchPath
 *            TODO
 * @param types
 *            if stated, the wanted types of named values
 * @return
 *         array of proposals, possibly empty, but never null.
 */
public String[] computeProposals(final String currentName, Collection<IEObjectDescription> descs,
        boolean upperCaseProposals, PPSearchPath searchPath, EClass... types) {
    if (currentName == null || currentName.length() < 1)
        return new String[0];

    // compute the 5 best matches and only accept if score <= 5
    ScoreKeeper<IEObjectDescription> tracker = new ScoreKeeper<IEObjectDescription>(5, false, 5);
    // List<IEObjectDescription> metaphoneAlike = Lists.newArrayList();
    final DoubleMetaphone encoder = new DoubleMetaphone();
    final String metaphoneName = encoder.encode(currentName);

    for (IEObjectDescription d : descs) {
        EClass c = d.getEClass();
        typeok: if (types != null && types.length > 0) {
            for (EClass wanted : types)
                if ((wanted == c || wanted.isSuperTypeOf(c)))
                    break typeok;
            continue;
        }
        // filter based on path visibility
        if (searchPath.searchIndexOf(d) == -1)
            continue; // not visible according to path

        String candidateName = converter.toString(d.getName());
        tracker.addScore(StringUtils.getLevenshteinDistance(currentName, candidateName), d);
        String candidateMetaphone = encoder.encode(candidateName);
        // metaphone matches are scored on the pronounciation distance
        if (metaphoneName.equals(candidateMetaphone) //
                || candidateMetaphone.startsWith(metaphoneName) //
                || candidateMetaphone.endsWith(metaphoneName) //
        )
            tracker.addScore(StringUtils.getLevenshteinDistance(metaphoneName, candidateMetaphone), d);
        // System.err.printf("Metaphone alike: %s == %s\n", currentName, candidateName);
    }
    List<String> result = Lists.newArrayList();
    // System.err.print("Scores = ");
    for (ScoreEntry<IEObjectDescription> entry : tracker.getScoreEntries()) {
        String s = converter.toString(entry.getData().getName());
        result.add(s);
        // System.err.printf("%d %s, ", entry.getScore(), s);
    }
    // System.err.println();

    String[] proposals = result.toArray(new String[result.size()]);

    PronunciationComparator x = new PronunciationComparator(encoder, metaphoneName);

    Arrays.sort(proposals, x);
    // System.err.print("Order = ");
    // for(int i = 0; i < proposals.length; i++)
    // System.err.printf("%s, ", proposals[i]);
    // System.err.println();
    return upperCaseProposals ? toUpperCaseProposals(proposals) : proposals;
}

From source file:com.att.ajsc.csilogging.util.UtilLib.java

/**
 * Selects the known endpoint that best matches the given request path and returns it
 * prefixed with {@code "/" + componentType}.
 * <p>
 * Selection order:
 * <ol>
 * <li>endpoints with {@code arrLength} segments whose segments (from index 1 on) all pass
 * {@code compareValues} against {@code pathinfoArr}; if exactly one remains it is used,
 * otherwise the last one accepted by the {@link AntPathMatcher} is used;</li>
 * <li>if nothing was selected, the endpoint with the smallest Levenshtein distance to
 * {@code pathInfo};</li>
 * <li>if there is no endpoint at all, nothing is appended.</li>
 * </ol>
 *
 * @param pathinfoArr
 *            the request path split into segments; must contain at least
 *            {@code arrLength} elements
 * @param arrLength
 *            the number of segments of the request path
 * @param componentType
 *            the component type used as first path segment of the result
 * @param pathInfo
 *            the unsplit request path
 * @return {@code "/" + componentType} followed by the selected endpoint (if any)
 */
public static String getInput(String pathinfoArr[], int arrLength, String componentType, String pathInfo) {
    // NOTE(review): the lookup of the endpoint set was commented out, which left the set
    // null and made the loop below throw a NullPointerException on every call. An empty
    // set keeps the method callable; restore the real source once it is available:
    /*
     * if (componentType.equalsIgnoreCase("rest")) { endpointSet =
     * DME2Helper.restletEndpointSet; } else { endpointSet =
     * DME2Helper.serviceEndpointSet; }
     */
    Set<String> endpointSet = new HashSet<String>();
    Set<String> sameLengthEndpoints = new HashSet<String>();
    AntPathMatcher pathMatcher = new AntPathMatcher();

    // Track the closest endpoint explicitly. Using 0 as "not yet set" marker would let an
    // exact match (distance 0) be overwritten by the next endpoint.
    String closestEndpoint = null;
    int closestDistance = Integer.MAX_VALUE;
    for (String endpoint : endpointSet) {
        int distance = StringUtils.getLevenshteinDistance(pathInfo, endpoint);
        if (distance < closestDistance) {
            closestDistance = distance;
            closestEndpoint = endpoint;
        }
        if (endpoint.split("\\/").length == arrLength) {
            sameLengthEndpoints.add(endpoint);
        }
    }

    Set<String> matchingEndpoints = new HashSet<String>();
    for (String endpoint : sameLengthEndpoints) {
        String[] segments = endpoint.split("\\/");
        // index 0 is skipped (the segment before the leading '/'); the bound is checked
        // before the access so paths with a single segment no longer run out of range
        boolean allSegmentsMatch = true;
        for (int j = 1; j < arrLength; j++) {
            if (!compareValues(pathinfoArr[j], segments[j])) {
                allSegmentsMatch = false;
                break;
            }
        }
        if (allSegmentsMatch) {
            matchingEndpoints.add(endpoint);
        }
    }

    String input = "";
    if (matchingEndpoints.size() == 1) {
        input = matchingEndpoints.iterator().next();
    } else {
        for (String endpoint : matchingEndpoints) {
            // NOTE(review): AntPathMatcher.match expects (pattern, path); here the request
            // path is passed as pattern - confirm that this order is intended
            if (pathMatcher.match(pathInfo, endpoint)) {
                input = endpoint;
            }
        }
    }
    // fall back to the closest endpoint; without any endpoint nothing is appended
    // (previously the literal "null" would have been concatenated)
    if (input.isEmpty() && closestEndpoint != null) {
        input = closestEndpoint;
    }
    return "/" + componentType + input;
}

From source file:bookChapter.theoretical.AnalyzeTheoreticalMSMSCalculation.java

/**
 * Builds one tab separated, newline terminated report line for every pairing of an
 * Andromeda result with a SEQUEST result and returns the line of the pairing whose peptide
 * sequences have the smallest Levenshtein distance.
 * <p>
 * Columns: spectrum title, precursor m/z, possible charges, observed mass, Andromeda score,
 * SEQUEST score, Andromeda sequence, SEQUEST sequence, Levenshtein distance, total scored
 * peptides (SEQUEST), correct match (Andromeda), correct match (SEQUEST).
 *
 * @param theBestAndromedaResults
 *            the Andromeda identifications
 * @param theBestSEQUESTResults
 *            the SEQUEST identifications
 * @return the report line with the smallest distance, or an empty String if there is no
 *         pairing; among equal distances the first one in map iteration order wins
 */
private static String printInfo(HashSet<Identify> theBestAndromedaResults,
        HashSet<Identify> theBestSEQUESTResults) {
    // report line -> Levenshtein distance of the two sequences on that line
    HashMap<String, Integer> distanceByLine = new HashMap<String, Integer>();
    for (Identify andromeda : theBestAndromedaResults) {
        for (Identify sequest : theBestSEQUESTResults) {
            double observedMass = CalculateObservedMass.calculateMass(andromeda.getSpectrum());
            String andromedaSequence = andromeda.getPeptide().getSequence();
            String sequestSequence = sequest.getPeptide().getSequence();
            int distance = StringUtils.getLevenshteinDistance(andromedaSequence, sequestSequence);
            String line = andromeda.getSpectrum().getSpectrumTitle() + "\t"
                    + andromeda.getSpectrum().getPrecursor().getMz() + "\t"
                    + andromeda.getSpectrum().getPrecursor().getPossibleChargesAsString() + "\t"
                    + observedMass + "\t"
                    + andromeda.getScore() + "\t"
                    + sequest.getScore() + "\t"
                    + andromedaSequence + "\t"
                    + sequestSequence + "\t"
                    + distance + "\t"
                    + sequest.getTotalScoredPeps() + "\t"
                    + andromeda.isIsCorrectMatch() + "\t"
                    + sequest.isIsCorrectMatch() + "\n";
            distanceByLine.put(line, distance);
        }
    }

    // pick the line with the smallest distance
    String closestLine = "";
    int smallestDistance = Integer.MAX_VALUE;
    for (java.util.Map.Entry<String, Integer> candidate : distanceByLine.entrySet()) {
        int distance = candidate.getValue();
        if (distance < smallestDistance) {
            smallestDistance = distance;
            closestLine = candidate.getKey();
        }
    }
    return closestLine;
}

From source file:com.example.app.support.address.SpellingCorrector.java

/**
 * Returns the similarity of two Strings as {@code 1 - distance / longerLength}, where
 * {@code distance} is their Levenshtein distance.
 *
 * @param s
 *            the first String, must not be null
 * @param t
 *            the second String, must not be null
 * @return a value between 0 (nothing in common) and 1 (equal); two empty Strings are
 *         equal and therefore yield 1
 */
private static float getNormalizedSimilarity(String s, String t) {
    // computed first so that null arguments are still rejected by StringUtils
    final int distance = StringUtils.getLevenshteinDistance(s, t);
    final int longerLength = Math.max(s.length(), t.length());
    if (longerLength == 0) {
        // both Strings are empty: avoid 0 / 0f, which would make the result NaN
        return 1f;
    }
    return 1f - distance / (float) longerLength;
}

From source file:eu.scape_project.bitwiser.utils.SSDeep.java

/**
 * Scores the similarity of two signature strings on a 0-100 scale, 0 being a poor match
 * and 100 being an excellent match. The score is derived from the Levenshtein distance of
 * the two strings, scaled by their combined length and capped for small block sizes.
 *
 * @param s1
 *            the first signature string
 * @param s2
 *            the second signature string
 * @param blockSize
 *            the block size the signatures were computed with
 * @return the match score; 0 if one string is longer than {@code SPAMSUM_LENGTH} or if
 *         both strings are empty
 */
static int scoreStrings(char[] s1, char[] s2, int blockSize) {
    final int len1 = s1.length;
    final int len2 = s2.length;

    if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
        /* not a real spamsum signature? */
        return 0;
    }

    if (len1 + len2 == 0) {
        /* two empty strings carry no information; returning here also avoids
           the division by zero in the scaling step below */
        return 0;
    }

    /* the two strings must have a common substring of length
       ROLLING_WINDOW to be candidates */
    /*
    if (has_common_substring(s1, s2) == 0) {
      return 0;
    }
    */

    /* compute the edit distance between the two strings. The edit distance gives
       us a pretty good idea of how closely related the two strings are */
    int score = StringUtils.getLevenshteinDistance(new String(s1), new String(s2));

    /* scale the edit distance by the lengths of the two
       strings. This changes the score to be a measure of the
       proportion of the message that has changed rather than an
       absolute quantity. It also copes with the variability of
       the string lengths. */
    score = (score * SPAMSUM_LENGTH) / (len1 + len2);

    /* at this stage the score occurs roughly on a 0-64 scale,
     * with 0 being a good match and 64 being a complete
     * mismatch */

    /* rescale to a 0-100 scale (friendlier to humans) */
    score = (100 * score) / 64;

    /* it is possible to get a score above 100 here, but it is a
       really terrible match */
    if (score >= 100) {
        return 0;
    }

    /* now re-scale on a 0-100 scale with 0 being a poor match and
       100 being a excellent match. */
    score = 100 - score;

    /* when the blocksize is small we don't want to exaggerate the match size.
       The cap is computed as long: for large block sizes the int product
       could overflow and wrongly shrink a good score. */
    final long cap = (long) (blockSize / MIN_BLOCKSIZE) * Math.min(len1, len2);
    if (score > cap) {
        /* cap is smaller than score (at most 100) here, so the cast is safe */
        score = (int) cap;
    }
    return score;
}

From source file:com.github.bfour.fpliteraturecollector.service.DefaultLiteratureService.java

/**
 * Determines whether a pair of two Literatures maybe is a duplicate pair.
 * Employs comparative measures that might lead to false positives.
 * <p>
 * A pair is regarded as probable duplicate if it is a certain duplicate, if the
 * (NFD-normalized, lower-cased) titles differ by at most one character for every
 * 14 characters of the title of {@code litA}, or if both have the same ISBN.
 *
 * @param litA
 *            the first Literature
 * @param litB
 *            the second Literature
 * @return true if the two Literatures are probably duplicates of each other
 */
private boolean isProbableDuplicate(Literature litA, Literature litB) {

    if (isCertainDuplicate(litA, litB))
        return true;

    // The titles can only be compared if both are present. Without this guard a
    // missing title caused a NullPointerException inside Normalizer.normalize.
    if (litA.getTitle() != null && litB.getTitle() != null) {
        String normalizedTitleA = Normalizer.normalize(litA.getTitle(), Normalizer.Form.NFD).toLowerCase();
        String normalizedTitleB = Normalizer.normalize(litB.getTitle(), Normalizer.Form.NFD).toLowerCase();

        // 1 character different for every 14 characters
        // NOTE(review): the tolerance depends on the title of litA only, so the result may
        // differ when the arguments are swapped - confirm that this is acceptable
        int allowedDifference = litA.getTitle().length() / 14;
        if (StringUtils.getLevenshteinDistance(normalizedTitleA, normalizedTitleB) <= allowedDifference)
            return true;
    }

    if (litA.getISBN() != null && litB.getISBN() != null && litA.getISBN().equals(litB.getISBN()))
        return true;

    return false;

}

From source file:de.janrieke.contractmanager.ext.hibiscus.UmsatzImportListDialog.java

/**
 * Computes a case-insensitive, length-relative Levenshtein distance of two Strings.
 * <p>
 * The part of the distance that is unavoidable because of the different lengths is
 * subtracted, the remainder is divided by the length of the longer String, and the fourth
 * root of that ratio is returned.
 *
 * @param a
 *            the first String, must not be null
 * @param b
 *            the second String, must not be null
 * @return a value between 0 and 1; 0 if both Strings are empty
 */
private float getRelativeLevenshteinDistance(String a, String b) {
    final String lowerA = a.toLowerCase();
    final String lowerB = b.toLowerCase();
    final int distance = StringUtils.getLevenshteinDistance(lowerA, lowerB);

    // Take the lengths from the Strings that were actually compared: lower-casing may
    // change the length of a String, and mixing both could make the ratio negative.
    final int maxLength = Math.max(lowerA.length(), lowerB.length());
    if (maxLength == 0) {
        // both Strings are empty: avoid 0 / 0, which would make the result NaN
        return 0f;
    }
    final int minLength = Math.min(lowerA.length(), lowerB.length());
    final int maxDistance = maxLength;
    final int minDistance = maxLength - minLength;
    return (float) Math.sqrt(Math.sqrt((((float) (distance - minDistance)) / ((float) maxDistance))));
}

From source file:com.adobe.acs.commons.mcp.impl.processes.asset.UrlAssetImport.java

/**
 * Orders two files/renditions by how close their lower-cased names are to the given file
 * name, using the Levenshtein distance; the closer name sorts first.
 *
 * @param a
 *            the first candidate
 * @param b
 *            the second candidate
 * @param fileName
 *            the name to compare against; it is used as given
 *            (NOTE(review): presumably already lower case - verify against the caller)
 * @return a negative value, zero or a positive value if the name of {@code a} is closer
 *         to, as close to, or further from {@code fileName} than the name of {@code b}
 */
private int compareName(FileOrRendition a, FileOrRendition b, String fileName) {
    return Integer.compare(
            StringUtils.getLevenshteinDistance(a.getName().toLowerCase(), fileName),
            StringUtils.getLevenshteinDistance(b.getName().toLowerCase(), fileName));
}

From source file:edu.ku.brc.specify.toycode.mexconabio.MexConvToSQL.java

/**
 * Compares the current row of the given result set with the matching record of the
 * {@code conabio} table and writes one HTML table row containing both records, the
 * highlight class of every column and a similarity score (0-100 if no bonus applies).
 * <p>
 * The row is only processed if exactly one {@code conabio} record matches the criteria
 * selected by {@code type}; {@code numRecs} is incremented for every call.
 *
 * @param type
 *            selects the matching criteria: 0 = genus, species, collector number and
 *            catalog number are equal; 1 = collector number and catalog number are equal;
 *            2 = collector number is equal, catalog number differs; 3 = collector number
 *            differs, catalog number is equal
 * @param rs
 *            the result set positioned on the row to process; columns 1-5 and 9-13 are read
 * @param tblWriter
 *            the writer receiving the HTML table row
 * @param doingMapTable
 *            if true the ids of both records are additionally written with {@code mpStmt}
 * @throws SQLException
 *             if reading the result set or executing the map statement fails
 */
public void process(final int type, final ResultSet rs, final TableWriter tblWriter,
        final boolean doingMapTable) throws SQLException {
    Integer idEjemplar = rs.getInt(1);
    // NOTE(review): trim() throws a NullPointerException if column 2 or 3 is SQL NULL
    String catNum = rs.getString(2).trim();
    String collectorNum = rs.getString(3).trim();
    String collector = rs.getString(4);
    String genus = rs.getString(9);
    String species = rs.getString(10);
    String locality = rs.getString(5);

    int year = rs.getInt(11);
    int mon = rs.getInt(12);
    int day = rs.getInt(13);
    boolean hasDate = year > 0 && mon > 0 && day > 0;
    String dateStr = hasDate ? String.format("%04d-%02d-%02d", year, mon, day) : "&nbsp;";

    // NOTE(review): the values are inserted into the SQL text without escaping, a value
    // containing a single quote breaks the statement - a parameterized statement would be safer.
    // NOTE(review): there is no default case, for any other type 'where' stays null and
    // the literal "null" ends up in the statements below.
    String where = null;
    switch (type) {
    case 0:
        where = String.format(
                "FROM conabio WHERE GenusName = '%s' AND SpeciesName = '%s' AND CollNr = '%s' AND BarCD = '%s'",
                genus, species, collectorNum, catNum);
        break;

    case 1:
        where = String.format("FROM conabio WHERE CollNr = '%s' AND BarCD = '%s'", collectorNum, catNum);
        break;

    case 2:
        where = String.format("FROM conabio WHERE CollNr = '%s' AND BarCD <> '%s'", collectorNum, catNum);
        break;

    case 3:
        where = String.format("FROM conabio WHERE CollNr <> '%s' AND BarCD = '%s'", collectorNum, catNum);
        break;
    }

    String sql2 = "SELECT COUNT(*) " + where;
    int cnt = BasicSQLUtils.getCountAsInt(conn, sql2);

    // only unambiguous matches are compared
    if (cnt == 1) {
        // reset the highlight classes of both records (cCls: row of rs, mCls: conabio record)
        for (int i = 0; i < cCls.length; i++) {
            cCls[i] = null;
            mCls[i] = null;
        }

        if (doingMapTable) {
            // getCountAsInt is used here to read the single ID value
            sql2 = "SELECT ID " + where;
            int id = BasicSQLUtils.getCountAsInt(conn, sql2);

            mpStmt.setInt(1, idEjemplar);
            mpStmt.setInt(2, id);
            mpStmt.executeUpdate();
        }

        sql2 = "SELECT BarCD, CollNr, GenusName, SpeciesName, Collectoragent1, LocalityName, Datecollstandrd "
                + where;
        Vector<Object[]> rows = BasicSQLUtils.query(conn, sql2);
        Object[] cols = rows.get(0);
        Date date = (Date) cols[6];

        boolean michHasDate = date != null;
        String michDate = michHasDate ? sdf.format(date) : "&nbsp;";

        // NOTE(review): the casts and trim() calls below throw if BarCD, CollNr or GenusName is NULL
        Integer michCatNumInt = (Integer) cols[0];
        String michCatNum = michCatNumInt.toString();
        String michColNum = ((String) cols[1]).trim();
        String michGenus = ((String) cols[2]).trim();
        String michSpecies = (String) cols[3];
        String michCollector = (String) cols[4];
        String michLocality = (String) cols[5];

        int michYear = 0;
        int michMon = 0;
        int michDay = 0;
        if (date != null) {
            // michHasDate is always true inside this branch
            cal.setTime(date);
            michYear = michHasDate ? cal.get(Calendar.YEAR) : 0;
            // Calendar.MONTH is zero based
            michMon = michHasDate ? cal.get(Calendar.MONTH) + 1 : 0;
            michDay = michHasDate ? cal.get(Calendar.DATE) : 0;
        }

        // sum of the regular points: date 10+20+30, locality 10, collector 10, genus 15, species 20
        int maxScore = 115;
        int score = 0;

        // date: points per matching component, highlight by the number of leading matches
        if (hasDate && michHasDate) {
            score += year == michYear ? 10 : 0;
            score += mon == michMon ? 20 : 0;
            score += day == michDay ? 30 : 0;

            if (year == michYear && mon == michMon && day == michDay && year != 0 && mon != 0 && day != 0) {
                cCls[6] = BGR;
                mCls[6] = BGR;

            } else if (year == michYear && mon == michMon && year != 0 && mon != 0) {
                cCls[6] = GR;
                mCls[6] = GR;

            } else if (year == michYear) {
                cCls[6] = YW;
                mCls[6] = YW;
            }
        }

        // locality and collector: rated by check(..), three levels each
        double ratingLoc = check(locality, michLocality, false);
        double ratingColtr = check(collector, michCollector, false);

        if (ratingLoc > 50.0) {
            cCls[5] = BGR;
            mCls[5] = BGR;
            score += 10;

        } else if (ratingLoc > 30.0) {
            cCls[5] = GR;
            mCls[5] = GR;
            score += 6;

        } else if (ratingLoc > 0.0) {
            cCls[5] = YW;
            mCls[5] = YW;
            score += 3;
        }

        if (ratingColtr > 50.0) {
            cCls[4] = BGR;
            mCls[4] = BGR;
            score += 10;

        } else if (ratingColtr > 30.0) {
            cCls[4] = GR;
            mCls[4] = GR;
            score += 6;

        } else if (ratingColtr > 0.0) {
            cCls[4] = YW;
            mCls[4] = YW;
            score += 3;
        }

        // genus: full points if equal, reduced points if the edit distance is below 3
        boolean genusMatches = false;
        if (michGenus != null && genus != null) {
            if (michGenus.equals(genus)) {
                score += 15;
                cCls[2] = GR;
                mCls[2] = GR;
                genusMatches = true;

            } else if (StringUtils.getLevenshteinDistance(genus, michGenus) < 3) {
                score += 7;
                cCls[2] = YW;
                mCls[2] = YW;
            }
        }

        // species: same scheme; genus and species both equal get the strongest highlight
        if (michSpecies != null && species != null) {
            if (michSpecies.equals(species)) {
                score += 20;
                if (genusMatches) {
                    cCls[2] = BGR;
                    mCls[2] = BGR;
                    cCls[3] = BGR;
                    mCls[3] = BGR;
                } else {
                    cCls[3] = GR;
                    mCls[3] = GR;
                }
            } else if (StringUtils.getLevenshteinDistance(species, michSpecies) < 3) {
                score += 10;
                cCls[3] = YW;
                mCls[3] = YW;
            }
        }

        // genus and species stored in each other's column
        // NOTE(review): these points are not part of maxScore; combined with the points
        // above the final score may exceed 100 - confirm that histo is large enough
        if (michGenus != null && species != null && michGenus.equals(species)) {
            cCls[3] = DF;
            mCls[2] = DF;
            score += 10;
        }

        if (michSpecies != null && genus != null && michSpecies.equals(genus)) {
            cCls[2] = DF;
            mCls[3] = DF;
            score += 15;
        }

        // catalog and collector number are only highlighted, they do not add to the score
        if (catNum.equals(michCatNum)) {
            cCls[1] = BGR;
            mCls[1] = BGR;
        }

        if (collectorNum.equals(michColNum)) {
            cCls[0] = BGR;
            mCls[0] = BGR;
        }

        // percentage of maxScore, rounded to the nearest integer
        int finalScore = (int) ((((double) score / maxScore) * 100.0) + 0.5);
        histo[finalScore]++;
        totalScore += finalScore;

        String scoreStr = String.format("%d", finalScore);
        tblWriter.println("<TR>");

        // columns of the row read from rs
        tblWriter.logTDCls(cCls[0], collectorNum);
        tblWriter.logTDCls(cCls[1], catNum);
        tblWriter.logTDCls(cCls[2], genus);
        tblWriter.logTDCls(cCls[3], species);
        tblWriter.logTDCls(cCls[4], collector);
        tblWriter.logTDCls(cCls[5], locality);
        tblWriter.logTDCls(cCls[6], dateStr);

        // columns of the conabio record; the first two use cCls, which always holds the
        // same value as mCls for index 0 and 1 (both are only set together above)
        tblWriter.logTDCls(cCls[0], michColNum);
        tblWriter.logTDCls(cCls[1], michCatNum != null ? michCatNum.toString() : "&nbsp;");
        tblWriter.logTDCls(mCls[2], michGenus);
        tblWriter.logTDCls(mCls[3], michSpecies);
        tblWriter.logTDCls(mCls[4], michCollector);
        tblWriter.logTDCls(mCls[5], michLocality);
        tblWriter.logTDCls(mCls[6], michDate);

        tblWriter.logTDCls(null, scoreStr);
        tblWriter.println("</TR>");

        fndCnt++;
        System.out.println("Fnd: " + fndCnt + "  Num Recs: " + numRecs + "  Dif: " + (numRecs - fndCnt));
    }
    numRecs++;
}