Usage examples for org.apache.commons.lang.StringUtils.getLevenshteinDistance
public static int getLevenshteinDistance(String s, String t)
Find the Levenshtein distance between two Strings.
From source file:io.insideout.stanbol.enhancer.nlp.freeling.Freeling.java
/**
 * Creates the Freeling service: scans a configuration directory for language
 * configuration files, then sets up one analyzer resource pool per detected
 * language and (if configured) a language identification resource pool.
 *
 * @param configurationPath
 *            path of the directory holding the configuration files
 * @param configurationFilenameSuffix
 *            only files whose name ends with this suffix are read
 * @param freelingSharePath
 *            passed to the {@code AnalyzerFactory}; also used to resolve a
 *            {@code LangIdentFile} value that starts with {@code $FREELING}
 * @param freelingLibPath
 *            passed to the analyzer and language identification factories
 * @param locale
 *            passed to the analyzer and language identification factories
 * @param maxInitThreads
 *            size of the initialisation thread pool; values {@code <= 0}
 *            select {@code DEFAULT_CONCURRENT_THREADS}
 * @param poolSize
 *            passed as first argument to every created {@code ResourcePool}
 * @param minQueueSize
 *            passed as second argument to every created {@code ResourcePool}
 * @throws IllegalArgumentException
 *             if {@code configurationPath} does not denote a directory
 */
// The cast of FileUtils.listFiles(..) to Collection<File> below is unchecked.
@SuppressWarnings("unchecked")
public Freeling(final String configurationPath, final String configurationFilenameSuffix,
        final String freelingSharePath, final String freelingLibPath, final String locale,
        final int maxInitThreads, final int poolSize, final int minQueueSize) {
    //determine the supported languages
    File configDir = new File(configurationPath);
    if (!configDir.isDirectory()) {
        throw new IllegalArgumentException(
                "The parsed configDirectory '" + configDir + "' is not a directory!");
    }
    log.info("Reading Freeling Configuration from Directory: {}", configDir);
    // language -> configuration file chosen for that language
    Map<String, File> supportedLanguages = new HashMap<String, File>();
    // configuration of the language identification component (first one found wins)
    String langIdConfigFile = null;
    // NOTE(review): this check repeats the one above, which already threw for
    // anything that is not a directory.
    if (configDir.isDirectory()) {
        for (File confFile : (Collection<File>) FileUtils.listFiles(configDir,
                new SuffixFileFilter(configurationFilenameSuffix), null)) {
            Properties prop = new Properties();
            InputStream in = null;
            try {
                in = new FileInputStream(confFile);
                prop.load(in);
                String lang = prop.getProperty("Lang");
                String langIdentFileName = prop.getProperty("LangIdentFile");
                if (lang != null) { // a "Lang" property marks a language (analyzer) configuration
                    File existing = supportedLanguages.get(lang);
                    if (existing == null) {
                        log.info(" ... adding language '{}' with config {}", lang, confFile);
                        supportedLanguages.put(lang, confFile);
                    } else { //two configs for the same language
                        //take the one that is more similar to the language name
                        // eld: distance for the already registered file,
                        // cld: distance for the file currently processed
                        int eld = StringUtils.getLevenshteinDistance(lang,
                                FilenameUtils.getBaseName(existing.getName()));
                        int cld = StringUtils.getLevenshteinDistance(lang,
                                FilenameUtils.getBaseName(confFile.getName()));
                        if (cld < eld) {
                            log.info(" ... setting language '{}' to config {}", lang, confFile);
                            supportedLanguages.put(lang, confFile);
                        }
                    }
                } else if (langIdentFileName != null) {
                    if (langIdentFileName.startsWith("$FREELING")) {
                        // Replace everything up to the first separator with the share path.
                        // NOTE(review): uses the platform separator; if the property value
                        // uses '/' on a platform where File.separatorChar is '\\', indexOf
                        // returns -1 and the "$FREELING" prefix is kept - confirm.
                        langIdentFileName = FilenameUtils.concat(freelingSharePath,
                                langIdentFileName.substring(langIdentFileName.indexOf(File.separatorChar) + 1));
                    }
                    if (langIdConfigFile != null) {
                        log.warn(
                                "Multiple LanguageIdentification configuration files. "
                                        + "Keep using '{}' and ignore '{}'!",
                                langIdConfigFile, langIdentFileName);
                    } else {
                        log.info(" ... setting language identification config to '{}'", langIdentFileName);
                        langIdConfigFile = langIdentFileName;
                    }
                }
            } catch (IOException e) {
                // an unreadable file is logged and skipped; remaining files are still processed
                log.error("Unable to read configuration file " + confFile, e);
            } finally {
                IOUtils.closeQuietly(in);
            }
        }
    }
    //init the ThreadPool used to create Freeling components
    //this is mainly needed for being able to ensure that only one Freeling
    //component is created at a time. This may be necessary in some
    //environment to avoid random crashes.
    freelingInitThreadPool = Executors
            .newFixedThreadPool(maxInitThreads <= 0 ? DEFAULT_CONCURRENT_THREADS : maxInitThreads);
    //Init the Analyzers
    if (supportedLanguages.isEmpty()) {
        log.warn("The parsed configDirectory '{}' does not contain any valid "
                + "language configuration (*.{}) files!", configDir, configurationFilenameSuffix);
    } else {
        AnalyzerFactory analyzerFactory = new AnalyzerFactory(freelingLibPath, freelingSharePath, locale,
                freelingInitThreadPool);
        //now init the ResourcePool(s)
        log.info("init ResourcePools (size: " + poolSize + ")");
        for (Entry<String, File> supported : supportedLanguages.entrySet()) {
            // per-language context handed to the pool: language code and its config file
            Map<String, Object> context = new HashMap<String, Object>();
            context.put(AnalyzerFactory.PROPERTY_LANGUAGE, supported.getKey());
            context.put(AnalyzerFactory.PROPERTY_CONFIG_FILE, supported.getValue());
            log.debug(" ... create ResourcePool for {}", context);
            analyzerPools.put(supported.getKey(),
                    new ResourcePool<Analyzer>(poolSize, minQueueSize, analyzerFactory, context));
        }
    }
    if (langIdConfigFile == null) {
        log.warn(
                "The parsed configDirectory '{}' does not contain the "
                        + "Language Identification Component configuration (a *.{}) file! "
                        + "Language Identification Service will not ba available.",
                configDir, configurationFilenameSuffix);
        // no language identification: the pool field is explicitly left null
        langIdPool = null;
    } else {
        LangIdFactory langIdFactory = new LangIdFactory(freelingLibPath, langIdConfigFile, locale,
                freelingInitThreadPool);
        //Finally init the language identifier resource pool
        langIdPool = new ResourcePool<LanguageIdentifier>(poolSize, minQueueSize, langIdFactory, null);
    }
}
From source file:com.puppetlabs.geppetto.pp.dsl.contentassist.PPProposalsGenerator.java
/** * Attempts to produce a list of names that are close to the given name. At most 5 proposals * are generated. The returned proposals are made in order of "pronunciation distance" which is * obtained by taking the Levenshtein distance between the Double Monophone encodings of * candidate and given name. Candidates are selected as the names with shortest Levenshtein distance * and names that are Monophonically equal, or starts or ends monophonically. * /*from ww w . ja v a 2 s . c om*/ * @param currentName * the name for which proposals are to be generated * @param descs * the descriptors of available named values * @param searchPath * TODO * @param types * if stated, the wanted types of named values * @return * array of proposals, possibly empty, but never null. */ public String[] computeProposals(final String currentName, Collection<IEObjectDescription> descs, boolean upperCaseProposals, PPSearchPath searchPath, EClass... types) { if (currentName == null || currentName.length() < 1) return new String[0]; // compute the 5 best matches and only accept if score <= 5 ScoreKeeper<IEObjectDescription> tracker = new ScoreKeeper<IEObjectDescription>(5, false, 5); // List<IEObjectDescription> metaphoneAlike = Lists.newArrayList(); final DoubleMetaphone encoder = new DoubleMetaphone(); final String metaphoneName = encoder.encode(currentName); for (IEObjectDescription d : descs) { EClass c = d.getEClass(); typeok: if (types != null && types.length > 0) { for (EClass wanted : types) if ((wanted == c || wanted.isSuperTypeOf(c))) break typeok; continue; } // filter based on path visibility if (searchPath.searchIndexOf(d) == -1) continue; // not visible according to path String candidateName = converter.toString(d.getName()); tracker.addScore(StringUtils.getLevenshteinDistance(currentName, candidateName), d); String candidateMetaphone = encoder.encode(candidateName); // metaphone matches are scored on the pronounciation distance if (metaphoneName.equals(candidateMetaphone) // 
|| candidateMetaphone.startsWith(metaphoneName) // || candidateMetaphone.endsWith(metaphoneName) // ) tracker.addScore(StringUtils.getLevenshteinDistance(metaphoneName, candidateMetaphone), d); // System.err.printf("Metaphone alike: %s == %s\n", currentName, candidateName); } List<String> result = Lists.newArrayList(); // System.err.print("Scores = "); for (ScoreEntry<IEObjectDescription> entry : tracker.getScoreEntries()) { String s = converter.toString(entry.getData().getName()); result.add(s); // System.err.printf("%d %s, ", entry.getScore(), s); } // System.err.println(); String[] proposals = result.toArray(new String[result.size()]); PronunciationComparator x = new PronunciationComparator(encoder, metaphoneName); Arrays.sort(proposals, x); // System.err.print("Order = "); // for(int i = 0; i < proposals.length; i++) // System.err.printf("%s, ", proposals[i]); // System.err.println(); return upperCaseProposals ? toUpperCaseProposals(proposals) : proposals; }
From source file:com.att.ajsc.csilogging.util.UtilLib.java
/**
 * Selects the registered endpoint that best corresponds to a request path and
 * returns it prefixed with {@code "/" + componentType}.
 * <p>
 * Selection order as implemented below: (1) endpoints with the same number of
 * '/'-separated segments as {@code arrLength} whose segments 1..arrLength-1 all
 * satisfy {@code compareValues}; (2) if that leaves no single result, the
 * candidate accepted by the {@code AntPathMatcher}; (3) otherwise the endpoint
 * recorded for the smallest Levenshtein distance to {@code pathInfo}.
 * <p>
 * NOTE(review): as shown, {@code endpointSet} is never assigned (the code that
 * did so is commented out), so the first loop throws a
 * {@code NullPointerException} on every call. The endpoint source must be
 * restored before this method can work.
 *
 * @param pathinfoArr
 *            the request path split into segments; compared from index 1 on
 * @param arrLength
 *            number of segments an endpoint must have to be a candidate
 * @param componentType
 *            inserted between the leading "/" and the selected endpoint
 * @param pathInfo
 *            the request path as a single string
 * @return {@code "/" + componentType + <selected endpoint>}
 */
public static String getInput(String pathinfoArr[], int arrLength, String componentType, String pathInfo) {
    Set<String> endpointSet = null;
    /*
     * if (componentType.equalsIgnoreCase("rest")) { endpointSet =
     * DME2Helper.restletEndpointSet; } else { endpointSet =
     * DME2Helper.serviceEndpointSet; }
     */
    // endpoints whose segment count equals arrLength
    HashSet<String> setBasedArrLenth = new HashSet<String>();
    // Levenshtein distance -> endpoint recorded when that distance became the minimum
    HashMap setBasedCharMap = new HashMap();
    // endpoints whose segments all passed compareValues
    HashSet<String> setBasedValues = new HashSet<String>();
    AntPathMatcher pathMatcher = new AntPathMatcher();
    String inputBasedonLength[];
    // smallest distance seen so far; 0 doubles as "nothing seen yet"
    // NOTE(review): because of that sentinel, an exact match (distance 0) is
    // overwritten by the next endpoint that is iterated - confirm intent.
    int globalvalue = 0;
    for (String s : endpointSet) {
        int dif = StringUtils.getLevenshteinDistance(pathInfo, s);
        if (globalvalue == 0 || globalvalue > dif) {
            globalvalue = dif;
            setBasedCharMap.put(globalvalue, s);
        }
        inputBasedonLength = s.split("\\/");
        int i = inputBasedonLength.length;
        if (arrLength == i) {
            setBasedArrLenth.add(s);
        }
    }
    String inputBasedOnValues[];
    for (String s1 : setBasedArrLenth) {
        inputBasedOnValues = s1.split("\\/");
        // start at index 1: index 0 is not compared
        // NOTE(review): reads index 1 before checking arrLength, so arrLength <= 1
        // would index past the end of a one-element array - confirm callers.
        int j = 1;
        while (compareValues(pathinfoArr[j], inputBasedOnValues[j])) {
            j++;
            if (j >= arrLength) {
                break;
            }
        }
        // j reaches arrLength only when every compared segment matched
        if (j == arrLength) {
            setBasedValues.add(s1);
        }
    }
    String input = "";
    if (setBasedValues.size() == 1) {
        for (String s2 : setBasedValues) {
            input = s2;
        }
    } else {
        // several (or no) candidates: the last one accepted by the matcher wins
        // NOTE(review): Spring's AntPathMatcher.match takes (pattern, path); here
        // pathInfo is passed in the pattern position - confirm this is intended.
        for (String s2 : setBasedValues) {
            if (pathMatcher.match(pathInfo, s2)) {
                input = s2;
            }
        }
    }
    if (input.isEmpty()) {
        // fall back to the endpoint with the smallest edit distance; this is null
        // when nothing was recorded, which yields the text "null" in the result
        input = (String) setBasedCharMap.get(globalvalue);
    }
    return "/" + componentType + input;
}
From source file:bookChapter.theoretical.AnalyzeTheoreticalMSMSCalculation.java
private static String printInfo(HashSet<Identify> theBestAndromedaResults, HashSet<Identify> theBestSEQUESTResults) { /// bw.write("Score" + "\t" + "LevenshteinDistance" + "\t" + "Sequence" + "\t" + "Charge" + "\t" + "Correct" + "\t" + "Spec" + "\n"); String result = ""; HashMap<String, Integer> results_and_levDist = new HashMap<String, Integer>(); for (Identify andromeda : theBestAndromedaResults) { for (Identify sequest : theBestSEQUESTResults) { double observedMass = CalculateObservedMass.calculateMass(andromeda.getSpectrum()); int levenshteinDistance = StringUtils.getLevenshteinDistance(andromeda.getPeptide().getSequence(), sequest.getPeptide().getSequence()); result = andromeda.getSpectrum().getSpectrumTitle() + "\t" + andromeda.getSpectrum().getPrecursor().getMz() + "\t" + andromeda.getSpectrum().getPrecursor().getPossibleChargesAsString() + "\t" + observedMass + "\t" + andromeda.getScore() + "\t" + sequest.getScore() + "\t" + andromeda.getPeptide().getSequence() + "\t" + sequest.getPeptide().getSequence() + "\t" + levenshteinDistance + "\t" + sequest.getTotalScoredPeps() + "\t" + andromeda.isIsCorrectMatch() + "\t" + sequest.isIsCorrectMatch() + "\n"; results_and_levDist.put(result, levenshteinDistance); }// w ww.j a v a 2 s. com } int maxDist = Integer.MAX_VALUE, dist = 0; for (String res : results_and_levDist.keySet()) { dist = results_and_levDist.get(res); if (dist < maxDist) { result = res; maxDist = dist; } } return result; }
From source file:com.example.app.support.address.SpellingCorrector.java
/**
 * Computes a similarity ratio for two strings: 1 minus the Levenshtein distance divided by
 * the length of the longer string. Identical strings yield {@code 1}, strings that share
 * nothing yield {@code 0}.
 *
 * @param s the first string, must not be {@code null}
 * @param t the second string, must not be {@code null}
 * @return the similarity in the range {@code [0, 1]}; {@code 1} when both strings are empty
 */
private static float getNormalizedSimilarity(String s, String t) {
    // Computed first so that invalid (null) input fails exactly as before.
    int distance = StringUtils.getLevenshteinDistance(s, t);
    int longest = Math.max(s.length(), t.length());
    if (longest == 0) {
        // Two empty strings are identical; without this guard 0 / 0f evaluates to NaN.
        return 1f;
    }
    return 1f - distance / (float) longest;
}
From source file:eu.scape_project.bitwiser.utils.SSDeep.java
static int scoreStrings(char[] s1, char[] s2, int blockSize) { int score = 0; int len1, len2; len1 = s1.length;//from w w w .j a v a 2 s . c om len2 = s2.length; if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) { /* not a real spamsum signature? */ return 0; } /* the two strings must have a common substring of length ROLLING_WINDOW to be candidates */ /* if (has_common_substring(s1, s2) == 0) { return 0; } */ /* compute the edit distance between the two strings. The edit distance gives us a pretty good idea of how closely related the two strings are */ score = StringUtils.getLevenshteinDistance(new String(s1), new String(s2)); /* scale the edit distance by the lengths of the two strings. This changes the score to be a measure of the proportion of the message that has changed rather than an absolute quantity. It also copes with the variability of the string lengths. */ score = (score * SPAMSUM_LENGTH) / (len1 + len2); /* at this stage the score occurs roughly on a 0-64 scale, * with 0 being a good match and 64 being a complete * mismatch */ /* rescale to a 0-100 scale (friendlier to humans) */ score = (100 * score) / 64; /* it is possible to get a score above 100 here, but it is a really terrible match */ if (score >= 100) { return 0; } /* now re-scale on a 0-100 scale with 0 being a poor match and 100 being a excellent match. */ score = 100 - score; // printf ("len1: %"PRIu32" len2: %"PRIu32"\n", len1, len2); /* when the blocksize is small we don't want to exaggerate the match size */ if (score > blockSize / MIN_BLOCKSIZE * Math.min(len1, len2)) { score = blockSize / MIN_BLOCKSIZE * Math.min(len1, len2); } return score; }
From source file:com.github.bfour.fpliteraturecollector.service.DefaultLiteratureService.java
/** * Determines whether a pair of two Literatures maybe is a duplicate pair. * Employs comparative measures that might lead to false positives. * //from w w w. j av a2 s. c o m * @param litA * @param litB * @return */ private boolean isProbableDuplicate(Literature litA, Literature litB) { if (isCertainDuplicate(litA, litB)) return true; // 1 character different for every 14 characters if (StringUtils.getLevenshteinDistance( Normalizer.normalize(litA.getTitle(), Normalizer.Form.NFD).toLowerCase(), Normalizer.normalize(litB.getTitle(), Normalizer.Form.NFD) .toLowerCase()) <= (litA.getTitle().length() / 14)) return true; if (litA.getISBN() != null && litB.getISBN() != null && litA.getISBN().equals(litB.getISBN())) return true; return false; }
From source file:de.janrieke.contractmanager.ext.hibiscus.UmsatzImportListDialog.java
/**
 * Computes a case-insensitive, length-adjusted Levenshtein distance in the range
 * {@code [0, 1]}.
 * <p>
 * The edits that are unavoidable because of the length difference are subtracted from the
 * distance, the remainder is divided by the length of the longer string, and the fourth
 * root of that ratio is returned.
 *
 * @param a the first string, must not be {@code null}
 * @param b the second string, must not be {@code null}
 * @return the relative distance; {@code 0} when both strings are empty
 */
private float getRelativeLevenshteinDistance(String a, String b) {
    String lowerA = a.toLowerCase();
    String lowerB = b.toLowerCase();

    // Measure the same strings the distance is computed on: lower-casing can change the
    // length of a string for a few characters.
    int maxLength = Math.max(lowerA.length(), lowerB.length());
    if (maxLength == 0) {
        // Two empty strings are identical; without this guard 0f / 0f evaluates to NaN.
        return 0f;
    }
    int minLength = Math.min(lowerA.length(), lowerB.length());

    int distance = StringUtils.getLevenshteinDistance(lowerA, lowerB);
    int maxDistance = maxLength;
    // at least this many edits are needed just to make the lengths equal
    int minDistance = maxLength - minLength;

    float ratio = ((float) (distance - minDistance)) / ((float) maxDistance);
    return (float) Math.sqrt(Math.sqrt(ratio));
}
From source file:com.adobe.acs.commons.mcp.impl.processes.asset.UrlAssetImport.java
/**
 * Orders two entries by how close their lower-cased names are to {@code fileName},
 * measured by Levenshtein distance; the closer name sorts first.
 *
 * @param a the first entry
 * @param b the second entry
 * @param fileName the name both entries are measured against (used as given)
 * @return a negative value, zero or a positive value if {@code a} is closer than, as close
 *         as or farther than {@code b}
 */
private int compareName(FileOrRendition a, FileOrRendition b, String fileName) {
    return Integer.compare(
            StringUtils.getLevenshteinDistance(a.getName().toLowerCase(), fileName),
            StringUtils.getLevenshteinDistance(b.getName().toLowerCase(), fileName));
}
From source file:edu.ku.brc.specify.toycode.mexconabio.MexConvToSQL.java
/**
 * Looks up the row of table {@code conabio} that corresponds to the current row of
 * {@code rs} and, if exactly one such row exists, compares both records field by field,
 * writes one {@code <TR>} row with both records and a percentage score to
 * {@code tblWriter}, and updates the histogram and counters of this instance.
 * {@code numRecs} is incremented on every call, {@code fndCnt} only when a row was written.
 *
 * @param type
 *            selects the WHERE clause: 0 = genus, species, collector number and bar code
 *            all equal; 1 = collector number and bar code equal; 2 = collector number
 *            equal, bar code different; 3 = collector number different, bar code equal.
 *            For any other value no clause is built and the text "null" ends up in the SQL.
 * @param rs
 *            positioned on the record to process; columns 1-5 and 9-13 are read
 * @param tblWriter
 *            receives the generated table row
 * @param doingMapTable
 *            if {@code true} the id pair (column 1 of {@code rs}, ID of the matched row) is
 *            additionally written through {@code mpStmt}
 * @throws SQLException
 *             if reading from {@code rs} or executing {@code mpStmt} fails
 */
public void process(final int type, final ResultSet rs, final TableWriter tblWriter,
        final boolean doingMapTable) throws SQLException {
    Integer idEjemplar = rs.getInt(1);
    String catNum = rs.getString(2).trim();
    String collectorNum = rs.getString(3).trim();
    String collector = rs.getString(4);
    String genus = rs.getString(9);
    String species = rs.getString(10);
    String locality = rs.getString(5);

    int year = rs.getInt(11);
    int mon = rs.getInt(12);
    int day = rs.getInt(13);
    // a date counts as present only when all three parts are positive
    boolean hasDate = year > 0 && mon > 0 && day > 0;
    String dateStr = hasDate ? String.format("%04d-%02d-%02d", year, mon, day) : " ";

    // NOTE(review): the clauses below embed data values directly into the SQL text; a
    // value containing a single quote breaks the statement - consider bound parameters.
    String where = null;
    switch (type) {
    case 0:
        where = String.format(
                "FROM conabio WHERE GenusName = '%s' AND SpeciesName = '%s' AND CollNr = '%s' AND BarCD = '%s'",
                genus, species, collectorNum, catNum);
        break;
    case 1:
        where = String.format("FROM conabio WHERE CollNr = '%s' AND BarCD = '%s'", collectorNum, catNum);
        break;
    case 2:
        where = String.format("FROM conabio WHERE CollNr = '%s' AND BarCD <> '%s'", collectorNum, catNum);
        break;
    case 3:
        where = String.format("FROM conabio WHERE CollNr <> '%s' AND BarCD = '%s'", collectorNum, catNum);
        break;
    }

    String sql2 = "SELECT COUNT(*) " + where;
    int cnt = BasicSQLUtils.getCountAsInt(conn, sql2);

    // only an unambiguous (single) match is compared and reported
    if (cnt == 1) {
        // reset the per-column cell classes of both records
        for (int i = 0; i < cCls.length; i++) {
            cCls[i] = null;
            mCls[i] = null;
        }

        if (doingMapTable) {
            // getCountAsInt is used here to fetch the single ID value of the matched row
            sql2 = "SELECT ID " + where;
            int id = BasicSQLUtils.getCountAsInt(conn, sql2);
            mpStmt.setInt(1, idEjemplar);
            mpStmt.setInt(2, id);
            mpStmt.executeUpdate();
        }

        sql2 = "SELECT BarCD, CollNr, GenusName, SpeciesName, Collectoragent1, LocalityName, Datecollstandrd "
                + where;
        Vector<Object[]> rows = BasicSQLUtils.query(conn, sql2);
        Object[] cols = rows.get(0);
        Date date = (Date) cols[6];

        boolean michHasDate = date != null;
        String michDate = michHasDate ? sdf.format(date) : " ";

        Integer michCatNumInt = (Integer) cols[0];
        String michCatNum = michCatNumInt.toString();
        String michColNum = ((String) cols[1]).trim();
        String michGenus = ((String) cols[2]).trim();
        String michSpecies = (String) cols[3];
        String michCollector = (String) cols[4];
        String michLocality = (String) cols[5];

        int michYear = 0;
        int michMon = 0;
        int michDay = 0;
        if (date != null) {
            cal.setTime(date);
            michYear = michHasDate ? cal.get(Calendar.YEAR) : 0;
            // Calendar.MONTH is zero-based, hence the + 1
            michMon = michHasDate ? cal.get(Calendar.MONTH) + 1 : 0;
            michDay = michHasDate ? cal.get(Calendar.DATE) : 0;
        }

        // 115 = 60 (date) + 10 (locality) + 10 (collector) + 15 (genus) + 20 (species)
        int maxScore = 115;
        int score = 0;

        // date: year 10, month 20, day 30 points; column 6 is classed by how much matches
        if (hasDate && michHasDate) {
            score += year == michYear ? 10 : 0;
            score += mon == michMon ? 20 : 0;
            score += day == michDay ? 30 : 0;

            if (year == michYear && mon == michMon && day == michDay && year != 0 && mon != 0 && day != 0) {
                cCls[6] = BGR;
                mCls[6] = BGR;
            } else if (year == michYear && mon == michMon && year != 0 && mon != 0) {
                cCls[6] = GR;
                mCls[6] = GR;
            } else if (year == michYear) {
                cCls[6] = YW;
                mCls[6] = YW;
            }
        }

        double ratingLoc = check(locality, michLocality, false);
        double ratingColtr = check(collector, michCollector, false);

        // locality (column 5): 10 / 6 / 3 points for a rating above 50 / 30 / 0
        if (ratingLoc > 50.0) {
            cCls[5] = BGR;
            mCls[5] = BGR;
            score += 10;
        } else if (ratingLoc > 30.0) {
            cCls[5] = GR;
            mCls[5] = GR;
            score += 6;
        } else if (ratingLoc > 0.0) {
            cCls[5] = YW;
            mCls[5] = YW;
            score += 3;
        }

        // collector (column 4): same thresholds and points as locality
        if (ratingColtr > 50.0) {
            cCls[4] = BGR;
            mCls[4] = BGR;
            score += 10;
        } else if (ratingColtr > 30.0) {
            cCls[4] = GR;
            mCls[4] = GR;
            score += 6;
        } else if (ratingColtr > 0.0) {
            cCls[4] = YW;
            mCls[4] = YW;
            score += 3;
        }

        // genus (column 2): 15 points when equal, 7 when fewer than 3 edits apart
        boolean genusMatches = false;
        if (michGenus != null && genus != null) {
            if (michGenus.equals(genus)) {
                score += 15;
                cCls[2] = GR;
                mCls[2] = GR;
                genusMatches = true;
            } else if (StringUtils.getLevenshteinDistance(genus, michGenus) < 3) {
                score += 7;
                cCls[2] = YW;
                mCls[2] = YW;
            }
        }

        // species (column 3): 20 points when equal, 10 when fewer than 3 edits apart;
        // genus and species both equal upgrades both columns to BGR
        if (michSpecies != null && species != null) {
            if (michSpecies.equals(species)) {
                score += 20;
                if (genusMatches) {
                    cCls[2] = BGR;
                    mCls[2] = BGR;
                    cCls[3] = BGR;
                    mCls[3] = BGR;
                } else {
                    cCls[3] = GR;
                    mCls[3] = GR;
                }
            } else if (StringUtils.getLevenshteinDistance(species, michSpecies) < 3) {
                score += 10;
                cCls[3] = YW;
                mCls[3] = YW;
            }
        }

        // crossed values: the genus of one record equals the species of the other
        // NOTE(review): these bonuses are not part of maxScore, so score can exceed 115
        // and finalScore can exceed 100 - confirm histo is large enough for that index.
        if (michGenus != null && species != null && michGenus.equals(species)) {
            cCls[3] = DF;
            mCls[2] = DF;
            score += 10;
        }

        if (michSpecies != null && genus != null && michSpecies.equals(genus)) {
            cCls[2] = DF;
            mCls[3] = DF;
            score += 15;
        }

        // catalog and collector numbers only affect the cell class, not the score
        if (catNum.equals(michCatNum)) {
            cCls[1] = BGR;
            mCls[1] = BGR;
        }

        if (collectorNum.equals(michColNum)) {
            cCls[0] = BGR;
            mCls[0] = BGR;
        }

        // percentage of maxScore, rounded to the nearest integer
        int finalScore = (int) ((((double) score / maxScore) * 100.0) + 0.5);
        histo[finalScore]++;
        totalScore += finalScore;

        String scoreStr = String.format("%d", finalScore);

        // one row: the seven fields of the rs record, the seven fields of the matched
        // row, then the score
        tblWriter.println("<TR>");
        tblWriter.logTDCls(cCls[0], collectorNum);
        tblWriter.logTDCls(cCls[1], catNum);
        tblWriter.logTDCls(cCls[2], genus);
        tblWriter.logTDCls(cCls[3], species);
        tblWriter.logTDCls(cCls[4], collector);
        tblWriter.logTDCls(cCls[5], locality);
        tblWriter.logTDCls(cCls[6], dateStr);

        // cCls[0] / cCls[1] are used for the matched row too; they always hold the same
        // value as mCls[0] / mCls[1] (set together above)
        tblWriter.logTDCls(cCls[0], michColNum);
        tblWriter.logTDCls(cCls[1], michCatNum != null ? michCatNum.toString() : " ");
        tblWriter.logTDCls(mCls[2], michGenus);
        tblWriter.logTDCls(mCls[3], michSpecies);
        tblWriter.logTDCls(mCls[4], michCollector);
        tblWriter.logTDCls(mCls[5], michLocality);
        tblWriter.logTDCls(mCls[6], michDate);

        tblWriter.logTDCls(null, scoreStr);
        tblWriter.println("</TR>");

        fndCnt++;
        System.out.println("Fnd: " + fndCnt + " Num Recs: " + numRecs + " Dif: " + (numRecs - fndCnt));
    }
    numRecs++;
}