List of usage examples for org.apache.commons.lang3 StringUtils getLevenshteinDistance
public static int getLevenshteinDistance(CharSequence s, CharSequence t, final int threshold)
From source file:gov.nyc.doitt.gis.geoclient.service.search.policy.DefaultSimilarNamesPolicy.java
/**
 * Decides whether a proposed name is close enough to the original one.
 *
 * <p>Both names are passed through {@code clean(...)} and compared with the
 * threshold-bounded Levenshtein distance, using {@code similarNamesDistance} as
 * the threshold. The bounded variant reports a negative value once the threshold
 * is exceeded, so any non-negative result counts as "similar".
 *
 * @param original the reference name
 * @param proposed the candidate name
 * @return {@code true} when the bounded distance is non-negative
 */
@Override
public boolean isSimilarName(String original, String proposed) {
    final int boundedDistance =
            StringUtils.getLevenshteinDistance(clean(original), clean(proposed), similarNamesDistance);
    return !(boundedDistance < 0);
}
From source file:kenh.expl.functions.GetLevenshteinDistance.java
public int process(String str1, String str2, int threshold) { if (threshold < 0) threshold = 0;//www . j a v a2 s .com try { return StringUtils.getLevenshteinDistance(str1, str2, threshold); } catch (Exception e) { return 0; } }
From source file:de.knowwe.compile.correction.IncrementalTermReferenceCorrectionProvider.java
@Override public List<Suggestion> getSuggestions(TermCompiler compiler, Section<?> section, int threshold) { List<Suggestion> suggestions = new LinkedList<>(); if (!(section.get() instanceof SimpleReference)) { return suggestions; }// www . jav a2s . co m ReferenceManager terminology = IncrementalCompiler.getInstance().getTerminology(); if (terminology.isValid(new Identifier(section.getText()))) { // if reference is valid, no correction is proposed return suggestions; } Collection<Section<? extends SimpleDefinition>> defs = terminology.getAllTermDefinitions(); String originalText = section.getText(); for (Section<? extends SimpleDefinition> def : defs) { Identifier termIdentifier = KnowWEUtils.getTermIdentifier(def); String termIdentifierElement = termIdentifier.getLastPathElement(); String originalTextRegex = originalText.replace(" ", ".*"); /* levenstein test */ double score = StringUtils.getLevenshteinDistance(originalText, termIdentifierElement, threshold); if (score >= 0) { suggestions.add(new DefaultSuggestion(termIdentifierElement, (int) score)); } /* infix test */ else if (termIdentifierElement.matches(".*" + originalTextRegex + ".*")) { int infixScore = termIdentifierElement.length() - originalText.length(); suggestions.add(new DefaultSuggestion(termIdentifierElement, infixScore)); } } return suggestions; }
From source file:knowledgeMiner.mining.wikipedia.CategoryMembershipMiner.java
@Override protected void mineArticleInternal(MinedInformation info, int informationRequested, WMISocket wmi, OntologySocket ontology) throws Exception { int artID = info.getArticle(); String artTitle = wmi.getPageTitle(artID, true); Collection<Integer> categories = wmi.getArticleCategories(artID); for (Integer category : categories) { String categoryTitle = wmi.getPageTitle(category, true); // Remove the word 'stub(s)' categoryTitle = categoryTitle.replaceAll(" stubs?", ""); if (categoryTitle.equals(artTitle)) continue; // Special category parsing if (parseSpecial(categoryTitle, info, ontology, wmi)) continue; // Check article title similarity int result = StringUtils.getLevenshteinDistance(artTitle, categoryTitle, 3); if (result != -1) continue; // Treat the category as a chunk of text to be parsed String sentence = SentenceParserHeuristic.SENTENCE_PREFIX + categoryTitle + "."; miner_.mineSentence(sentence, false, info, this, ontology, wmi); }//from w w w.j ava 2 s. c om }
From source file:com.stratio.crossdata.core.utils.ParserUtils.java
/**
 * Get the best matches for a string {@code str} given a set of words to compare against and a
 * Levenshtein distance limit.
 *
 * <p>Only the words sharing the smallest distance to {@code str} are returned. A word qualifies
 * when its distance is strictly smaller than {@code maxDistance}.
 * NOTE(review): words at exactly {@code maxDistance} are excluded, as in the previous
 * implementation - confirm whether the bound was meant to be inclusive.
 *
 * @param str The word to get the matches for.
 * @param words The set of candidate words.
 * @param maxDistance The Levenshtein distance limit (exclusive); values {@code <= 0} yield no matches.
 * @return The candidate words closest to {@code str}; empty when none is below the limit.
 */
public static Set<String> getBestMatches(String str, Set<String> words, int maxDistance) {
    Set<String> result = new HashSet<>();
    if (maxDistance <= 0) {
        return result;
    }

    // Single pass: each distance is computed once, instead of once per candidate
    // limit as the previous widening loop did.
    int bestDistance = Integer.MAX_VALUE;
    for (String word : words) {
        int distance = StringUtils.getLevenshteinDistance(str, word, maxDistance);
        if (distance < 0 || distance >= maxDistance || distance > bestDistance) {
            continue;
        }
        if (distance < bestDistance) {
            // Strictly better match found: earlier, more distant matches no longer qualify.
            result.clear();
            bestDistance = distance;
        }
        result.add(word);
    }
    return result;
}
From source file:at.jps.sanction.core.util.TokenTool.java
/** * compare single token with list of tokens no modification of tokens is done * * @param text1//from ww w . j a v a2 s . c om * @param text2 * @param fuzzy * ( if true) LevenshteinDistance is used * @param minlen * @param fuzzyValue * ( 20 - 80 % ) * @return Percentage of of equality ( 0 - 100 %) */ public static int compareCheck(final String text1, final String text2, final boolean fuzzy, final int minlen, final double fuzzyValue) { int deltaValue = text1.length(); int minWordLen = deltaValue; int percentHitrate = 0; // +/- fuzzy string compare // only compare meaningfull !! deltalength > 80% // TODO: calc length to fuzzyness if (((text1.length() >= minlen) && (text2.length() >= minlen)) && (Math.abs(text1.length() - text2.length()) <= minlen)) { if (fuzzy) { minWordLen = Math.min(text1.length(), text2.length()); final int threshold = (int) (minWordLen * fuzzyValue) + 1; deltaValue = StringUtils.getLevenshteinDistance(text1, text2, threshold); if (deltaValue == -1) // threshold cutoff { deltaValue = minWordLen; } } else { if (text1.equalsIgnoreCase(text2)) { // TODO: ignoreCase ? deltaValue = 0; } } } percentHitrate = (int) ((1 - ((double) deltaValue / minWordLen)) * 100); // percentHitrate = 100 - ((100 / ((float) (text1.length()))) * deltaValue); // percentHitrate = 100 - ((100 / minWordLen) * deltaValue); return percentHitrate; }
From source file:org.easotope.client.rawdata.batchimport.ThreadedFileReader.java
/**
 * Reads and parses one raw data file and hands every acquisition it contains to the batch import
 * composite.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Load the file bytes; on failure show an error dialog on the display thread and stop.</li>
 *   <li>Parse the bytes; on a {@code RuntimeException} stop, showing the dialog only when
 *       {@code fileWasFound} is {@code false}.</li>
 *   <li>Remember the parsed time zone in {@code assumedTimeZone} if none was recorded yet.</li>
 *   <li>For each parsed acquisition build an {@code ImportedFile}, rank the entries of
 *       {@code globalSourceList} by Levenshtein distance to the file's identifiers, keep at most
 *       {@code MAX_COMBO_SIZE} of them, and pre-select the best one.</li>
 * </ol>
 *
 * @param file the file to import
 * @param fileWasFound when {@code true}, parse errors are not reported in a dialog
 */
private void processFile(File file, boolean fileWasFound) {
    byte[] fileBytes = null;

    try {
        fileBytes = FileReader.getBytesFromFile(file.getCanonicalPath());

    } catch (Exception e) {
        batchImportComposite.getDisplay().asyncExec(() -> {
            if (!batchImportComposite.isDisposed()) {
                Shell shell = batchImportComposite.getParent().getShell();
                MessageDialog.openError(shell, Messages.threadedFileReader_fileAddErrorTitle, e.getMessage());
            }
        });

        return;
    }

    RawFile rawFile = new RawFile();
    rawFile.setOriginalName(file.getName());

    ComputeAcquisitionParsed computeAcquisitionParsed = null;

    try {
        computeAcquisitionParsed = new ComputeAcquisitionParsed(rawFile, fileBytes, false, null);

    } catch (RuntimeException e) {
        if (!fileWasFound) {
            batchImportComposite.getDisplay().asyncExec(() -> {
                if (!batchImportComposite.isDisposed()) {
                    Shell shell = batchImportComposite.getParent().getShell();
                    MessageDialog.openError(shell, Messages.threadedFileReader_fileAddErrorTitle,
                            e.getMessage());
                }
            });
        }

        return;
    }

    if (assumedTimeZone == null) {
        assumedTimeZone = computeAcquisitionParsed.getAssumedTimeZone();
    }

    int acquisitionNumber = 0;

    for (AcquisitionParsedV2 acquisitionParsed : computeAcquisitionParsed.getMaps()) {
        ImportedFile importedFile = new ImportedFile();

        importedFile.setAssumedTimeZone(computeAcquisitionParsed.getAssumedTimeZone() != null);
        importedFile.setTimestamp(acquisitionParsed.getDate());
        importedFile.setAcquisitionNumber(acquisitionNumber++);
        importedFile.setFileName(file.getName());

        String id1 = (String) acquisitionParsed.getMisc().get(InputParameter.Identifier_1);
        String id2 = (String) acquisitionParsed.getMisc().get(InputParameter.Identifier_2);
        String sampleName = (String) acquisitionParsed.getMisc().get(InputParameter.Sample_Name);

        // Fall back to the sample name only when neither identifier is present.
        if (id1 != null || id2 != null) {
            importedFile.setIdentifier1(id1);
            importedFile.setIdentifier2(id2);
        } else {
            importedFile.setIdentifier1(sampleName);
        }

        // Each identifier's distance is scaled by the length of the OTHER identifier
        // (factor 1 when that identifier is missing or empty).
        String firstString = importedFile.getIdentifier1();
        int firstStringFactor = 1;

        if (firstString != null && firstString.length() != 0) {
            firstString = firstString.toUpperCase();
            firstStringFactor = firstString.length();
        }

        String secondString = importedFile.getIdentifier2();
        int secondStringFactor = 1;

        if (secondString != null && secondString.length() != 0) {
            secondString = secondString.toUpperCase();
            secondStringFactor = secondString.length();
        }

        ArrayList<SourceListItem> sourceList = null;

        if (globalSourceList != null) {
            ArrayList<SourceListItemSorter> comboItems = new ArrayList<SourceListItemSorter>();

            for (SourceListItem sourceListItem : globalSourceList) {
                int levenshteinDistance = -1;
                SourceListItemSorter sorter = null;

                if (firstString != null && firstString.length() != 0) {
                    // A threshold cutoff (-1) stays negative after scaling because the factor is >= 1.
                    int distance = StringUtils.getLevenshteinDistance(firstString,
                            sourceListItem.getSourceNameToUpper(), MAX_LEVENSHTEIN_DISTANCE) * secondStringFactor;

                    if (distance >= 0 && (levenshteinDistance == -1 || distance < levenshteinDistance)) {
                        levenshteinDistance = distance;
                        sorter = new SourceListItemSorter(levenshteinDistance, sourceListItem);
                    }
                }

                if (secondString != null && secondString.length() != 0) {
                    int distance = StringUtils.getLevenshteinDistance(secondString,
                            sourceListItem.getSourceNameToUpper(), MAX_LEVENSHTEIN_DISTANCE) * firstStringFactor;

                    if (distance >= 0 && (levenshteinDistance == -1 || distance < levenshteinDistance)) {
                        levenshteinDistance = distance;
                        sorter = new SourceListItemSorter(levenshteinDistance, sourceListItem);
                    }
                }

                if (levenshteinDistance >= 0) {
                    comboItems.add(sorter);
                }
            }

            Collections.sort(comboItems);

            sourceList = new ArrayList<SourceListItem>();

            for (SourceListItemSorter sourceListItemSorter : comboItems) {
                sourceList.add(sourceListItemSorter.getSourceListItem());

                if (sourceList.size() == MAX_COMBO_SIZE) {
                    break;
                }
            }
        }

        importedFile.setSourceList(sourceList);

        // sourceList remains null when no global source list is available; without
        // the null check the size() call below threw a NullPointerException.
        if (sourceList != null && sourceList.size() != 0) {
            importedFile.setSampleId(sourceList.get(0).getSampleId());
            importedFile.setStandardId(sourceList.get(0).getStandardId());
        } else {
            importedFile.setSampleId(DatabaseConstants.EMPTY_DB_ID);
            importedFile.setStandardId(DatabaseConstants.EMPTY_DB_ID);
        }

        batchImportComposite.getDisplay().asyncExec(() -> {
            if (!batchImportComposite.isDisposed()) {
                batchImportComposite.addImportedFile(importedFile);
            }
        });
    }
}
From source file:org.jahia.tools.maven.plugins.LegalArtifactAggregator.java
/**
 * Finds the known license whose text is closest to the given license file, measured by the
 * Levenshtein edit distance between the two license texts.
 *
 * <p>Every text variant of every known license is compared. Variants whose distance exceeds
 * {@code EDIT_DISTANCE_THRESHOLD} are ignored, so it is possible that no license matches - which
 * is the desired outcome when the file is not actually a known license. On equal distances the
 * license encountered first wins.
 *
 * @param licenseFile the license we want to match against the known licenses.
 * @return the closest known license, or {@code null} when none is within the threshold
 */
public KnownLicense findClosestMatchingKnownLicense(LicenseFile licenseFile) {
    KnownLicense bestMatch = null;
    int bestDistance = Integer.MAX_VALUE;

    for (KnownLicense candidate : knownLicenses.getLicenses().values()) {
        for (TextVariant variant : candidate.getTextVariants()) {
            final int distance = StringUtils.getLevenshteinDistance(variant.getText(), licenseFile.getText(),
                    EDIT_DISTANCE_THRESHOLD);

            // Negative: beyond the threshold. Not smaller: no improvement over the current best.
            if (distance < 0 || distance >= bestDistance) {
                continue;
            }
            bestDistance = distance;
            bestMatch = candidate;
        }
    }

    return bestMatch;
}
From source file:qa.experiment.ProcessFeatureVector.java
/**
 * Looks for the first name in {@code str1} that fuzzily matches any name in {@code str2}.
 *
 * <p>Two names match when, after trimming, the candidate starts with the first three characters
 * of the name (or the whole name when it is shorter) and their Levenshtein distance is at most 3.
 * {@code null} and blank entries are skipped.
 *
 * @param str1 names to test, in priority order
 * @param str2 names to compare against
 * @return the index in {@code str1} of the first matching name, or {@code -1} when none matches
 */
public int isNameFuzzyMatch(String[] str1, String[] str2) {
    for (int i = 0; i < str1.length; i++) {
        if (str1[i] == null) {
            continue;
        }
        String name = str1[i].trim();
        if (name.isEmpty()) {
            continue;
        }

        // Take the prefix from the trimmed name and bound it by the name's length:
        // substring(0, 3) on the raw entry threw for names shorter than three
        // characters and could never match when the entry had leading whitespace.
        String prefix = name.substring(0, Math.min(3, name.length()));

        for (int j = 0; j < str2.length; j++) {
            if (str2[j] == null) {
                continue;
            }
            String candidate = str2[j].trim();

            // Cheap prefix test first; the bounded distance returns -1 beyond 3 edits.
            if (candidate.startsWith(prefix) && StringUtils.getLevenshteinDistance(name, candidate, 3) != -1) {
                return i;
            }
        }
    }
    return -1;
}