Example usage for the org.apache.commons.text.similarity.JaroWinklerDistance constructor

List of usage examples for the org.apache.commons.text.similarity.JaroWinklerDistance constructor

Introduction

On this page you can find example usage of the JaroWinklerDistance constructor of org.apache.commons.text.similarity.JaroWinklerDistance.

Prototype

JaroWinklerDistance

Source Link

Usage

From source file: org.languagetool.rules.spelling.suggestions.SuggestionsOrdererFeatureExtractor.java

/**
 * Compute features for training or prediction of a ranking model for suggestions.
 * <p>
 * Only the first {@code topN} suggestions are considered; a non-positive {@code topN} means
 * all of them. The {@code topN} field itself is never modified, so the limit chosen for one
 * call does not carry over to later calls. Unless {@code score} equals {@code "noop"}, the
 * candidates are sorted with {@code Feature::compareTo} before being returned.
 *
 * @param suggestions correction candidates, in their original order
 * @param word the word each candidate is compared against (edit distance, Jaro-Winkler)
 * @param sentence sentence handed to the 3-gram probability lookup
 * @param startPos position handed to the 3-gram probability lookup together with
 *        {@code sentence} (presumably the start of {@code word} in the sentence - confirm
 *        against {@code LanguageModelUtils})
 * @return correction candidates (each carrying its own feature data) and the features for
 *         the match in general (currently only {@code candidateCount}); both parts are
 *         empty when {@code suggestions} is empty
 */
public Pair<List<SuggestedReplacement>, SortedMap<String, Float>> computeFeatures(List<String> suggestions,
        String word, AnalyzedSentence sentence, int startPos) {
    if (suggestions.isEmpty()) {
        return Pair.of(Collections.emptyList(), Collections.emptySortedMap());
    }
    // Resolve the limit locally: writing it back into the topN field would pin the field to
    // this call's suggestion count and silently truncate later, longer suggestion lists.
    int limit = topN <= 0 ? suggestions.size() : Math.min(suggestions.size(), topN);
    List<String> topSuggestions = suggestions.subList(0, limit);

    EditDistance levenshteinDistance = new EditDistance(word, EditDistance.DistanceAlgorithm.Damerau);
    SimilarityScore<Double> jaroWinklerDistance = new JaroWinklerDistance();
    List<Feature> features = new ArrayList<>(topSuggestions.size());

    for (String candidate : topSuggestions) {
        double prob1 = languageModel.getPseudoProbability(Collections.singletonList(candidate)).getProb();
        double prob3 = LanguageModelUtils.get3gramProbabilityFor(language, languageModel, startPos, sentence,
                candidate);
        long wordCount = ((BaseLanguageModel) languageModel).getCount(candidate);
        // second argument is presumably the maximum edit distance of interest - confirm in EditDistance
        int levenshteinDist = levenshteinDistance.compare(candidate, 3);
        double jaroWinklerDist = jaroWinklerDistance.apply(word, candidate);
        DetailedDamerauLevenstheinDistance.Distance detailedDistance = DetailedDamerauLevenstheinDistance
                .compare(word, candidate);

        features.add(new Feature(prob1, prob3, wordCount, levenshteinDist, detailedDistance, jaroWinklerDist,
                candidate));
    }
    // "noop" keeps the candidates in the order they were passed in
    if (!"noop".equals(score)) {
        features.sort(Feature::compareTo);
    }

    // compute general features, not tied to candidates
    SortedMap<String, Float> matchData = new TreeMap<>();
    matchData.put("candidateCount", (float) features.size());

    List<SuggestedReplacement> suggestionsData = features.stream().map(f -> {
        SuggestedReplacement s = new SuggestedReplacement(f.getWord());
        s.setFeatures(f.getData());
        return s;
    }).collect(Collectors.toList());
    return Pair.of(suggestionsData, matchData);
}