Example usage for org.apache.commons.text.similarity SimilarityScore apply

List of usage examples for org.apache.commons.text.similarity SimilarityScore apply

Introduction

On this page you can find example usage for the apply method of org.apache.commons.text.similarity.SimilarityScore.

Prototype

R apply(CharSequence left, CharSequence right);

Source Link

Document

Compares two CharSequences.

Usage

From source file: org.languagetool.rules.spelling.suggestions.SuggestionsOrdererFeatureExtractor.java

/**
 * Computes features for training or prediction of a ranking model for suggestions.
 *
 * <p>Only the leading {@code topN} suggestions are considered; when {@code topN} is not
 * positive, every suggestion is used for this call. For each considered candidate the
 * method collects language model probabilities (unigram and 3-gram), the language model
 * count, an edit distance to {@code word}, a Jaro-Winkler score and a detailed
 * Damerau-Levenshtein distance. Unless {@code score} equals {@code "noop"}, the candidates
 * are then sorted using {@code Feature::compareTo}.
 *
 * @param suggestions correction candidates, in the order they should be truncated
 * @param word the word the candidates are compared against
 * @param sentence the sentence passed to the 3-gram probability lookup
 * @param startPos position passed to the 3-gram probability lookup
 *        (presumably the start of {@code word} in {@code sentence} - confirm against caller)
 * @return correction candidates, features for the match in general, features specific to candidates;
 *         both parts are empty when {@code suggestions} is empty
 */
public Pair<List<SuggestedReplacement>, SortedMap<String, Float>> computeFeatures(List<String> suggestions,
        String word, AnalyzedSentence sentence, int startPos) {
    if (suggestions.isEmpty()) {
        return Pair.of(Collections.emptyList(), Collections.emptySortedMap());
    }
    // Resolve the limit locally: writing the fallback back into the topN field would
    // permanently cap every later call at the size of the first suggestion list seen.
    int limit = topN <= 0 ? suggestions.size() : topN;
    List<String> topSuggestions = suggestions.subList(0, Math.min(suggestions.size(), limit));

    EditDistance editDistance = new EditDistance(word, EditDistance.DistanceAlgorithm.Damerau);
    SimilarityScore<Double> jaroWinklerDistance = new JaroWinklerDistance();
    List<Feature> features = new ArrayList<>(topSuggestions.size());

    for (String candidate : topSuggestions) {
        double prob1 = languageModel.getPseudoProbability(Collections.singletonList(candidate)).getProb();
        double prob3 = LanguageModelUtils.get3gramProbabilityFor(language, languageModel, startPos, sentence,
                candidate);
        long wordCount = ((BaseLanguageModel) languageModel).getCount(candidate);
        // NOTE(review): the second argument (3) looks like a maximum distance - confirm against EditDistance.
        int editDist = editDistance.compare(candidate, 3);
        double jaroWinklerDist = jaroWinklerDistance.apply(word, candidate);
        DetailedDamerauLevenstheinDistance.Distance detailedDistance = DetailedDamerauLevenstheinDistance
                .compare(word, candidate);

        features.add(new Feature(prob1, prob3, wordCount, editDist, detailedDistance, jaroWinklerDist,
                candidate));
    }
    // "noop" keeps the candidates in their incoming order.
    if (!"noop".equals(score)) {
        features.sort(Feature::compareTo);
    }

    // compute general features, not tied to candidates
    SortedMap<String, Float> matchData = new TreeMap<>();
    matchData.put("candidateCount", (float) features.size());

    List<SuggestedReplacement> suggestionsData = features.stream().map(f -> {
        SuggestedReplacement s = new SuggestedReplacement(f.getWord());
        s.setFeatures(f.getData());
        return s;
    }).collect(Collectors.toList());
    return Pair.of(suggestionsData, matchData);
}