org.apache.lucene.search.spell.WordBreakSpellChecker.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.search.spell.WordBreakSpellChecker.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.spell;

import java.io.IOException;
import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.Queue;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

/**
 * <p>
 * A spell checker whose sole function is to offer suggestions by combining
 * multiple terms into one word and/or breaking terms into multiple words.
 * </p>
 */
public class WordBreakSpellChecker {
    private int minSuggestionFrequency = 1;
    private int minBreakWordLength = 1;
    private int maxCombineWordLength = 20;
    private int maxChanges = 1;
    private int maxEvaluations = 1000;

    /** Term that can be used to prohibit adjacent terms from being combined */
    public static final Term SEPARATOR_TERM = new Term("", "");

    /** 
     * Creates a new spellchecker with default configuration values
     * @see #setMaxChanges(int)
     * @see #setMaxCombineWordLength(int)
     * @see #setMaxEvaluations(int)
     * @see #setMinBreakWordLength(int)
     * @see #setMinSuggestionFrequency(int)
     */
    public WordBreakSpellChecker() {
    }

    /**
     * <p>
     * Determines the order to list word break suggestions
     * </p>
     */
    public enum BreakSuggestionSortMethod {
        /**
         * <p>
         * Sort by Number of word breaks, then by the Sum of all the component
         * term's frequencies
         * </p>
         */
        NUM_CHANGES_THEN_SUMMED_FREQUENCY,
        /**
         * <p>
         * Sort by Number of word breaks, then by the Maximum of all the component
         * term's frequencies
         * </p>
         */
        NUM_CHANGES_THEN_MAX_FREQUENCY
    }

    /**
     * <p>
     * Generate suggestions by breaking the passed-in term into multiple words.
     * The scores returned are equal to the number of word breaks needed so a
     * lower score is generally preferred over a higher score.
     * </p>
     * 
     * @param suggestMode
     *          - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
     * @param sortMethod
     *          - default =
     *          {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
     * @return one or more arrays of words formed by breaking up the original term
     * @throws IOException If there is a low-level I/O error.
     */
    public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions, IndexReader ir, SuggestMode suggestMode,
            BreakSuggestionSortMethod sortMethod) throws IOException {
        if (maxSuggestions < 1) {
            return new SuggestWord[0][0];
        }
        if (suggestMode == null) {
            suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
        }
        if (sortMethod == null) {
            sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
        }

        int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
        Comparator<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY
                ? new LengthThenMaxFreqComparator()
                : new LengthThenSumFreqComparator();
        Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<>(queueInitialCapacity, queueComparator);

        int origFreq = ir.docFreq(term);
        if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
            return new SuggestWord[0][];
        }

        int useMinSuggestionFrequency = minSuggestionFrequency;
        if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
            useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
        }

        generateBreakUpSuggestions(term, ir, 1, maxSuggestions, useMinSuggestionFrequency, new SuggestWord[0],
                suggestions, 0, sortMethod);

        SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
        for (int i = suggestions.size() - 1; i >= 0; i--) {
            suggestionArray[i] = suggestions.remove().suggestWords;
        }

        return suggestionArray;
    }

    /**
     * <p>
     * Generate suggestions by combining one or more of the passed-in terms into
     * single words. The returned {@link CombineSuggestion} contains both a
     * {@link SuggestWord} and also an array detailing which passed-in terms were
     * involved in creating this combination. The scores returned are equal to the
     * number of word combinations needed, also one less than the length of the
     * array {@link CombineSuggestion#originalTermIndexes}. Generally, a
     * suggestion with a lower score is preferred over a higher score.
     * </p>
     * <p>
     * To prevent two adjacent terms from being combined (for instance, if one is
     * mandatory and the other is prohibited), separate the two terms with
     * {@link WordBreakSpellChecker#SEPARATOR_TERM}
     * </p>
     * <p>
     * When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
     * suggestion will include at least one term not in the index.
     * </p>
     * <p>
     * When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
     * suggestion will have the same, or better frequency than the most-popular
     * included term.
     * </p>
     * 
     * @return an array of words generated by combining original terms
     * @throws IOException If there is a low-level I/O error.
     */
    public CombineSuggestion[] suggestWordCombinations(Term[] terms, int maxSuggestions, IndexReader ir,
            SuggestMode suggestMode) throws IOException {
        if (maxSuggestions < 1) {
            return new CombineSuggestion[0];
        }

        int[] origFreqs = null;
        if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
            origFreqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                origFreqs[i] = ir.docFreq(terms[i]);
            }
        }

        int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
        Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
        Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<>(queueInitialCapacity, queueComparator);

        int thisTimeEvaluations = 0;
        for (int i = 0; i < terms.length - 1; i++) {
            if (terms[i].equals(SEPARATOR_TERM)) {
                continue;
            }
            String leftTermText = terms[i].text();
            int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
            if (leftTermLength > maxCombineWordLength) {
                continue;
            }
            int maxFreq = 0;
            int minFreq = Integer.MAX_VALUE;
            if (origFreqs != null) {
                maxFreq = origFreqs[i];
                minFreq = origFreqs[i];
            }
            String combinedTermText = leftTermText;
            int combinedLength = leftTermLength;
            for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
                if (terms[j].equals(SEPARATOR_TERM)) {
                    break;
                }
                String rightTermText = terms[j].text();
                int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
                combinedTermText += rightTermText;
                combinedLength += rightTermLength;
                if (combinedLength > maxCombineWordLength) {
                    break;
                }

                if (origFreqs != null) {
                    maxFreq = Math.max(maxFreq, origFreqs[j]);
                    minFreq = Math.min(minFreq, origFreqs[j]);
                }

                Term combinedTerm = new Term(terms[0].field(), combinedTermText);
                int combinedTermFreq = ir.docFreq(combinedTerm);

                if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || combinedTermFreq >= maxFreq) {
                    if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || minFreq == 0) {
                        if (combinedTermFreq >= minSuggestionFrequency) {
                            int[] origIndexes = new int[j - i + 1];
                            origIndexes[0] = i;
                            for (int k = 1; k < origIndexes.length; k++) {
                                origIndexes[k] = i + k;
                            }
                            SuggestWord word = new SuggestWord();
                            word.freq = combinedTermFreq;
                            word.score = origIndexes.length - 1;
                            word.string = combinedTerm.text();
                            CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
                                    new CombineSuggestion(word, origIndexes), (origIndexes.length - 1));
                            suggestions.offer(suggestion);
                            if (suggestions.size() > maxSuggestions) {
                                suggestions.poll();
                            }
                        }
                    }
                }
                thisTimeEvaluations++;
                if (thisTimeEvaluations == maxEvaluations) {
                    break;
                }
            }
        }
        CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions.size()];
        for (int i = suggestions.size() - 1; i >= 0; i--) {
            combineSuggestions[i] = suggestions.remove().combineSuggestion;
        }
        return combineSuggestions;
    }

    private int generateBreakUpSuggestions(Term term, IndexReader ir, int numberBreaks, int maxSuggestions,
            int useMinSuggestionFrequency, SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
            int totalEvaluations, BreakSuggestionSortMethod sortMethod) throws IOException {
        String termText = term.text();
        int termLength = termText.codePointCount(0, termText.length());
        int useMinBreakWordLength = minBreakWordLength;
        if (useMinBreakWordLength < 1) {
            useMinBreakWordLength = 1;
        }
        if (termLength < (useMinBreakWordLength * 2)) {
            return 0;
        }

        int thisTimeEvaluations = 0;
        for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
            int end = termText.offsetByCodePoints(0, i);
            String leftText = termText.substring(0, end);
            String rightText = termText.substring(end);
            SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);

            if (leftWord.freq >= useMinSuggestionFrequency) {
                SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
                if (rightWord.freq >= useMinSuggestionFrequency) {
                    SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
                            newSuggestion(prefix, leftWord, rightWord));
                    suggestions.offer(suggestion);
                    if (suggestions.size() > maxSuggestions) {
                        suggestions.poll();
                    }
                }
                int newNumberBreaks = numberBreaks + 1;
                if (newNumberBreaks <= maxChanges) {
                    int evaluations = generateBreakUpSuggestions(new Term(term.field(), rightWord.string), ir,
                            newNumberBreaks, maxSuggestions, useMinSuggestionFrequency, newPrefix(prefix, leftWord),
                            suggestions, totalEvaluations, sortMethod);
                    totalEvaluations += evaluations;
                }
            }

            thisTimeEvaluations++;
            totalEvaluations++;
            if (totalEvaluations >= maxEvaluations) {
                break;
            }
        }
        return thisTimeEvaluations;
    }

    private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
        SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
        System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
        newPrefix[newPrefix.length - 1] = append;
        return newPrefix;
    }

    private SuggestWord[] newSuggestion(SuggestWord[] prefix, SuggestWord append1, SuggestWord append2) {
        SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
        int score = prefix.length + 1;
        for (int i = 0; i < prefix.length; i++) {
            SuggestWord word = new SuggestWord();
            word.string = prefix[i].string;
            word.freq = prefix[i].freq;
            word.score = score;
            newSuggestion[i] = word;
        }
        append1.score = score;
        append2.score = score;
        newSuggestion[newSuggestion.length - 2] = append1;
        newSuggestion[newSuggestion.length - 1] = append2;
        return newSuggestion;
    }

    private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
        Term term = new Term(fieldname, text);
        int freq = ir.docFreq(term);
        SuggestWord word = new SuggestWord();
        word.freq = freq;
        word.score = 1;
        word.string = text;
        return word;
    }

    /**
     * Returns the minimum frequency a term must have
     * to be part of a suggestion.
     * @see #setMinSuggestionFrequency(int)
     */
    public int getMinSuggestionFrequency() {
        return minSuggestionFrequency;
    }

    /**
     * Returns the maximum length of a combined suggestion
     * @see #setMaxCombineWordLength(int)
     */
    public int getMaxCombineWordLength() {
        return maxCombineWordLength;
    }

    /**
     * Returns the minimum size of a broken word
     * @see #setMinBreakWordLength(int)
     */
    public int getMinBreakWordLength() {
        return minBreakWordLength;
    }

    /**
     * Returns the maximum number of changes to perform on the input
     * @see #setMaxChanges(int)
     */
    public int getMaxChanges() {
        return maxChanges;
    }

    /**
     * Returns the maximum number of word combinations to evaluate.
     * @see #setMaxEvaluations(int)
     */
    public int getMaxEvaluations() {
        return maxEvaluations;
    }

    /**
     * <p>
     * The minimum frequency a term must have to be included as part of a
     * suggestion. Default=1 Not applicable when used with
     * {@link SuggestMode#SUGGEST_MORE_POPULAR}
     * </p>
     * 
     * @see #getMinSuggestionFrequency()
     */
    public void setMinSuggestionFrequency(int minSuggestionFrequency) {
        this.minSuggestionFrequency = minSuggestionFrequency;
    }

    /**
     * <p>
     * The maximum length of a suggestion made by combining 1 or more original
     * terms. Default=20
     * </p>
     * 
     * @see #getMaxCombineWordLength()
     */
    public void setMaxCombineWordLength(int maxCombineWordLength) {
        this.maxCombineWordLength = maxCombineWordLength;
    }

    /**
     * <p>
     * The minimum length to break words down to. Default=1
     * </p>
     * 
     * @see #getMinBreakWordLength()
     */
    public void setMinBreakWordLength(int minBreakWordLength) {
        this.minBreakWordLength = minBreakWordLength;
    }

    /**
     * <p>
     * The maximum numbers of changes (word breaks or combinations) to make on the
     * original term(s). Default=1
     * </p>
     * 
     * @see #getMaxChanges()
     */
    public void setMaxChanges(int maxChanges) {
        this.maxChanges = maxChanges;
    }

    /**
     * <p>
     * The maximum number of word combinations to evaluate. Default=1000. A higher
     * value might improve result quality. A lower value might improve
     * performance.
     * </p>
     * 
     * @see #getMaxEvaluations()
     */
    public void setMaxEvaluations(int maxEvaluations) {
        this.maxEvaluations = maxEvaluations;
    }

    private static class LengthThenMaxFreqComparator implements Comparator<SuggestWordArrayWrapper> {
        @Override
        public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
            if (o1.suggestWords.length != o2.suggestWords.length) {
                return o2.suggestWords.length - o1.suggestWords.length;
            }
            if (o1.freqMax != o2.freqMax) {
                return o1.freqMax - o2.freqMax;
            }
            return 0;
        }
    }

    private static class LengthThenSumFreqComparator implements Comparator<SuggestWordArrayWrapper> {
        @Override
        public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
            if (o1.suggestWords.length != o2.suggestWords.length) {
                return o2.suggestWords.length - o1.suggestWords.length;
            }
            if (o1.freqSum != o2.freqSum) {
                return o1.freqSum - o2.freqSum;
            }
            return 0;
        }
    }

    private static class CombinationsThenFreqComparator implements Comparator<CombineSuggestionWrapper> {
        @Override
        public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
            if (o1.numCombinations != o2.numCombinations) {
                return o2.numCombinations - o1.numCombinations;
            }
            if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
                return o1.combineSuggestion.suggestion.freq - o2.combineSuggestion.suggestion.freq;
            }
            return 0;
        }
    }

    private static class SuggestWordArrayWrapper {
        final SuggestWord[] suggestWords;
        final int freqMax;
        final int freqSum;

        SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
            this.suggestWords = suggestWords;
            int aFreqSum = 0;
            int aFreqMax = 0;
            for (SuggestWord sw : suggestWords) {
                aFreqSum += sw.freq;
                aFreqMax = Math.max(aFreqMax, sw.freq);
            }
            this.freqSum = aFreqSum;
            this.freqMax = aFreqMax;
        }
    }

    private static class CombineSuggestionWrapper {
        final CombineSuggestion combineSuggestion;
        final int numCombinations;

        CombineSuggestionWrapper(CombineSuggestion combineSuggestion, int numCombinations) {
            this.combineSuggestion = combineSuggestion;
            this.numCombinations = numCombinations;
        }
    }
}