org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester.java Source code

Introduction

Here is the source code for org.dice.solrenhancements.spellchecker.DiceMultipleCaseSuggester.java, a Solr spell checker that looks up suggestions for the original, title-case, lower-case and upper-case forms of each token and merges the results by frequency.

Source

package org.dice.solrenhancements.spellchecker;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SH: This doesn't do anything different from the Solr source; it currently exists just to test the suggester
* functionality and to see why it fails in certain scenarios.
*/

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.HighFrequencyDictionary;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.suggest.FileDictionary;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.spelling.SolrSpellChecker;
import org.apache.solr.spelling.SpellingOptions;
import org.apache.solr.spelling.SpellingResult;
import org.apache.solr.spelling.suggest.LookupFactory;
import org.apache.solr.spelling.suggest.fst.FSTLookupFactory;
import org.apache.solr.spelling.suggest.jaspell.JaspellLookupFactory;
import org.apache.solr.spelling.suggest.tst.TSTLookupFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;

public class DiceMultipleCaseSuggester extends SolrSpellChecker {
    private static final Logger LOG = LoggerFactory.getLogger(DiceMultipleCaseSuggester.class);

    /** Location of the source data - either a path to a file, or null for the
     * current IndexReader.
     */
    public static final String LOCATION = "sourceLocation";

    public static final String SUGGESTION_ANALYZER_FIELDTYPE = "suggestionAnalyzerFieldTypeName";

    /** Fully-qualified class of the {@link Lookup} implementation. */
    public static final String LOOKUP_IMPL = "lookupImpl";
    /**
     * Minimum frequency of terms to consider when building the dictionary.
     */
    public static final String THRESHOLD_TOKEN_FREQUENCY = "threshold";
    /**
     * Location where the dictionary is persisted. If this location is relative, the data is stored
     * under the core's dataDir. If this is null, persistence is disabled.
     */
    public static final String STORE_DIR = "storeDir";

    protected String sourceLocation;
    protected File storeDir;
    protected float threshold;
    protected Dictionary dictionary;
    protected IndexReader reader;
    protected Lookup lookup;
    protected String lookupImpl;
    protected SolrCore core;

    private LookupFactory factory;

    private Analyzer suggestionAnalyzer = null;
    private String suggestionAnalyzerFieldTypeName = null;

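    /**
     * Reads the spellchecker configuration: resolves the suggestion analyzer from the named field type,
     * instantiates the {@link LookupFactory} behind lookupImpl, and, if a storeDir is configured and
     * already exists, attempts to load previously persisted lookup data.
     */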
    @Override
    public String init(NamedList config, SolrCore core) {
        LOG.info("init: " + config);
        String name = super.init(config, core);
        threshold = config.get(THRESHOLD_TOKEN_FREQUENCY) == null ? 0.0f
                : (Float) config.get(THRESHOLD_TOKEN_FREQUENCY);
        sourceLocation = (String) config.get(LOCATION);
        lookupImpl = (String) config.get(LOOKUP_IMPL);

        IndexSchema schema = core.getLatestSchema();
        suggestionAnalyzerFieldTypeName = (String) config.get(SUGGESTION_ANALYZER_FIELDTYPE);
        if (schema.getFieldTypes().containsKey(suggestionAnalyzerFieldTypeName)) {
            FieldType fieldType = schema.getFieldTypes().get(suggestionAnalyzerFieldTypeName);
            suggestionAnalyzer = fieldType.getQueryAnalyzer();
        }

        // support the old classnames without -Factory for config file backwards compatibility.
        if (lookupImpl == null || "org.apache.solr.spelling.suggest.jaspell.JaspellLookup".equals(lookupImpl)) {
            lookupImpl = JaspellLookupFactory.class.getName();
        } else if ("org.apache.solr.spelling.suggest.tst.TSTLookup".equals(lookupImpl)) {
            lookupImpl = TSTLookupFactory.class.getName();
        } else if ("org.apache.solr.spelling.suggest.fst.FSTLookup".equals(lookupImpl)) {
            lookupImpl = FSTLookupFactory.class.getName();
        }

        factory = core.getResourceLoader().newInstance(lookupImpl, LookupFactory.class);

        lookup = factory.create(config, core);
        String store = (String) config.get(STORE_DIR);
        if (store != null) {
            storeDir = new File(store);
            if (!storeDir.isAbsolute()) {
                storeDir = new File(core.getDataDir() + File.separator + storeDir);
            }
            if (!storeDir.exists()) {
                storeDir.mkdirs();
            } else {
                // attempt reload of the stored lookup
                try {
                    lookup.load(new FileInputStream(new File(storeDir, factory.storeFileName())));
                } catch (IOException e) {
                    LOG.warn("Loading stored lookup data failed", e);
                }
            }
        }
        return name;
    }

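    /**
     * Builds the lookup dictionary, either from the current index (via {@link HighFrequencyDictionary})
     * or from one or more source files (comma-separated in sourceLocation), and persists the result to
     * storeDir when one is configured.
     */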
    @Override
    public void build(SolrCore core, SolrIndexSearcher searcher) throws IOException {
        LOG.info("build()");
        if (sourceLocation == null) {
            reader = searcher.getIndexReader();
            dictionary = new HighFrequencyDictionary(reader, field, threshold);
        } else {
            try {

                final String fileDelim = ",";
                if (sourceLocation.contains(fileDelim)) {
                    String[] files = sourceLocation.split(fileDelim);
                    Reader[] readers = new Reader[files.length];
                    for (int i = 0; i < files.length; i++) {
                        Reader reader = new InputStreamReader(core.getResourceLoader().openResource(files[i]),
                                IOUtils.CHARSET_UTF_8);
                        readers[i] = reader;
                    }
                    dictionary = new MultipleFileDictionary(readers);
                } else {
                    dictionary = new FileDictionary(new InputStreamReader(
                            core.getResourceLoader().openResource(sourceLocation), IOUtils.CHARSET_UTF_8));
                }
            } catch (UnsupportedEncodingException e) {
                // should not happen
                LOG.error("should not happen", e);
            }
        }

        lookup.build(dictionary);
        if (storeDir != null) {
            File target = new File(storeDir, factory.storeFileName());
            if (!lookup.store(new FileOutputStream(target))) {
                if (sourceLocation == null) {
                    assert reader != null && field != null;
                    LOG.error("Storing lookup data built from the index on field: " + field + " failed; reader has "
                            + reader.maxDoc() + " docs");
                } else {
                    LOG.error("Storing lookup data built from sourceLocation: " + sourceLocation + " failed");
                }
            } else {
                LOG.info("Stored suggest data to: " + target.getAbsolutePath());
            }
        }
    }

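    /**
     * Attempts to reload persisted lookup data from storeDir (e.g. on a firstSearcher event); if nothing
     * has been stored or loading fails, falls back to a full {@link #build(SolrCore, SolrIndexSearcher)}.
     */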
    @Override
    public void reload(SolrCore core, SolrIndexSearcher searcher) throws IOException {
        LOG.info("reload()");
        if (dictionary == null && storeDir != null) {
            // this may be a firstSearcher event, try loading it
            FileInputStream is = new FileInputStream(new File(storeDir, factory.storeFileName()));
            try {
                if (lookup.load(is)) {
                    return; // loaded ok
                }
            } finally {
                IOUtils.closeWhileHandlingException(is);
            }
            LOG.debug("load failed, need to build Lookup again");
        }
        // loading was unsuccessful - build it again
        build(core, searcher);
    }

    private static final SpellingResult EMPTY_RESULT = new SpellingResult();

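    /**
     * Looks up suggestions for each token in its original, title-case, lower-case and upper-case forms,
     * merges the results case-insensitively (keeping the casing of the most frequent variant), and orders
     * them alphabetically or by combined frequency depending on the suggest mode.
     */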
    @Override
    public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
        LOG.debug("getSuggestions: " + options.tokens);
        if (lookup == null) {
            LOG.info("Lookup is null - invoke spellchecker.build first");
            return EMPTY_RESULT;
        }

        SpellingResult res = new SpellingResult();
        for (Token currentToken : options.tokens) {
            String tokenText = currentToken.toString();

            // we need to ensure that we combine matches for different cases, and take the most common
            // where multiple case versions exist
            final Hashtable<String, LookupResult> htSuggestions = new Hashtable<String, LookupResult>();
            final Hashtable<String, Integer> htSuggestionCounts = new Hashtable<String, Integer>();

            List<Token> tokensToTry = new ArrayList<Token>();
            tokensToTry.add(currentToken);
            tokensToTry.add(newToken(currentToken, toTitleCase(tokenText)));
            tokensToTry.add(newToken(currentToken, tokenText.toLowerCase()));
            tokensToTry.add(newToken(currentToken, tokenText.toUpperCase()));

            for (Token newToken : tokensToTry) {
                // skip case variants whose text is identical to the original token
                if (newToken.toString().equals(tokenText) && newToken != currentToken) {
                    continue;
                }
                List<LookupResult> tmpSuggestions = getLookupResults(options, newToken);
                if (tmpSuggestions != null) {
                    for (LookupResult lu : tmpSuggestions) {
                        final String key = lu.key.toString().toLowerCase();
                        LookupResult existing = htSuggestions.get(key);
                        if (existing != null) {
                            // replace if more frequent
                            if (lu.value > existing.value) {
                                htSuggestions.put(key, lu);
                            }
                            htSuggestionCounts.put(key, htSuggestionCounts.get(key) + (int) lu.value);
                        } else {
                            htSuggestions.put(key, lu);
                            htSuggestionCounts.put(key, (int) lu.value);
                        }
                    }
                }
            }

            List<String> suggestions = new ArrayList<String>(htSuggestions.keySet());
            if (options.suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) {
                Collections.sort(suggestions);
            } else {
                Collections.sort(suggestions, new Comparator<String>() {
                    @Override
                    public int compare(String sug1, String sug2) {
                        // sort by combined frequency, most frequent first (overflow-safe comparison)
                        int sug1Count = htSuggestionCounts.get(sug1);
                        int sug2Count = htSuggestionCounts.get(sug2);
                        return Integer.compare(sug2Count, sug1Count);
                    }
                });
            }

            for (String match : suggestions) {
                LookupResult lr = htSuggestions.get(match);
                res.add(currentToken, lr.key.toString(), (int) lr.value);
            }

        }
        return res;
    }

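    /** Creates a copy of an existing token with the same offsets and type but different text. */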
    private Token newToken(Token existing, String newText) {
        return new Token(newText, existing.startOffset(), existing.endOffset(), existing.type());
    }

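    /** Upper-cases the first character and lower-cases the rest, e.g. "jAVA" becomes "Java". */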
    private String toTitleCase(String s) {
        if (s.length() == 0) {
            return s;
        }
        if (s.length() == 1) {
            return s.toUpperCase();
        }
        return String.valueOf(s.charAt(0)).toUpperCase() + s.substring(1).toLowerCase();
    }

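    /**
     * Runs the underlying {@link Lookup} for a single token and returns up to options.count results,
     * or null if there are no matches.
     */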
    private List<LookupResult> getLookupResults(SpellingOptions options, Token currentToken) {
        CharsRef scratch = new CharsRef();
        scratch.chars = currentToken.buffer();
        scratch.offset = 0;
        scratch.length = currentToken.length();
        boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
                && !(lookup instanceof WFSTCompletionLookup) && !(lookup instanceof AnalyzingSuggester);

        List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count);
        if (suggestions == null || suggestions.size() == 0) {
            return null;
        }

        return suggestions;
    }

    private boolean isStringNullOrEmpty(String s) {
        return s == null || s.length() == 0;
    }

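    /**
     * Passes a suggestion through the configured suggestion analyzer and returns the first token it
     * produces, or null if analysis fails or yields nothing.
     */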
    private String getAnalyzerResult(String suggestion) {
        TokenStream ts = null;
        try {
            Reader reader = new StringReader(suggestion);
            ts = this.suggestionAnalyzer.tokenStream("", reader);

            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                String word = termAtt.toString();
                if (word != null && word.length() > 0) {
                    return word;
                }
            }
        } catch (Exception ex) {
            if (this.field != null) {
                LOG.error(String.format("Error executing analyzer for field: %s in DiceSuggester on suggestion: %s",
                        this.field, suggestion), ex);
            } else if (this.fieldTypeName != null) {
                LOG.error(String.format(
                        "Error executing analyzer for field type: %s in DiceSuggester on suggestion: %s",
                        this.fieldTypeName, suggestion), ex);
            }
        } finally {
            if (ts != null) {
                IOUtils.closeWhileHandlingException(ts);
            }
        }
        return null;
    }
}