com.searchbox.SuggesterComponent.java Source code

Java tutorial

Introduction

Here is the source code for com.searchbox.SuggesterComponent.java

Source

/*******************************************************************************
 * Copyright Searchbox - http://www.searchbox.com
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.searchbox;

import com.searchbox.SuggestionResultSet.SuggestionResult;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrEventListener;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * 
 * @author andrew
 */
public class SuggesterComponent extends SearchComponent implements SolrCoreAware, SolrEventListener {

    private static Logger LOGGER = LoggerFactory.getLogger(SuggesterComponent.class);
    protected NamedList initParams;
    protected File storeDir;
    protected String storeDirname;
    protected Boolean buildOnOptimize = false;
    protected Boolean buildOnCommit = false;
    protected Integer ngrams;
    protected Integer minDocFreq;
    protected Integer minTermFreq;
    protected Integer maxNumDocs;
    protected String nonpruneFileName;
    protected ResourceLoader resouceloader;
    protected String stopWordFile;
    volatile long numRequests;
    volatile long numErrors;
    volatile long totalBuildTime;
    volatile long totalRequestsTime;
    volatile String lastbuildDate;

    SuggesterTreeHolder suggester;
    protected String gfields[];
    private List<String> stopwords = new ArrayList<String>();

    @Override
    // standard loading of options from config file, discussed in documentation
    public void init(NamedList args) {
        LOGGER.debug(("Hit init"));

        super.init(args);
        this.initParams = args;

        buildOnOptimize = Boolean.parseBoolean((String) args.get(SuggesterComponentParams.BUILD_ON_OPTIMIZE));
        if (buildOnOptimize == null) {
            buildOnOptimize = Boolean.parseBoolean(SuggesterComponentParams.BUILD_ON_OPTIMIZE_DEFAULT);
        }

        buildOnCommit = Boolean.parseBoolean((String) args.get(SuggesterComponentParams.BUILD_ON_COMMIT));
        if (buildOnCommit == null) {
            buildOnCommit = Boolean.parseBoolean(SuggesterComponentParams.BUILD_ON_COMMIT_DEFAULT);
        }

        storeDirname = (String) args.get(SuggesterComponentParams.STOREDIR);
        if (storeDirname == null) {
            storeDirname = SuggesterComponentParams.STOREDIR_DEFAULT;
        }

        stopWordFile = (String) args.get(SuggesterComponentParams.STOP_WORD_LOCATION);
        if (stopWordFile == null) {
            stopWordFile = SuggesterComponentParams.STOP_WORD_LOCATION_DEFAULT;
        }

        nonpruneFileName = (String) args.get(SuggesterComponentParams.NONPRUNEFILE);

        ngrams = (Integer) args.get(SuggesterComponentParams.NGRAMS);
        if (ngrams == null) {
            ngrams = Integer.parseInt(SuggesterComponentParams.NGRAMS_DEFAULT);
        }

        minDocFreq = (Integer) args.get(SuggesterComponentParams.MINDOCFREQ);
        if (minDocFreq == null) {
            minDocFreq = SuggesterComponentParams.MINDOCFREQ_DEFAULT;
        }

        minTermFreq = (Integer) args.get(SuggesterComponentParams.MINTERMFREQ);
        if (minTermFreq == null) {
            minTermFreq = SuggesterComponentParams.MINTERMFREQ_DEFAULT;
        }

        maxNumDocs = (Integer) args.get(SuggesterComponentParams.MAXNUMDOCS);
        if (maxNumDocs == null) {
            maxNumDocs = SuggesterComponentParams.MAXNUMDOCS_DEFAULT;
        }

        NamedList fields = ((NamedList) args.get(SuggesterComponentParams.FIELDS));
        if (fields == null) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Need to specify at least one field");
        }

        gfields = (String[]) fields.getAll(SuggesterComponentParams.FIELD).toArray(new String[0]);
        if (gfields == null) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Need to specify at least one field");
        }

        LOGGER.debug("maxNumDocs is " + maxNumDocs);
        LOGGER.debug("minDocFreq is " + minDocFreq);
        LOGGER.debug("minTermFreq is " + minTermFreq);
        LOGGER.debug("buildOnCommit is " + buildOnCommit);
        LOGGER.debug("buildOnOptimize is " + buildOnOptimize);
        LOGGER.debug("storeDirname is " + storeDirname);
        LOGGER.debug("Ngrams is " + ngrams);
        LOGGER.debug("Fields is " + gfields);
        LOGGER.debug("Nonprune file is " + nonpruneFileName);

    }

    @Override
    public void prepare(ResponseBuilder rb) throws IOException {
        // none necessary
    }

    @Override
    // actually do the request
    public void process(ResponseBuilder rb) throws IOException {
        LOGGER.trace(("Hit process"));
        SolrParams params = rb.req.getParams();
        // see what fields we should be using for query
        String[] fields = params.getParams(SuggesterComponentParams.FIELDS + "." + SuggesterComponentParams.FIELD);

        if (fields == null) {

            fields = gfields;
        } else {
            for (String field : fields) {
                LOGGER.info("Using overrode fields:" + field);

            }
        }
        boolean build = params.getBool(SuggesterComponentParams.PRODUCT_NAME + "." + SuggesterComponentParams.BUILD,
                false);
        SolrIndexSearcher searcher = rb.req.getSearcher();
        // request has requested rebuilding of the dictionary
        if (build) {
            long lstartTime = System.currentTimeMillis();
            buildAndWrite(searcher, fields);
            totalBuildTime += System.currentTimeMillis() - lstartTime;
            lastbuildDate = new Date().toString();
        }

        if (suggester == null) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
                    "Model for SBsuggester not created, create using sbsuggester.build=true");
        }
        String query = params.get(SuggesterComponentParams.PRODUCT_NAME + "." + SuggesterComponentParams.QUERY,
                params.get(CommonParams.Q));
        LOGGER.debug("Query:\t" + query);
        if (query == null) {
            LOGGER.warn("No query, returning..maybe was just used for  building index?");
            numErrors++;
            return;
        }

        long lstartTime = System.currentTimeMillis();
        numRequests++;

        // maximum number of phrases to look though
        int maxPhraseSearch = params.getInt(
                SuggesterComponentParams.PRODUCT_NAME + "." + SuggesterComponentParams.MAX_PHRASE_SEARCH,
                SuggesterComponentParams.MAX_PHRASE_SEARCH_DEFAULT);
        LOGGER.debug("maxPhraseSearch:\t" + maxPhraseSearch);
        SuggestionResultSet suggestions = suggester.getSuggestions(searcher, fields, query, maxPhraseSearch); // actually find suggestions

        Integer numneeded = params.getInt(
                SuggesterComponentParams.PRODUCT_NAME + "." + SuggesterComponentParams.COUNT,
                SuggesterComponentParams.COUNT_DEFAULT);

        NamedList response = new SimpleOrderedMap();

        int numout = 0;
        // stick results in an response object
        for (SuggestionResult suggestion : suggestions.suggestions) {
            LOGGER.debug(suggestion.suggestion + "\t" + suggestion.probability);
            response.add(suggestions.myval + suggestion.suggestion, suggestion.probability);
            numout++;
            if (numout > numneeded) {
                break;
            }
        }
        LOGGER.debug("\n\n");

        rb.rsp.add(SuggesterComponentParams.PRODUCT_NAME, response);
        totalRequestsTime += System.currentTimeMillis() - lstartTime;
    }

    @Override
    // run on loadup of solr
    public void inform(SolrCore core) {
        LOGGER.trace(("Hit inform"));
        // pull in stop words which will be used later
        loadStopWords(core.getResourceLoader());
        if (storeDirname != null) {
            storeDir = new File(storeDirname);
            if (!storeDir.isAbsolute()) {
                storeDir = new File(core.getDataDir() + File.separator + storeDir);
            }
            if (!storeDir.exists()) {
                LOGGER.warn("Directory " + storeDir.getAbsolutePath()
                        + " doesn't exist for re-load of suggester, creating emtpy "
                        + "directory, make sure to use suggester.build before first use!");
                storeDir.mkdirs();
            } else {
                try {
                    // load premade dictionary object
                    readFile(storeDir);
                } catch (Exception ex) {
                    LOGGER.error("Error loading sbsuggester model");
                }
            }
        }

        // check to see if the new searcher should trigger a build on optimize
        // or commit
        if (buildOnCommit || buildOnOptimize) {
            LOGGER.info("Registering newSearcher listener for Searchbox Suggester: ");
            core.registerNewSearcherListener(this);
        }
    }

    @Override
    public String getDescription() {
        return "Searchbox Suggester";
    }

    @Override
    public String getVersion() {
        return "1.0";
    }

    @Override
    public String getSource() {
        return "http://www.searchbox.com";
    }

    @Override
    public NamedList<Object> getStatistics() {

        NamedList all = new SimpleOrderedMap<Object>();
        all.add("requests", "" + numRequests);
        all.add("errors", "" + numErrors);
        all.add("totalBuildTime(ms)", "" + totalBuildTime);
        all.add("totalRequestTime(ms)", "" + totalRequestsTime);
        if (lastbuildDate == null) {
            lastbuildDate = "N/A";
        }
        all.add("lastBuildDate", lastbuildDate);

        return all;
    }

    public void postCommit() {
        LOGGER.trace("postCommit hit");

    }

    public void postSoftCommit() {
        LOGGER.trace("postSoftCommit hit");

    }

    /**
     * new searcher event used to create suggestion model if config flags
     * areappropriately set
     */
    public void newSearcher(SolrIndexSearcher newSearcher, SolrIndexSearcher currentSearcher) {
        LOGGER.trace("newSearcher hit");
        if (currentSearcher == null) {
            // firstSearcher event
            try {
                LOGGER.info("Loading suggester model.");
                readFile(storeDir);

            } catch (Exception e) {
                LOGGER.error("Exception in reloading suggester model.");
            }
        } else {
            // newSearcher event
            if (buildOnCommit) {
                buildAndWrite(newSearcher, gfields);
            } else if (buildOnOptimize) {
                if (newSearcher.getIndexReader().leaves().size() == 1) {
                    buildAndWrite(newSearcher, gfields);
                } else {
                    LOGGER.info("Index is not optimized therefore skipping " + "building suggester index");
                }
            }
        }
    }

    public void writeFile(File dir) {
        LOGGER.info("Writing suggester model to file");
        try {
            FileOutputStream fos = new FileOutputStream(dir + File.separator + "suggester.ser");
            BufferedOutputStream bos = new BufferedOutputStream(fos);
            ObjectOutputStream oos = new ObjectOutputStream(bos);
            oos.writeObject(suggester);
            oos.flush();
            oos.close();
        } catch (Exception e) {
            LOGGER.error("There was a problem with saving model to disk. Suggester "
                    + "will still work because model is in memory." + e.getMessage());
        }
        LOGGER.info("Done writing suggester model to file");
    }

    private void readFile(File dir) {
        LOGGER.info("Reading object from file");
        try {
            FileInputStream fis = new FileInputStream(dir + File.separator + "suggester.ser");
            BufferedInputStream bis = new BufferedInputStream(fis);
            ObjectInputStream ois = new ObjectInputStream(bis);
            suggester = (SuggesterTreeHolder) ois.readObject();
            ois.close();
        } catch (Exception e) {
            LOGGER.error("There was a problem with load model from disk. Suggester "
                    + "will not work unless build=true option is passed. " + "Stack Message: " + e.getMessage());
        }
        LOGGER.info("Done reading object from file");
    }

    private void buildAndWrite(SolrIndexSearcher searcher, String[] fields) {
        LOGGER.info("Building suggester model");
        SuggeterDataStructureBuilder sdsb = new SuggeterDataStructureBuilder(searcher, fields, ngrams, minDocFreq,
                minTermFreq, maxNumDocs, nonpruneFileName, stopwords);
        suggester = sdsb.getSuggester();
        sdsb = null;
        writeFile(storeDir);
        LOGGER.info("Done building and storing suggester model");
    }

    public void loadStopWords(ResourceLoader rl) {
        BufferedReader in = null;
        try {
            LOGGER.info("Trying to use custom stopwords:\t" + stopWordFile);
            stopwords = getLines(rl, stopWordFile.trim());
            return;
        } catch (Exception ex) {
            LOGGER.info("Error using custom stopwords");
        }

        try {
            LOGGER.info("Using Builtin stopwords (english default)");
            in = new BufferedReader(new InputStreamReader((getClass().getResourceAsStream(stopWordFile))));
            String line;
            while ((line = in.readLine()) != null) {
                stopwords.add(line.trim().toLowerCase());
            }
            in.close();
        } catch (Exception ex) {
            LOGGER.error("Error loading stopwords: " + ex.getMessage());
        }
    }

    protected final List<String> getLines(ResourceLoader loader, String resource) throws IOException {
        return WordlistLoader.getLines(loader.openResource(resource), IOUtils.CHARSET_UTF_8);
    }
}