org.jamwiki.search.LuceneSearchEngine.java Source code

Java tutorial

Introduction

Here is the source code for org.jamwiki.search.LuceneSearchEngine.java:

Source

/**
 * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE, version 2.1, dated February 1999.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the latest version of the GNU Lesser General
 * Public License as published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program (LICENSE.txt); if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package org.jamwiki.search;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.SimpleFSLockFactory;
import org.apache.lucene.util.Version;
import org.apache.lucene.store.LockObtainFailedException;
import org.jamwiki.Environment;
import org.jamwiki.SearchEngine;
import org.jamwiki.WikiBase;
import org.jamwiki.model.SearchResultEntry;
import org.jamwiki.model.Topic;
import org.jamwiki.model.TopicType;
import org.jamwiki.model.VirtualWiki;
import org.jamwiki.utils.WikiLogger;

/**
 * An implementation of {@link org.jamwiki.SearchEngine} that uses
 * <a href="http://lucene.apache.org/java/">Lucene</a> to perform searches of
 * Wiki content.
 *
 * <p>One search index is kept per virtual wiki, in its own directory below the
 * <code>search</code> folder of the configured base file directory.  Index
 * writers, readers and searchers are cached per virtual wiki; the cached
 * reader and searcher are discarded whenever changes are committed so that
 * the next search opens a fresh view of the index.
 *
 * <p>The caches are plain {@link HashMap} instances and this class performs no
 * synchronization of its own.
 */
public class LuceneSearchEngine implements SearchEngine {

    /** Where to log to */
    private static final WikiLogger logger = WikiLogger.getLogger(LuceneSearchEngine.class.getName());
    /** Directory for search index files */
    private static final String SEARCH_DIR = "search";
    /** Name of the search index field that holds the processed topic content. */
    private static final String FIELD_TOPIC_CONTENT = "topic_content";
    /** Name of the search index field that holds the un-processed topic name. */
    protected static final String FIELD_TOPIC_NAME = "topic_name";
    /** Name of the search index field that holds the processed topic name. */
    private static final String FIELD_TOPIC_NAME_ANALYZED = "topic_name_analyzed";
    /** Name of the search index field that holds the un-processed topic namespace. */
    private static final String FIELD_TOPIC_NAMESPACE = "topic_namespace";
    /** Lucene compatibility version. */
    protected static final Version USE_LUCENE_VERSION = Version.LUCENE_41;
    /** Maximum number of results to return per search. */
    // FIXME - make this configurable
    protected static final int MAXIMUM_RESULTS_PER_SEARCH = 200;
    /** Maximum number of highlighted fragments combined into a result summary. */
    private static final int SUMMARY_MAX_FRAGMENTS = 3;
    /** Maximum number of content characters used when no highlighted summary is available. */
    private static final int SUMMARY_FALLBACK_LENGTH = 200;
    /** Flag indicating whether or not to commit search index changes immediately. */
    private boolean autoCommit = true;
    /** Flag indicating whether write operations are temporarily disabled. */
    private boolean disabled = false;
    /** Store Searchers (once opened) for re-use for performance reasons. */
    private final Map<String, IndexSearcher> searchers = new HashMap<String, IndexSearcher>();
    /** Store Readers (once opened) for re-use for performance reasons. */
    private final Map<String, IndexReader> indexReaders = new HashMap<String, IndexReader>();
    /** Store Writers (once opened) for re-use for performance reasons. */
    private final Map<String, IndexWriter> indexWriters = new HashMap<String, IndexWriter>();

    /**
     * Add a topic to the search index.  Does nothing while the engine is
     * disabled.  Failures are logged and not propagated to the caller.
     *
     * @param topic The Topic object that is to be added to the index.
     */
    public void addToIndex(Topic topic) {
        if (this.disabled) {
            return;
        }
        try {
            long start = System.currentTimeMillis();
            IndexWriter writer = this.retrieveIndexWriter(topic.getVirtualWiki(), false);
            this.addToIndex(writer, topic);
            this.commit(writer, topic.getVirtualWiki(), this.autoCommit);
            if (logger.isDebugEnabled()) {
                logger.debug("Add to search index for topic " + topic.getVirtualWiki() + " / " + topic.getName()
                        + " in " + ((System.currentTimeMillis() - start) / 1000.000) + " s.");
            }
        } catch (Exception e) {
            logger.error("Exception while adding topic " + topic.getVirtualWiki() + " / " + topic.getName(), e);
        }
    }

    /**
     * Add a topic to the search index using the given writer.  Redirect topics
     * are skipped.  The change is not committed here.
     *
     * @param writer The IndexWriter to use when updating the search index.
     * @param topic The Topic object that is to be added to the index.
     * @return <code>true</code> if a document was handed to the writer,
     *  <code>false</code> if the topic was skipped.
     * @throws IOException Thrown if the writer fails to add the document.
     */
    private boolean addToIndex(IndexWriter writer, Topic topic) throws IOException {
        if (topic.getTopicType() == TopicType.REDIRECT) {
            // do not index redirects
            return false;
        }
        writer.addDocument(createStandardDocument(topic));
        return true;
    }

    /**
     * Force a flush of any pending commits to the search index.  Failures are
     * logged and not propagated to the caller.
     *
     * @param virtualWiki The virtual wiki for which pending updates are being
     *  committed.
     */
    public void commit(String virtualWiki) {
        try {
            this.commit(this.retrieveIndexWriter(virtualWiki, false), virtualWiki, true);
        } catch (IOException e) {
            logger.error("Exception while committing pending changes for virtual wiki " + virtualWiki, e);
        }
    }

    /**
     * Commit pending changes to the writer only if the commitNow value is true.
     * This is primarily a utility method for working with the autoCommit flag.
     * After a commit the cached reader and searcher for the virtual wiki are
     * discarded so that the next search sees the committed changes.
     *
     * @param writer The IndexWriter holding the pending changes.
     * @param virtualWiki The virtual wiki the writer belongs to.
     * @param commitNow Whether to actually commit.
     * @throws IOException Thrown if the commit or the cache reset fails.
     */
    private void commit(IndexWriter writer, String virtualWiki, boolean commitNow) throws IOException {
        if (!commitNow) {
            return;
        }
        writer.commit();
        // the reset must happen after the commit: a reader opened before the
        // commit would not include the changes that were just written
        this.resetIndexSearcher(virtualWiki);
    }

    /**
     * Given the search text, searcher object, and query analyzer generate an
     * appropriate Lucene search query.  The query requires a match in the
     * analyzed topic name or the topic content and, when namespaces are given,
     * additionally requires the topic namespace to be one of them.
     *
     * @param searcher The searcher used to rewrite (expand) the text query.
     * @param analyzer The analyzer used to parse the query text.
     * @param text The search text, in Lucene query syntax.
     * @param namespaces The namespace IDs to restrict the search to, or
     *  <code>null</code> / empty for no restriction.
     * @return The combined query.
     * @throws IOException Thrown if rewriting the query fails.
     * @throws ParseException Thrown if the query text cannot be parsed.
     */
    protected Query createSearchQuery(IndexSearcher searcher, StandardAnalyzer analyzer, String text,
            List<Integer> namespaces) throws IOException, ParseException {
        BooleanQuery fullQuery = new BooleanQuery();
        QueryParser qp;
        // build the namespace portion the query
        if (namespaces != null && !namespaces.isEmpty()) {
            qp = new QueryParser(USE_LUCENE_VERSION, FIELD_TOPIC_NAMESPACE, analyzer);
            StringBuilder namespaceText = new StringBuilder();
            for (Integer namespaceId : namespaces) {
                if (namespaceText.length() != 0) {
                    namespaceText.append(" ").append(QueryParser.Operator.OR).append(" ");
                }
                namespaceText.append(namespaceId);
            }
            fullQuery.add(qp.parse(namespaceText.toString()), Occur.MUST);
        }
        // create a sub-query for topic name & topic text
        BooleanQuery nameAndContentQuery = new BooleanQuery();
        // topic name
        qp = new QueryParser(USE_LUCENE_VERSION, FIELD_TOPIC_NAME_ANALYZED, analyzer);
        nameAndContentQuery.add(qp.parse(text), Occur.SHOULD);
        // topic content
        qp = new QueryParser(USE_LUCENE_VERSION, FIELD_TOPIC_CONTENT, analyzer);
        nameAndContentQuery.add(qp.parse(text), Occur.SHOULD);
        // rewrite the sub-query to expand it - required for wildcards to work with highlighter
        Query subQuery = searcher.rewrite(nameAndContentQuery);
        // add the sub-query to the main query
        fullQuery.add(subQuery, Occur.MUST);
        return fullQuery;
    }

    /**
     * Create a basic Lucene document to add to the index.  This document
     * is suitable to be parsed with the StandardAnalyzer.
     *
     * @param topic The topic to convert; a <code>null</code> topic content is
     *  indexed as the empty string.
     * @return The document holding the topic name (raw and analyzed), the
     *  topic namespace ID and the topic content.
     */
    private Document createStandardDocument(Topic topic) {
        String topicContent = topic.getTopicContent();
        if (topicContent == null) {
            topicContent = "";
        }
        Document doc = new Document();
        // store the (not analyzed) topic name to use when deleting records from the index.
        doc.add(new StringField(FIELD_TOPIC_NAME, topic.getName(), Field.Store.YES));
        // add the (not analyzed) topic namespace to allow retrieval by namespace.
        // this field is used internally in searches.
        doc.add(new StringField(FIELD_TOPIC_NAMESPACE, topic.getNamespace().getId().toString(), Field.Store.NO));
        // analyze the topic name so that (for example) a search for "New York" will match "New York City"
        TextField nameField = new TextField(FIELD_TOPIC_NAME_ANALYZED, topic.getName(), Field.Store.NO);
        // make the topic name worth 3x as much as topic content in searches
        nameField.setBoost(3.0f);
        doc.add(nameField);
        // analyze & store the topic content so that it is searchable and also usable for display in
        // search result summaries
        doc.add(new TextField(FIELD_TOPIC_CONTENT, topicContent, Field.Store.YES));
        return doc;
    }

    /**
     * Remove a topic from the search index.  Does nothing while the engine is
     * disabled.  Failures are logged and not propagated to the caller.
     *
     * @param topic The topic object that is to be removed from the index.
     */
    public void deleteFromIndex(Topic topic) {
        if (this.disabled) {
            return;
        }
        try {
            long start = System.currentTimeMillis();
            // delete the current document
            IndexWriter writer = this.retrieveIndexWriter(topic.getVirtualWiki(), false);
            this.deleteFromIndex(writer, topic);
            this.commit(writer, topic.getVirtualWiki(), this.autoCommit);
            if (logger.isDebugEnabled()) {
                logger.debug("Delete from search index for topic " + topic.getVirtualWiki() + " / "
                        + topic.getName() + " in " + ((System.currentTimeMillis() - start) / 1000.000) + " s.");
            }
        } catch (Exception e) {
            logger.error("Exception while deleting topic " + topic.getVirtualWiki() + " / " + topic.getName(), e);
        }
    }

    /**
     * Remove a topic from the search index using the given writer.  Documents
     * are matched on the un-analyzed topic name field.  The change is not
     * committed here.
     *
     * @param writer The IndexWriter to use when updating the search index.
     * @param topic The topic object that is to be removed from the index.
     * @throws IOException Thrown if the writer fails to delete the document.
     */
    private void deleteFromIndex(IndexWriter writer, Topic topic) throws IOException {
        writer.deleteDocuments(new Term(FIELD_TOPIC_NAME, topic.getName()));
    }

    /**
     * Find all documents that contain a specific search term, ordered by relevance.
     * This method supports all Lucene search query syntax.  At most
     * {@link #MAXIMUM_RESULTS_PER_SEARCH} results are returned.  Failures are
     * logged and result in the entries collected so far being returned.
     *
     * @param virtualWiki The virtual wiki for the topic.
     * @param text The search term being searched for.
     * @param namespaces The namespace IDs to restrict the search to, or
     *  <code>null</code> / empty for no restriction.
     * @return A list of SearchResultEntry objects for all documents that
     *  contain the search term; empty if the search text is blank.
     */
    public List<SearchResultEntry> findResults(String virtualWiki, String text, List<Integer> namespaces) {
        List<SearchResultEntry> results = new ArrayList<SearchResultEntry>();
        if (logger.isTraceEnabled()) {
            logger.trace("search text: " + text);
        }
        if (StringUtils.isBlank(text)) {
            // nothing to search for: avoid handing a null or blank string to the query parser
            return results;
        }
        StandardAnalyzer analyzer = new StandardAnalyzer(USE_LUCENE_VERSION);
        try {
            IndexSearcher searcher = this.retrieveIndexSearcher(virtualWiki);
            Query query = this.createSearchQuery(searcher, analyzer, text, namespaces);
            // actually perform the search
            TopScoreDocCollector collector = TopScoreDocCollector.create(MAXIMUM_RESULTS_PER_SEARCH, true);
            searcher.search(query, collector);
            Highlighter highlighter = new Highlighter(
                    new SimpleHTMLFormatter("<span class=\"highlight\">", "</span>"), new SimpleHTMLEncoder(),
                    new QueryScorer(query, FIELD_TOPIC_CONTENT));
            for (ScoreDoc hit : collector.topDocs().scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                String summary = retrieveResultSummary(doc, highlighter, analyzer);
                results.add(new SearchResultEntry(doc.get(FIELD_TOPIC_NAME), hit.score, summary));
            }
        } catch (Exception e) {
            logger.error("Exception while searching for " + text, e);
        } finally {
            // the analyzer is created per search, so release it once the search is done
            analyzer.close();
        }
        return results;
    }

    /**
     * Get the path, which holds all index files for the given virtual wiki.
     * If the directory does not exist yet it is created and initialized with
     * an empty search index.
     *
     * @param virtualWiki The virtual wiki whose index directory is wanted.
     * @return The index directory for the virtual wiki.
     * @throws IOException Thrown if the directory or the initial index cannot
     *  be created.
     */
    private File getSearchIndexPath(String virtualWiki) throws IOException {
        File parent = new File(Environment.getValue(Environment.PROP_BASE_FILE_DIR), SEARCH_DIR);
        try {
            if (System.getProperty("org.apache.lucene.lockdir") == null) {
                // set the Lucene lock directory.  this defaults to java.io.tmpdir,
                // which may not be writable on some systems.
                System.setProperty("org.apache.lucene.lockdir", parent.getPath());
            }
        } catch (Exception e) {
            // probably a security exception
            logger.warn("Unable to specify Lucene lock directory, default will be used: " + e.getMessage());
        }
        File child = new File(parent.getPath(), "index" + virtualWiki + File.separator);
        if (!child.exists()) {
            // create the search instance
            if (!child.mkdirs() && !child.isDirectory()) {
                throw new IOException("Unable to create search index directory " + child.getAbsolutePath());
            }
            IndexWriter writer = this.openIndexWriter(child, true);
            writer.close();
        }
        return child;
    }

    /**
     * Open an IndexReader on the index stored in the given directory.
     *
     * @param searchIndexPath The directory holding the index files.
     * @return A newly opened reader.
     * @throws IOException Thrown if the index cannot be opened.
     */
    private IndexReader openIndexReader(File searchIndexPath) throws IOException {
        return DirectoryReader.open(FSDirectory.open(searchIndexPath));
    }

    /**
     * Open an IndexWriter, executing error handling as needed.  If the index
     * lock cannot be obtained an attempt is made to forcibly unlock the index,
     * after which opening the writer is tried one more time.
     *
     * @param searchIndexPath The directory holding the index files.
     * @param create <code>true</code> to create a new index, replacing any
     *  existing one; <code>false</code> to create or append.
     * @return A newly opened writer.
     * @throws IOException Thrown if the writer cannot be opened, including
     *  when the second attempt fails.
     */
    private IndexWriter openIndexWriter(File searchIndexPath, boolean create) throws IOException {
        // NFS doesn't work with Lucene default locking as of Lucene 3.3, so use
        // SimpleFSLockFactory instead.
        LockFactory lockFactory = new SimpleFSLockFactory();
        FSDirectory fsDirectory = FSDirectory.open(searchIndexPath, lockFactory);
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(fsDirectory, this.retrieveIndexWriterConfig(create));
        } catch (LockObtainFailedException e) {
            logger.warn("Unable to obtain lock for " + searchIndexPath.getAbsolutePath()
                    + ".  Attempting to forcibly unlock the index.");
            if (IndexWriter.isLocked(fsDirectory)) {
                try {
                    IndexWriter.unlock(fsDirectory);
                    logger.info("Successfully unlocked search directory " + searchIndexPath.getAbsolutePath());
                } catch (IOException ex) {
                    logger.warn("Unable to unlock search directory " + searchIndexPath.getAbsolutePath() + " "
                            + ex.toString());
                }
            }
        }
        if (indexWriter == null) {
            // try again, there could have been a stale lock
            indexWriter = new IndexWriter(fsDirectory, this.retrieveIndexWriterConfig(create));
        }
        return indexWriter;
    }

    /**
     * Refresh the current search index by re-visiting all topic pages.  For
     * each virtual wiki a new index is created and every topic is added to
     * it.  Failures while rebuilding one virtual wiki are logged and the
     * remaining virtual wikis are still processed.
     *
     * @throws Exception Thrown if the list of virtual wikis cannot be
     *  retrieved.
     */
    public void refreshIndex() throws Exception {
        List<VirtualWiki> allWikis = WikiBase.getDataHandler().getVirtualWikiList();
        for (VirtualWiki virtualWiki : allWikis) {
            String wikiName = virtualWiki.getName();
            long start = System.currentTimeMillis();
            int count = 0;
            IndexWriter writer = null;
            try {
                writer = this.retrieveIndexWriter(wikiName, true);
                List<String> topicNames = WikiBase.getDataHandler().getAllTopicNames(wikiName, false);
                // FIXME - parsing all documents will be intolerably slow with even a
                // moderately large Wiki
                for (String topicName : topicNames) {
                    Topic topic = WikiBase.getDataHandler().lookupTopic(wikiName, topicName, false);
                    if (topic == null) {
                        logger.info("Unable to rebuild search index for topic: " + topicName);
                        continue;
                    }
                    // note: no delete is necessary since a new index is being created
                    if (this.addToIndex(writer, topic)) {
                        // only count topics that actually produced a document (redirects are skipped)
                        count++;
                    }
                }
            } catch (Exception ex) {
                logger.error("Failure while refreshing search index", ex);
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }
                } catch (Exception e) {
                    logger.error("Exception during close", e);
                }
                try {
                    // drop any reader opened on the old index so searches use the rebuilt one
                    this.resetIndexSearcher(wikiName);
                } catch (Exception e) {
                    logger.error("Exception while resetting search index reader for " + wikiName, e);
                }
            }
            if (logger.isInfoEnabled()) {
                logger.info("Rebuilt search index for " + wikiName + " (" + count + " documents) in "
                        + ((System.currentTimeMillis() - start) / 1000.000) + " seconds");
            }
        }
    }

    /**
     * Call this method after a search index is updated to reset the searcher.
     * Both the cached searcher and the cached reader it was built on are
     * discarded; a reader keeps showing the index as it was when the reader
     * was opened, so re-using it would hide the update from later searches.
     *
     * @param virtualWiki The virtual wiki whose cached searcher and reader are
     *  to be discarded.
     * @throws IOException Thrown if closing the discarded reader fails.
     */
    private void resetIndexSearcher(String virtualWiki) throws IOException {
        searchers.remove(virtualWiki);
        IndexReader staleReader = indexReaders.remove(virtualWiki);
        if (staleReader != null) {
            // NOTE(review): this assumes no other thread is still searching with
            // the old reader; an in-flight search on it would fail and be logged
            // by findResults.  Confirm this is acceptable or add reference counting.
            staleReader.close();
        }
    }

    /**
     * For performance reasons cache the IndexSearcher for re-use.
     *
     * @param virtualWiki The virtual wiki to search.
     * @return The cached searcher, or a newly created and cached one.
     * @throws IOException Thrown if the underlying reader cannot be opened.
     */
    protected IndexSearcher retrieveIndexSearcher(String virtualWiki) throws IOException {
        IndexSearcher searcher = searchers.get(virtualWiki);
        if (searcher == null) {
            searcher = new IndexSearcher(this.retrieveIndexReader(virtualWiki));
            searchers.put(virtualWiki, searcher);
        }
        return searcher;
    }

    /**
     * For performance reasons create a cache of readers.
     *
     * @param virtualWiki The virtual wiki whose index is to be read.
     * @return The cached reader, or a newly opened and cached one.
     * @throws IOException Thrown if the index cannot be opened.
     */
    private IndexReader retrieveIndexReader(String virtualWiki) throws IOException {
        IndexReader indexReader = indexReaders.get(virtualWiki);
        if (indexReader == null) {
            File searchIndexPath = this.getSearchIndexPath(virtualWiki);
            indexReader = this.openIndexReader(searchIndexPath);
            indexReaders.put(virtualWiki, indexReader);
        }
        return indexReader;
    }

    /**
     * For performance reasons create a cache of writers.  Since writers are not being
     * re-initialized then commit() must be called to explicitly flush data to the index,
     * otherwise it will be flushed on a programmatic basis by Lucene.
     *
     * @param virtualWiki The virtual wiki whose index is to be written.
     * @param create <code>true</code> to obtain a writer that creates a new
     *  index; such a writer is never cached and any cached writer is closed
     *  first.  <code>false</code> to obtain the cached (or a newly cached)
     *  appending writer.
     * @return The writer to use.
     * @throws IOException Thrown if a writer cannot be closed or opened.
     */
    private IndexWriter retrieveIndexWriter(String virtualWiki, boolean create) throws IOException {
        IndexWriter indexWriter = indexWriters.get(virtualWiki);
        if (create && indexWriter != null) {
            // if the writer is going to blow away the existing index and create a new one then it
            // should not be cached.  instead, close any open writer, create a new one, and return.
            indexWriter.close();
            indexWriters.remove(virtualWiki);
            indexWriter = null;
        }
        if (indexWriter == null) {
            File searchIndexPath = this.getSearchIndexPath(virtualWiki);
            indexWriter = this.openIndexWriter(searchIndexPath, create);
            if (!create) {
                indexWriters.put(virtualWiki, indexWriter);
            }
        }
        return indexWriter;
    }

    /**
     * Retrieve an IndexWriter configuration object.
     *
     * @param create <code>true</code> to configure the writer to create a new
     *  index, <code>false</code> to create or append.
     * @return A configuration using a StandardAnalyzer and the requested open
     *  mode.
     */
    private IndexWriterConfig retrieveIndexWriterConfig(boolean create) {
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(USE_LUCENE_VERSION,
                new StandardAnalyzer(USE_LUCENE_VERSION));
        indexWriterConfig.setOpenMode(
                ((create) ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.CREATE_OR_APPEND));
        return indexWriterConfig;
    }

    /**
     * Build the summary text shown for a search result.  The summary consists
     * of the best highlighted fragments of the stored topic content; if the
     * highlighter yields nothing, the HTML-escaped beginning of the content is
     * used instead, followed by "..." when the content was truncated.
     *
     * @param document The matching document.
     * @param highlighter The highlighter configured for the current query.
     * @param analyzer The analyzer used to tokenize the stored content.
     * @return The summary; empty if the document has no stored content.
     * @throws InvalidTokenOffsetsException Thrown by the highlighter.
     * @throws IOException Thrown if tokenizing the content fails.
     */
    protected String retrieveResultSummary(Document document, Highlighter highlighter, StandardAnalyzer analyzer)
            throws InvalidTokenOffsetsException, IOException {
        String content = document.get(FIELD_TOPIC_CONTENT);
        if (content == null) {
            // no stored content to summarize
            return "";
        }
        TokenStream tokenStream = analyzer.tokenStream(FIELD_TOPIC_CONTENT, new StringReader(content));
        String summary = highlighter.getBestFragments(tokenStream, content, SUMMARY_MAX_FRAGMENTS, "...");
        if (StringUtils.isBlank(summary) && !StringUtils.isBlank(content)) {
            int cutoff = Math.min(SUMMARY_FALLBACK_LENGTH, content.length());
            summary = StringEscapeUtils.escapeHtml4(content.substring(0, cutoff));
            if (cutoff < content.length()) {
                // only mark the summary as abbreviated when content was actually cut off
                summary += "...";
            }
        }
        return summary;
    }

    /**
     * Set whether index changes are committed immediately after each add,
     * delete or update.
     *
     * @param autoCommit <code>true</code> to commit after every change,
     *  <code>false</code> to leave committing to {@link #commit(String)}.
     */
    public void setAutoCommit(boolean autoCommit) {
        this.autoCommit = autoCommit;
    }

    /**
     * Set whether add, delete and update operations are skipped.
     *
     * @param disabled <code>true</code> to turn add, delete and update calls
     *  into no-ops.
     */
    public void setDisabled(boolean disabled) {
        this.disabled = disabled;
    }

    /**
     * Close all cached readers and writers and empty the caches.  Every
     * reader and writer is closed even if closing an earlier one fails.
     *
     * @throws IOException The first failure encountered while closing, thrown
     *  after all readers and writers have been processed.
     */
    public void shutdown() throws IOException {
        IOException firstFailure = null;
        this.searchers.clear();
        for (IndexReader indexReader : this.indexReaders.values()) {
            try {
                indexReader.close();
            } catch (IOException e) {
                logger.error("Exception while closing search index reader", e);
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        this.indexReaders.clear();
        for (IndexWriter indexWriter : this.indexWriters.values()) {
            try {
                indexWriter.close();
            } catch (IOException e) {
                logger.error("Exception while closing search index writer", e);
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        this.indexWriters.clear();
        if (firstFailure != null) {
            throw firstFailure;
        }
    }

    /**
     * Update a topic in the search index by deleting its existing document
     * and adding a new one.  Does nothing while the engine is disabled.
     * Failures are logged and not propagated to the caller.
     *
     * @param topic The topic object that is to be updated in the index.
     */
    public void updateInIndex(Topic topic) {
        if (this.disabled) {
            return;
        }
        try {
            long start = System.currentTimeMillis();
            IndexWriter writer = this.retrieveIndexWriter(topic.getVirtualWiki(), false);
            this.deleteFromIndex(writer, topic);
            this.addToIndex(writer, topic);
            this.commit(writer, topic.getVirtualWiki(), this.autoCommit);
            if (logger.isDebugEnabled()) {
                logger.debug("Update search index for topic " + topic.getVirtualWiki() + " / " + topic.getName()
                        + " in " + ((System.currentTimeMillis() - start) / 1000.000) + " s.");
            }
        } catch (Exception e) {
            logger.error("Exception while updating topic " + topic.getVirtualWiki() + " / " + topic.getName(), e);
        }
    }
}