com.gnizr.core.search.SearchIndexManager.java Source code

Introduction

Here is the source code for com.gnizr.core.search.SearchIndexManager.java
Source

/*
 * gnizr is a trademark of Image Matters LLC in the United States.
 * 
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either expressed or implied. See the License
 * for the specific language governing rights and limitations under the License.
 * 
 * The Initial Contributor of the Original Code is Image Matters LLC.
 * Portions created by the Initial Contributor are Copyright (C) 2007
 * Image Matters LLC. All Rights Reserved.
 */
package com.gnizr.core.search;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.FieldSelectorResult;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.LockObtainFailedException;

/**
 * Creates and manages the search index of bookmarks. Search index is created
 * using Lucene API. The implementation for changing search index (i.e., add,
 * delete or update a bookmark index) is executed in the separate internal
 * thread.
 * 
 * @author Harry Chen
 * @since 2.4.0
 * 
 */
public class SearchIndexManager implements Serializable {

    /**
     * 
     */
    private static final long serialVersionUID = 8598646761200259815L;

    private static final Logger logger = Logger.getLogger(SearchIndexManager.class);

    private SearchIndexProfile profile;

    private LinkedBlockingQueue<Request> documentQueue;

    private Thread workerThread;

    private UpdateIndexWorker worker;

    // used to signal workerThread that this class instance is shutting down.
    private static final Document POISON_DOC = new Document();

    private boolean forceIndexReset;

    private File indexDirectory;

    private static final String INDEX_DIR = "bmarks-idx";

    /**
     * Creates an instance of this class. If a search index database exists, use
     * it as it.
     */
    public SearchIndexManager() {
        this.forceIndexReset = false;
    }

    /**
     * Creates an instance of this class and optionally defines whether or not
     * to force an existing search index database should be cleared.
     * 
     * @param resetIndex
     *            <code>true</code> if an existing search index database
     *            should be cleared upon initialization.
     */
    public SearchIndexManager(boolean resetIndex) {
        this.forceIndexReset = resetIndex;
    }

    /**
     * Initializes this class instance and outputs errors in the log file if the
     * <code>profile</code> is <code>null</code>. Forces the index database
     * to be cleared if the <code>resetIndex</code> is set to
     * <code>true</code> in the constructor. Starts the internal thread for
     * managing the search index database.
     */
    public void init() {
        if (profile == null) {
            logger.error("Can't initialize search index database -- profile undefined.");
            // quit. as soon as
            return;
        }

        if (profile.getSearchIndexDirectory() == null) {
            logger.error("Undefined search index database directory: null");
            throw new NullPointerException("Search index directory is undefined in the configuration file.");
        } else {
            File dir = new File(profile.getSearchIndexDirectory());
            if (dir.exists() && dir.isDirectory() == false) {
                logger.error("Defined path is not a directory. Path: " + dir.toString());
                throw new RuntimeException("Defined search index directory is not a system directory.");
            }
            indexDirectory = new File(profile.getSearchIndexDirectory(), INDEX_DIR);
        }

        if (forceIndexReset == true) {
            logger.info("Overwriting the existing index store, if it exists.");
            createEmptyIndexDirectory();
        }

        documentQueue = new LinkedBlockingQueue<Request>();
        worker = new UpdateIndexWorker();
        workerThread = new Thread(worker);
        workerThread.setDaemon(true);
        workerThread.start();
    }

    private void createEmptyIndexDirectory() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(indexDirectory, DocumentCreator.createDocumentAnalyzer(), true);
        } catch (Exception e) {
            logger.error("Unable to reset the search index. Path " + indexDirectory.toString(), e);
        } finally {
            if (writer != null) {
                try {
                    writer.optimize();
                    writer.close();
                } catch (Exception e) {
                    logger.error(e);
                }
            }
        }
    }

    /**
     * Checks whether the internal index thread is performing any active
     * indexing. This is an estimated result.
     * 
     * @return <code>true</code> if the the internal thread is performing
     *         indexing. Returns <code>false</code> otherwise.
     */
    public boolean isIndexProcessActive() {
        if (getIndexProcessWorkLoad() > 0) {
            return true;
        }
        return false;
    }

    /**
     * Gets an estimated count of the workload of the internal index thread. 
     * This number is the sum of <code>getIndexProcessPending</code> and 
     * <code>getIndexProcessWorking</code>. Use
     * with caution. This number is only an estimate.
     * 
     * @return workload count
     */
    public int getIndexProcessWorkLoad() {
        int numWorking = worker.getWorkQueueSize();
        int numPending = documentQueue.size();
        return numPending + numWorking;
    }

    /**
     * Gets an estimated count of the amount of work that is 
     * pending to be processed. Use with caution. This number 
     * is only an estimate. 
     * @return pending work count.
     */
    public int getIndexProcessPending() {
        return documentQueue.size();
    }

    /**
     * Gets an estimated count of the amount of work 
     * that is currently being processed (i.e., not pending). 
     * Use with caution. This number is only an estimate.
     * @return amount of work being processed.
     */
    public int getIndexProcessWorking() {
        return worker.getWorkQueueSize();
    }

    /**
     * Checks whether the thread that is responsible for adding, deleting and
     * updating bookmark index is still alive.
     * 
     * @return <code>true</code> if the worker thread that performs index
     *         modification is still alive. Returns <code>false</code>,
     *         otherwise.
     */
    public boolean isActive() {
        return workerThread.isAlive();
    }

    /**
     * Cleans up the internal resources used by this class instance.
     * <p>
     * <b>IMPORTANT</b>: it's necessary to call this method if the client wants
     * to this class to be properly unloaded from the JVM.
     * </p>
     */
    public void destroy() {
        if (workerThread != null) {
            try {
                if (workerThread.isAlive() == true) {
                    addIndex(POISON_DOC);
                }
            } catch (Exception e) {
                logger.error("SearchIndexManager destory(): " + e);
            }
        }
    }

    /**
     * Returns the profile used for configuring this class instance.
     * 
     * @return current profile
     */
    public SearchIndexProfile getProfile() {
        return profile;
    }

    /**
     * Sets the profile used for configuring the class instance.
     * 
     * @param profile
     *            profile to use
     */
    public void setProfile(SearchIndexProfile profile) {
        this.profile = profile;
    }

    /**
     * Appends a <code>Document</code> to the queue of documents to be updated
     * in the search index database. The update request is performed in an
     * asynchronous fashion. There is no guarantee that the update will be
     * completed right after this method has been called.
     * 
     * @param doc
     *            a document to be updated.
     * @throws InterruptedException
     * 
     */
    public void updateIndex(Document doc) throws InterruptedException {
        if (doc != null) {
            documentQueue.put(new Request(doc, UPD));
        }
    }

    /**
     * Appends a <code>Document</code> to the queue of documents to be added
     * in the search index database. The add request is performed in an
     * asynchronous fashion. There is no guarantee that the add will be
     * completed right after this method has been called.
     * 
     * @param doc
     *            a document to be added.
     * @throws InterruptedException
     * 
     */
    public void addIndex(Document doc) throws InterruptedException {
        if (doc != null) {
            documentQueue.put(new Request(doc, ADD));
        }
    }

    /**
     * Appends a <code>Document</code> to the queue of documents to be deleted
     * from the search index database. The delete request is performed in an
     * asynchronous fashion. There is no guarantee that the delete will be
     * completed right after this method has been called.
     * 
     * @param doc
     *            a document to be deleted.
     * @throws InterruptedException
     * 
     */
    public void deleteIndex(Document doc) throws InterruptedException {
        if (doc != null) {
            documentQueue.put(new Request(doc, DEL));
        }
    }

    /**
     * Instructs this class to clear all index records from the existing search
     * index database as soon as possible. This request is executed in an
     * asynchronous fashion. There is no guarantee that this request can be
     * completed right after the call.
     * 
     * @throws InterruptedException
     */
    public void resetIndex() throws InterruptedException {
        documentQueue.put(new Request(null, RST));
    }

    private static final int ADD = 1;
    private static final int DEL = 2;
    private static final int UPD = 3;
    private static final int RST = 4;

    private class Request {
        Document doc;
        int type;

        public Request(Document doc, int type) {
            this.doc = doc;
            this.type = type;
        }

        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("req:" + type + ",doc:" + doc);
            return sb.toString();
        }
    }

    /**
     * Finds the representative bookmark document for a ginve URL hash.
     * 
     * @param urlHash
     *            a URL MD5 Hash.
     * 
     * @return a Lucene document of the representative bookmark
     */
    public Document findLeadDocument(String urlHash) {
        IndexReader reader = null;
        TermDocs termDocs = null;
        Document leadDoc = null;
        try {
            boolean exists = IndexReader.indexExists(indexDirectory);
            if (exists == true) {
                reader = IndexReader.open(indexDirectory);
                Term key = new Term(DocumentCreator.FIELD_URL_MD5, urlHash);
                termDocs = reader.termDocs(key);
                boolean found = false;
                while (termDocs.next() && found == false) {
                    int pos = termDocs.doc();
                    // use FieldSelector for more efficient loading of Fields.
                    // load only what's needed to determine a leading document
                    Document d = reader.document(pos, new FieldSelector() {
                        private static final long serialVersionUID = 1426724242925499003L;

                        public FieldSelectorResult accept(String field) {
                            if (field.equals(DocumentCreator.FIELD_INDEX_TYPE)) {
                                return FieldSelectorResult.LOAD_AND_BREAK;
                            } else {
                                return FieldSelectorResult.NO_LOAD;
                            }
                        }
                    });
                    String[] values = d.getValues(DocumentCreator.FIELD_INDEX_TYPE);
                    if (values != null) {
                        List<String> vList = Arrays.asList(values);
                        if (vList.contains(DocumentCreator.INDEX_TYPE_LEAD) == true) {
                            leadDoc = reader.document(pos);
                            found = true;
                        }
                    }
                }
            }
        } catch (Exception e) {
            logger.error("FindLeadDocument failed to find doc: " + urlHash + ", exception=" + e);
        } finally {
            try {
                if (termDocs != null) {
                    termDocs.close();
                }
                if (reader != null) {
                    reader.close();
                }
            } catch (Exception e) {
                logger.error("FindLeadDocument can't close reader or termDocs: " + e);
            }
        }
        return leadDoc;
    }

    /**
     * Finds a non-representative bookmark document for a given URL hash.
     * 
     * @param urlHash
     *            a URL MD5 Hash.
     * 
     * @return a Lucene document of a non-representative bookmark
     */
    public Document findNonLeadDocument(String urlHash) {
        IndexReader reader = null;
        TermDocs termDocs = null;
        Document leadDoc = null;
        try {
            boolean exists = IndexReader.indexExists(indexDirectory);
            if (exists == true) {
                reader = IndexReader.open(indexDirectory);
                Term key = new Term(DocumentCreator.FIELD_URL_MD5, urlHash);
                termDocs = reader.termDocs(key);
                boolean found = false;
                while (termDocs.next() && found == false) {
                    int pos = termDocs.doc();
                    // use FieldSelector for more efficient loading of Fields.
                    // load only what's needed to determine a leading document
                    Document d = reader.document(pos, new FieldSelector() {
                        private static final long serialVersionUID = 1426724242925499003L;

                        public FieldSelectorResult accept(String field) {
                            if (field.equals(DocumentCreator.FIELD_INDEX_TYPE)) {
                                return FieldSelectorResult.LOAD_AND_BREAK;
                            } else {
                                return FieldSelectorResult.NO_LOAD;
                            }
                        }

                    });
                    String[] values = d.getValues(DocumentCreator.FIELD_INDEX_TYPE);
                    if (values != null) {
                        List<String> vList = Arrays.asList(values);
                        if (vList.contains(DocumentCreator.INDEX_TYPE_LEAD) == false) {
                            leadDoc = reader.document(pos);
                            found = true;
                        }
                    } else {
                        leadDoc = reader.document(pos);
                        found = true;
                    }
                }
            }
        } catch (Exception e) {
            logger.error("FindLeadDocument failed to find doc hash: " + urlHash + ", exception=" + e);
        } finally {
            try {
                if (termDocs != null) {
                    termDocs.close();
                }
                if (reader != null) {
                    reader.close();
                }
            } catch (Exception e) {
                logger.error("FindLeadDocument can't close reader or termDocs: " + e);
            }
        }
        return leadDoc;
    }

    private class UpdateIndexWorker implements Runnable {
        private static final int MAX_WORK_SIZE = Integer.MAX_VALUE;
        private Queue<Request> workQueue = new LinkedList<Request>();

        private Queue<Request> batchRequest(BlockingQueue<Request> inputQueue) throws InterruptedException {
            HashSet<String> seenMD5Url = new HashSet<String>();
            Queue<Request> batchWork = new LinkedList<Request>();
            boolean cutOff = false;
            boolean isFromTake = true;
            Request req = inputQueue.take();
            while (req != null && cutOff == false && batchWork.size() < MAX_WORK_SIZE) {
                Document doc = req.doc;
                if (doc != null) {
                    if (POISON_DOC.equals(doc) == true) {
                        cutOff = true;
                    } else {
                        String md5Url = doc.get(DocumentCreator.FIELD_URL_MD5);
                        if (md5Url != null) {
                            if (seenMD5Url.contains(md5Url) == true) {
                                cutOff = true;
                            } else {
                                seenMD5Url.add(md5Url);
                            }
                        }
                    }
                } else {
                    cutOff = true;
                }
                if (isFromTake == true) {
                    batchWork.add(req);
                    isFromTake = false;
                } else {
                    if (cutOff == false) {
                        batchWork.add(inputQueue.poll());
                    }
                }
                req = inputQueue.peek();
            }
            return batchWork;
        }

        private IndexWriter createIndexWriter()
                throws CorruptIndexException, LockObtainFailedException, IOException {
            Analyzer analyzer = DocumentCreator.createDocumentAnalyzer();
            return new IndexWriter(indexDirectory, analyzer);
        }

        public int getWorkQueueSize() {
            return workQueue.size();
        }

        public void run() {
            boolean stopRunning = false;
            while (true && stopRunning == false) {
                IndexWriter writer = null;
                try {
                    workQueue = batchRequest(documentQueue);
                    logger.debug("UpdateIndexWorker: batching # of Request :" + workQueue.size());
                    Request aReq = workQueue.poll();
                    if (aReq != null && aReq.type != RST) {
                        writer = createIndexWriter();
                    }
                    while (aReq != null) {
                        Document doc = aReq.doc;
                        if (doc != null && POISON_DOC.equals(doc)) {
                            logger.debug("Terminate UpdateIndexWorker.");
                            stopRunning = true;
                        } else if (aReq.type == RST) {
                            logger.debug("===================================> Do RESET.");
                            doReset();
                        } else {
                            if (aReq.type == ADD && doc != null) {

                                doAdd(doc, writer);
                            } else if (aReq.type == UPD && doc != null) {
                                doUpdate(doc, writer);
                            } else if (aReq.type == DEL && doc != null) {
                                doDelete(doc, writer);
                            }
                        }
                        aReq = workQueue.poll();
                    }
                } catch (InterruptedException e) {
                    logger.debug("UpdateIndexWorker is interrupted.");
                    stopRunning = true;
                } catch (Exception e) {
                    logger.error(e);
                } finally {
                    if (writer != null) {
                        try {
                            writer.optimize();
                            writer.flush();
                            writer.close();
                        } catch (Exception e) {
                            logger.error(e);
                        }
                    }
                }
            }
        }

        private void doReset() {
            createEmptyIndexDirectory();
        }

        private void doAdd(Document doc, IndexWriter writer) throws CorruptIndexException, IOException {
            if (doc == null) {
                throw new NullPointerException("Can't add document to the index. Doc is NULL");
            }
            String urlHash = doc.get(DocumentCreator.FIELD_URL_MD5);
            if (urlHash == null) {
                throw new NullPointerException(
                        "Can't add document to the index. Doc is missing URL hash. doc:" + doc);
            }
            Document leadDoc = findLeadDocument(urlHash);
            if (leadDoc == null) {
                doc = DocumentCreator.addIndexTypeLead(doc);
                logger.debug("Added Lead Index Type to doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
            }

            writer.addDocument(doc);
            logger.debug("Added doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
        }

        private void doDelete(Document doc, IndexWriter writer) throws CorruptIndexException, IOException {
            if (doc == null) {
                throw new NullPointerException("Can't delete document from the index. Doc is NULL");
            }

            Term t = new Term(DocumentCreator.FIELD_BOOKMARK_ID, doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
            writer.deleteDocuments(t);
            logger.debug("Deleted doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));

            String urlHash = doc.get(DocumentCreator.FIELD_URL_MD5);
            if (urlHash == null) {
                throw new NullPointerException(
                        "Can't delete document from the index. Doc is missing URL hash. doc:" + doc);
            }
            writer.flush();
            Document leadDoc = findLeadDocument(urlHash);
            if (leadDoc == null) {
                Document nonleadDoc = findNonLeadDocument(urlHash);
                if (nonleadDoc != null) {
                    logger.debug("After deleting found a new Lead document. doc = "
                            + nonleadDoc.get(DocumentCreator.FIELD_BOOKMARK_ID));
                    nonleadDoc = DocumentCreator.addIndexTypeLead(nonleadDoc);
                    doUpdate(nonleadDoc, writer);
                }
            }
        }

        private void doUpdate(Document doc, IndexWriter writer) throws CorruptIndexException, IOException {
            if (doc == null) {
                throw new NullPointerException("Can't update document in the index. Doc is NULL");
            }
            String urlHash = doc.get(DocumentCreator.FIELD_URL_MD5);
            if (urlHash == null) {
                throw new NullPointerException(
                        "Can't update document in the index. Doc is missing URL hash. doc:" + doc);
            }

            Term t = new Term(DocumentCreator.FIELD_BOOKMARK_ID, doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
            writer.updateDocument(t, doc);
            logger.debug("Updated doc = " + doc.get(DocumentCreator.FIELD_BOOKMARK_ID));
        }
    }

    /**
     * Returns the directory where the search index database is stored
     * 
     * @return the <code>File</code> that represents the storage location of
     *         the index. Even if the return value is not <code>null</code>,
     *         there is no guarantee that this directory exists in the system.
     * @return the directory of the search index database.
     */
    public File getIndexDirectory() {
        return indexDirectory;
    }

}