org.moxie.proxy.LuceneExecutor.java Source code

Introduction

Here is the source code for org.moxie.proxy.LuceneExecutor.java
Source

/*
 * Copyright 2012 James Moger
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.moxie.proxy;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.lang.reflect.Method;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Queue;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.moxie.IMavenCache;
import org.moxie.MoxieCache;
import org.moxie.Pom;
import org.moxie.PomReader;
import org.moxie.RemoteRepository;
import org.moxie.utils.FileUtils;
import org.moxie.utils.StringUtils;

/**
 * The Lucene executor handles indexing and searching POM files.
 * 
 * @author James Moger
 * 
 */
public class LuceneExecutor implements Runnable {

    // Expected index format version; shouldReindex() compares it with the
    // value stored under CONF_VERSION in the on-disk config file.
    private static final int INDEX_VERSION = 1;

    // Names of the Lucene document fields written for every indexed POM.
    private static final String FIELD_PACKAGING = "type";
    private static final String FIELD_GROUPID = "groupid";
    private static final String FIELD_ARTIFACTID = "artifactid";
    private static final String FIELD_VERSION = "version";
    private static final String FIELD_NAME = "name";
    private static final String FIELD_DESCRIPTION = "description";
    private static final String FIELD_DATE = "date";

    // Name of the folder (below the Moxie root) that holds the indexes.
    private static final String LUCENE_DIR = "lucene";
    // Property key holding the index version in config.properties.
    private static final String CONF_VERSION = "version";

    // Lucene compatibility version handed to analyzers, writer configs and
    // query parsers.
    private static final Version LUCENE_VERSION = Version.LUCENE_35;

    private final Logger logger = Logger.getLogger(LuceneExecutor.class.getSimpleName());

    private final ProxyConfig config;
    // Root folder of all repository indexes; each repository gets a subfolder.
    private final File indexesFolder;

    // Open searchers and writers, keyed by repository id.
    private final Map<String, IndexSearcher> searchers = new ConcurrentHashMap<String, IndexSearcher>();
    private final Map<String, IndexWriter> writers = new ConcurrentHashMap<String, IndexWriter>();

    // POM files waiting to be indexed incrementally by run().
    private final Queue<IndexPom> queue;

    public LuceneExecutor(ProxyConfig config) {
        this.config = config;
        this.indexesFolder = new File(config.getMoxieRoot(), LUCENE_DIR);
        queue = new ConcurrentLinkedQueue<IndexPom>();
    }

    /**
     * Blocking call that synchronously indexes every configured repository:
     * first the local repositories, then the remote ones. A garbage collection
     * is requested once all repositories have been processed.
     */
    public synchronized void reindex() {
        for (String localId : config.getLocalRepositories()) {
            index(localId);
        }
        for (RemoteRepository remote : config.getRemoteRepositories()) {
            index(remote.id);
        }
        // hint to the JVM that indexing garbage can be reclaimed now
        System.gc();
    }

    /**
     * Run is executed by a scheduled executor service at a fixed rate. This
     * guarantees no concurrent repository index updates.  Index updates are
     * queued and processed asynchronously by the executor service.
     * <p>
     * Each queued POM is indexed only after it has been in the queue for at
     * least one minute. If the thread is interrupted while waiting, the
     * interrupt flag is restored, the remaining entries are left in the queue
     * for a later run, and the prefix indexes of the repositories that were
     * already indexed in this run are still updated.
     */
    @Override
    public void run() {
        if (queue.isEmpty()) {
            return;
        }

        Set<String> repositories = new TreeSet<String>();
        long minDiff = 60 * 1000L; // 1 min
        boolean interrupted = false;
        while (!interrupted && !queue.isEmpty()) {
            IndexPom pom = queue.peek();
            // Wait till oldest element has been in queue for minimum time.
            // 
            // This is a practical workaround for expecting parent pom files
            // to have been retrieved.  The alternative is to make the proxy
            // smart enough to identify and retrieve parent poms.  The current
            // design relies on the client instructing the proxy to retrieve
            // parent poms.
            while ((System.currentTimeMillis() - pom.date.getTime()) < minDiff) {
                try {
                    Thread.sleep(200);
                } catch (InterruptedException e) {
                    // Do not swallow the interrupt: restore the flag so the
                    // executor can shut down, and stop draining the queue.
                    Thread.currentThread().interrupt();
                    interrupted = true;
                    break;
                }
            }
            if (interrupted) {
                // the pom stays queued and is picked up by the next run
                break;
            }

            queue.poll();
            logger.info("indexing " + pom.file);
            incrementalIndex(pom.file);

            // cache the repository that we just indexed
            String repository = config.getRepositoryId(pom.file);
            repositories.add(repository);
        }

        // create/update the prefix indexes for the repositories
        for (String repository : repositories) {
            IMavenCache cache = config.getMavenCache(repository);
            cache.updatePrefixesIndex();
        }
    }

    /**
     * Reads the Lucene config file to check the index version. If the stored
     * version differs from {@link #INDEX_VERSION}, or the file is missing or
     * cannot be read or parsed, the repository index should be rebuilt.
     * 
     * @param repository
     *            the repository id (currently not used to locate the config
     *            file, see the note in the body)
     * @return true if the on-disk index format is different than INDEX_VERSION
     *         or could not be determined
     */
    private boolean shouldReindex(String repository) {
        // NOTE(review): the config file is looked up in a fixed "lucene"
        // subfolder of indexesFolder and the repository id is ignored, so all
        // repositories share one version file - confirm this is intended.
        File folder = new File(indexesFolder, LUCENE_DIR);
        File file = new File(folder, "config.properties");
        FileReader reader = null;
        try {
            if (!file.isFile()) {
                // no recorded version: treat the index as outdated
                return true;
            }
            reader = new FileReader(file);
            Properties props = new Properties();
            props.load(reader);
            int indexVersion = Integer.parseInt(props.getProperty(CONF_VERSION, "0"));
            // reindex if versions do not match
            return indexVersion != INDEX_VERSION;
        } catch (Exception e) {
            // unreadable or malformed config: rebuild, but leave a trace
            logger.log(Level.WARNING,
                    MessageFormat.format("Could not read Lucene index version from {0}", file), e);
            return true;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /**
     * Synchronously indexes a repository. Depending on
     * {@link #shouldReindex(String)} this either rebuilds the complete index
     * or updates the existing one, logs the outcome, and finally refreshes the
     * repository's prefix index. Any failure is logged and not propagated.
     * 
     * @param repository
     *            the id of the repository to index
     */
    private void index(String repository) {
        try {
            final boolean rebuild = shouldReindex(repository);
            // (re)build the entire index, or update it with latest artifacts
            final IndexResult outcome = rebuild ? reindex(repository) : updateIndex(repository);

            if (!outcome.success) {
                String failure = rebuild ? "Could not build {0} Lucene index!"
                        : "Could not update {0} Lucene index!";
                logger.severe(MessageFormat.format(failure, repository));
            } else if (outcome.artifactCount > 0) {
                String summary = rebuild ? "Built {0} Lucene index from {1} artifacts in {2} secs"
                        : "Updated {0} Lucene index with {1} artifacts in {2} secs";
                logger.info(MessageFormat.format(summary, repository, outcome.artifactCount, outcome.duration()));
            }

            // create/update the prefix indexes for the repository
            config.getMavenCache(repository).updatePrefixesIndex();
        } catch (Throwable t) {
            logger.log(Level.SEVERE, MessageFormat.format("Lucene indexing failure for {0}", repository), t);
        }
    }

    /**
     * Placeholder for updating a repository index incrementally from the last
     * indexed artifacts. It currently performs no work and hands back a fresh
     * result on which {@code success()} has not been called.
     * 
     * @param repository
     *            the id of the repository
     * @return a new, untouched IndexResult
     */
    private IndexResult updateIndex(String repository) {
        return new IndexResult();
    }

    /**
     * Removes the cached searcher and writer of a repository and closes them.
     * Failures are logged; a failure to close the searcher does not prevent
     * the writer from being closed.
     * 
     * @param repositoryName
     *            the id of the repository
     */
    public synchronized void close(String repositoryName) {
        // searcher first: closing it means closing its underlying reader
        try {
            final IndexSearcher cachedSearcher = searchers.remove(repositoryName);
            if (null != cachedSearcher) {
                IndexReader reader = cachedSearcher.getIndexReader();
                reader.close();
            }
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Failed to close index searcher for " + repositoryName, e);
        }

        // then the writer
        try {
            final IndexWriter cachedWriter = writers.remove(repositoryName);
            if (null != cachedWriter) {
                cachedWriter.close();
            }
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Failed to close index writer for " + repositoryName, e);
        }
    }

    /**
     * Closes every cached Lucene writer and searcher and empties both caches.
     * A failure to close one of them is logged and does not stop the others
     * from being closed.
     */
    public synchronized void close() {
        // writers
        for (Map.Entry<String, IndexWriter> entry : writers.entrySet()) {
            try {
                entry.getValue().close(true);
            } catch (Throwable t) {
                logger.log(Level.SEVERE, "Failed to close Lucene writer for " + entry.getKey(), t);
            }
        }
        writers.clear();

        // searchers (closed through their readers)
        for (Map.Entry<String, IndexSearcher> entry : searchers.entrySet()) {
            try {
                entry.getValue().getIndexReader().close();
            } catch (Throwable t) {
                logger.log(Level.SEVERE, "Failed to close Lucene searcher for " + entry.getKey(), t);
            }
        }
        searchers.clear();
    }

    /**
     * Deletes the Lucene index for the specified repository. Any open
     * writer/searcher of the repository is closed first.
     * 
     * @param repositoryName
     *            the id of the repository
     * @return true if no index content is left on disk afterwards (including
     *         the case where there was no index folder to begin with), false
     *         if index files survived the deletion attempt
     */
    public boolean deleteIndex(String repositoryName) {
        // close any open writer/searcher
        close(repositoryName);

        // delete the index folder
        File luceneIndex = new File(indexesFolder, repositoryName);
        if (!luceneIndex.exists()) {
            return true;
        }
        FileUtils.delete(luceneIndex);

        // Verify the outcome instead of unconditionally reporting success:
        // the index counts as deleted when the folder is gone or empty.
        if (luceneIndex.exists()) {
            String[] leftovers = luceneIndex.list();
            if (leftovers == null || leftovers.length > 0) {
                logger.severe(MessageFormat.format("Failed to delete Lucene index {0}", luceneIndex));
                return false;
            }
        }
        return true;
    }

    /**
     * This completely indexes the repository and will destroy any existing
     * index. Every POM file of the repository cache is read and added as one
     * document. A POM that cannot be read or indexed is logged and skipped; it
     * does not abort the rebuild and is not counted.
     * 
     * @param repository
     *            the id of the repository
     * @return an IndexResult whose artifactCount is the number of POMs that
     *         were added to the index; success is only set when the rebuild
     *         ran through to the final commit
     */
    public IndexResult reindex(String repository) {
        IndexResult result = new IndexResult();
        if (!deleteIndex(repository)) {
            return result;
        }
        try {
            MoxieCache moxieCache = config.getMoxieCache();
            IMavenCache repositoryCache = config.getMavenCache(repository);
            Collection<File> files = repositoryCache.getFiles("." + org.moxie.Constants.POM);
            IndexWriter writer = getIndexWriter(repository);

            for (File pomFile : files) {
                try {
                    Pom pom = PomReader.readPom(moxieCache, pomFile);
                    String date = DateTools.timeToString(pomFile.lastModified(), Resolution.MINUTE);

                    Document doc = new Document();
                    doc.add(new Field(FIELD_PACKAGING, pom.packaging, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field(FIELD_GROUPID, pom.groupId, Store.YES, Index.ANALYZED));
                    doc.add(new Field(FIELD_ARTIFACTID, pom.artifactId, Store.YES, Index.ANALYZED));
                    doc.add(new Field(FIELD_VERSION, pom.version, Store.YES, Index.ANALYZED));
                    if (!StringUtils.isEmpty(pom.name)) {
                        doc.add(new Field(FIELD_NAME, pom.name, Store.YES, Index.ANALYZED));
                    }
                    if (!StringUtils.isEmpty(pom.description)) {
                        doc.add(new Field(FIELD_DESCRIPTION, pom.description, Store.YES, Index.ANALYZED));
                    }
                    doc.add(new Field(FIELD_DATE, date, Store.YES, Index.ANALYZED));

                    // add the pom to the index
                    writer.addDocument(doc);

                    // count only the POMs that actually made it into the index
                    result.artifactCount++;
                } catch (Exception e) {
                    logger.log(Level.SEVERE,
                            MessageFormat.format("Exception while reindexing {0} in {1}", pomFile, repository), e);
                }
            }

            writer.commit();
            // drop the cached searcher so the next search sees the new index
            resetIndexSearcher(repository);
            result.success();
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Exception while reindexing " + repository, e);
        }
        return result;
    }

    /**
     * Queues a POM file for incremental indexing. The file is timestamped on
     * entry and processed later by {@link #run()}.
     * 
     * @param pomFile
     *            the POM file to index
     */
    public void index(File pomFile) {
        IndexPom pending = new IndexPom(pomFile);
        queue.add(pending);
    }

    /**
     * Indexes a single POM file: documents matching the POM's coordinates are
     * deleted, a new document is added and committed, the cached searcher of
     * the repository is dropped and the repository size is reset in the
     * configuration. Any exception is logged and not propagated.
     * 
     * @param pomFile
     *            the POM file to index
     */
    private void incrementalIndex(File pomFile) {
        try {
            final String repositoryId = config.getRepositoryId(pomFile);
            final IMavenCache mavenCache = config.getMavenCache(repositoryId);
            final Pom model = PomReader.readPom(mavenCache, pomFile);

            // remove a previously indexed version of this artifact, if any
            delete(repositoryId, model);

            final IndexWriter indexWriter = getIndexWriter(repositoryId);

            final Document document = new Document();
            document.add(new Field(FIELD_PACKAGING, model.packaging, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
            document.add(new Field(FIELD_GROUPID, model.groupId, Store.YES, Index.ANALYZED));
            document.add(new Field(FIELD_ARTIFACTID, model.artifactId, Store.YES, Index.ANALYZED));
            document.add(new Field(FIELD_VERSION, model.version, Store.YES, Index.ANALYZED));
            if (!StringUtils.isEmpty(model.name)) {
                document.add(new Field(FIELD_NAME, model.name, Store.YES, Index.ANALYZED));
            }
            if (!StringUtils.isEmpty(model.description)) {
                document.add(new Field(FIELD_DESCRIPTION, model.description, Store.YES, Index.ANALYZED));
            }
            // the file's modification time, at minute resolution
            final String modified = DateTools.timeToString(pomFile.lastModified(), Resolution.MINUTE);
            document.add(new Field(FIELD_DATE, modified, Store.YES, Index.ANALYZED));

            // add the pom to the index and make it visible
            indexWriter.addDocument(document);
            indexWriter.commit();
            resetIndexSearcher(repositoryId);

            config.resetRepositorySize(repositoryId);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Exception while indexing " + pomFile, e);
        }
    }

    /**
     * Deletes the documents whose groupId, artifactId and version terms all
     * match the given POM, then commits the writer.
     * 
     * @param repository
     *            the id of the repository
     * @param pom
     *            the POM whose coordinates identify the documents
     * @return true if the writer's document count changed, false otherwise
     * @throws IOException
     */
    private boolean delete(String repository, Pom pom) throws IOException {
        // NOTE(review): the three fields are written with Index.ANALYZED, so
        // these exact-term queries may not match coordinates that the analyzer
        // lower-cases or splits into several tokens - confirm.
        BooleanQuery query = new BooleanQuery();
        query.add(new TermQuery(new Term(FIELD_GROUPID, pom.groupId)), Occur.MUST);
        query.add(new TermQuery(new Term(FIELD_ARTIFACTID, pom.artifactId)), Occur.MUST);
        query.add(new TermQuery(new Term(FIELD_VERSION, pom.version)), Occur.MUST);

        IndexWriter writer = getIndexWriter(repository);
        int before = writer.numDocs();
        writer.deleteDocuments(query);
        writer.commit();
        int removed = before - writer.numDocs();

        if (removed == 0) {
            logger.fine(MessageFormat.format("no records found to delete {0}", query.toString()));
            return false;
        }
        logger.fine(MessageFormat.format("deleted {0} records with {1}", removed, query.toString()));
        return true;
    }

    /**
     * Copies the stored fields of a Lucene document into a SearchResult.
     * 
     * @param doc
     *            the document to read
     * @param hitId
     *            the position of the hit, as supplied by the caller
     * @param totalHits
     *            the total number of hits, as supplied by the caller
     * @return the populated SearchResult
     * @throws ParseException
     *             if the stored date field cannot be parsed
     */
    private SearchResult createSearchResult(Document doc, int hitId, int totalHits) throws ParseException {
        final SearchResult hit = new SearchResult();

        // artifact coordinates and descriptive fields
        hit.groupId = doc.get(FIELD_GROUPID);
        hit.artifactId = doc.get(FIELD_ARTIFACTID);
        hit.version = doc.get(FIELD_VERSION);
        hit.packaging = doc.get(FIELD_PACKAGING);
        hit.name = doc.get(FIELD_NAME);
        hit.description = doc.get(FIELD_DESCRIPTION);
        hit.date = DateTools.stringToDate(doc.get(FIELD_DATE));

        // paging information
        hit.hitId = hitId;
        hit.totalHits = totalHits;
        return hit;
    }

    /**
     * Drops the cached searcher of a repository and closes its reader, so
     * that the next call to getIndexSearcher opens a fresh one.
     * 
     * @param repository
     *            the id of the repository
     * @throws IOException
     */
    private synchronized void resetIndexSearcher(String repository) throws IOException {
        final IndexSearcher stale = searchers.remove(repository);
        if (stale == null) {
            return;
        }
        stale.getIndexReader().close();
    }

    /**
     * Gets the index searcher for the repository. A cached searcher is reused
     * when present; otherwise a new one is opened on a reader obtained from
     * the repository's index writer and then cached.
     * 
     * @param repository
     *            the id of the repository
     * @return the IndexSearcher for the repository
     * @throws IOException
     */
    private IndexSearcher getIndexSearcher(String repository) throws IOException {
        final IndexSearcher cached = searchers.get(repository);
        if (cached != null) {
            return cached;
        }
        final IndexReader reader = IndexReader.open(getIndexWriter(repository), true);
        final IndexSearcher fresh = new IndexSearcher(reader);
        searchers.put(repository, fresh);
        return fresh;
    }

    /**
     * Gets an index writer for the repository. A cached writer is returned
     * when one exists. Otherwise the index folder is created if necessary and
     * a new writer is opened in CREATE_OR_APPEND mode and cached.
     * 
     * @param repository
     *            the id of the repository
     * @return an IndexWriter
     * @throws IOException
     */
    private IndexWriter getIndexWriter(String repository) throws IOException {
        IndexWriter indexWriter = writers.get(repository);
        if (indexWriter != null) {
            // cache hit: do not open another Directory that nobody closes
            return indexWriter;
        }

        File indexFolder = new File(indexesFolder, repository);
        if (!indexFolder.exists()) {
            indexFolder.mkdirs();
        }
        // the directory is only opened when a writer is really created
        Directory directory = FSDirectory.open(indexFolder);
        StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
        // named writerConfig to avoid shadowing the ProxyConfig field
        IndexWriterConfig writerConfig = new IndexWriterConfig(LUCENE_VERSION, analyzer);
        writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        indexWriter = new IndexWriter(directory, writerConfig);
        writers.put(repository, indexWriter);
        return indexWriter;
    }

    /**
     * Searches the specified repositories for the given text or query. This
     * is a convenience overload that converts the list to an array and
     * delegates to the varargs variant.
     * 
     * @param text
     *            if the text is null or empty, null is returned
     * @param page
     *            the page number to retrieve. page is 1-indexed.
     * @param pageSize
     *            the number of elements to return for this page
     * @param repositories
     *            a list of repositories to search. if no repositories are
     *            specified null is returned.
     * @return the results of the delegated search, or null
     * 
     */
    public List<SearchResult> search(String text, int page, int pageSize, List<String> repositories) {
        boolean nothingToSearch = repositories == null || repositories.isEmpty();
        if (nothingToSearch) {
            return null;
        }
        String[] repositoryIds = repositories.toArray(new String[0]);
        return search(text, page, pageSize, repositoryIds);
    }

    /**
     * Searches the specified repositories for the given text or query. The
     * text is parsed against both the groupId and the artifactId field (either
     * may match) and the hits are sorted by the date field in reverse order.
     * Exceptions raised while searching are logged; whatever was collected up
     * to that point is returned.
     * 
     * @param text
     *            if the text is null or empty, null is returned
     * @param page
     *            the page number to retrieve. page is 1-indexed.
     * @param pageSize
     *            the number of elements to return for this page. a value of
     *            zero or less selects all hits.
     * @param repositories
     *            the repositories to search. if no repositories are specified
     *            null is returned.
     * @return a list of SearchResults sorted by date, newest first
     * 
     */
    public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
        if (StringUtils.isEmpty(text) || repositories == null || repositories.length == 0) {
            return null;
        }
        final boolean singleRepository = repositories.length == 1;
        Set<SearchResult> found = new LinkedHashSet<SearchResult>();
        StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
        try {
            // default search checks groupId and artifactId
            BooleanQuery query = new BooleanQuery();
            for (String field : new String[] { FIELD_GROUPID, FIELD_ARTIFACTID }) {
                QueryParser parser = new QueryParser(LUCENE_VERSION, field, analyzer);
                parser.setAllowLeadingWildcard(true);
                query.add(parser.parse(text), Occur.SHOULD);
            }

            IndexSearcher searcher;
            MultiSourceReader multiReader = null;
            if (singleRepository) {
                searcher = getIndexSearcher(repositories[0]);
            } else {
                // combine the readers of all repositories, in argument order
                IndexReader[] subReaders = new IndexReader[repositories.length];
                for (int r = 0; r < repositories.length; r++) {
                    subReaders[r] = getIndexSearcher(repositories[r]).getIndexReader();
                }
                multiReader = new MultiSourceReader(subReaders);
                searcher = new IndexSearcher(multiReader);
            }

            Query rewritten = searcher.rewrite(query);
            Sort newestFirst = new Sort(new SortField(FIELD_DATE, SortField.STRING, true));
            TopFieldDocs topDocs = searcher.search(rewritten, 10000, newestFirst);
            ScoreDoc[] hits = topDocs.scoreDocs;
            int totalHits = topDocs.totalHits;

            // the offset is derived from the page size as passed in
            int offset = Math.max(0, (page - 1) * pageSize);
            if (pageSize <= 0) {
                pageSize = totalHits;
            }
            if (totalHits > offset) {
                int end = Math.min(offset + pageSize, hits.length);
                for (int i = offset; i < end; i++) {
                    int docId = hits[i].doc;
                    SearchResult result = createSearchResult(searcher.doc(docId), i + 1, totalHits);
                    // for a multi-repository search, ask the combined reader
                    // which sub-reader the document came from
                    result.repository = singleRepository ? repositories[0]
                            : repositories[multiReader.getSourceIndex(docId)];
                    found.add(result);
                }
            }
        } catch (Exception e) {
            logger.log(Level.SEVERE, MessageFormat.format("Exception while searching for {0}", text), e);
        }
        return new ArrayList<SearchResult>(found);
    }

    /**
     * Simple class to track the results of an index update.
     */
    private class IndexResult {
        long startTime = System.currentTimeMillis();
        long endTime = startTime;
        boolean success;
        int artifactCount;

        void success() {
            success = true;
            endTime = System.currentTimeMillis();
        }

        float duration() {
            return (endTime - startTime) / 1000f;
        }
    }

    /**
     * Custom subclass of MultiReader to identify the source index for a given
     * doc id. This would not be necessary if there was a public method to
     * obtain this information; instead the non-public {@code readerIndex}
     * method of MultiReader is called through reflection.
     * 
     */
    private class MultiSourceReader extends MultiReader {

        // reflective handle on MultiReader.readerIndex(int); null if the
        // lookup failed
        final Method method;

        MultiSourceReader(IndexReader[] subReaders) {
            super(subReaders);
            method = lookupReaderIndex();
        }

        /** Resolves the readerIndex method, logging and returning null on failure. */
        private Method lookupReaderIndex() {
            try {
                Method readerIndex = MultiReader.class.getDeclaredMethod("readerIndex", int.class);
                readerIndex.setAccessible(true);
                return readerIndex;
            } catch (Exception e) {
                logger.log(Level.SEVERE, "Error getting readerIndex method", e);
                return null;
            }
        }

        /**
         * @param docId
         *            a document id of this reader
         * @return the value reported by readerIndex for the document, or -1
         *         if the reflective call failed
         */
        int getSourceIndex(int docId) {
            try {
                return (Integer) method.invoke(this, docId);
            } catch (Exception e) {
                logger.log(Level.SEVERE, "Error getting source index", e);
                return -1;
            }
        }
    }

    private class IndexPom {
        final File file;
        final Date date;

        IndexPom(File file) {
            this.file = file;
            this.date = new Date();
        }
    }
}