// NOTE(review): the following lines are scraped-website boilerplate, not Java source.
// They are preserved as a comment so the file remains compilable:
//   uk.co.flax.luwak.Monitor.java Source code
//   Java tutorial — Introduction
//   Here is the source code for uk.co.flax.luwak.Monitor.java
//   Source

package uk.co.flax.luwak;

import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.SpanExtractor;
import org.apache.lucene.search.spans.SpanRewriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import uk.co.flax.luwak.presearcher.PresearcherMatches;

/*
 * Copyright (c) 2015 Lemur Consulting Ltd.
 * <p/>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A Monitor contains a set of {@link MonitorQuery} objects, and runs them against
 * passed-in InputDocuments.
 *
 * Registered queries are parsed by a {@link MonitorQueryParser}, decomposed into
 * subqueries by a {@link QueryDecomposer}, and indexed into an internal query index.
 * At match time the {@link Presearcher} builds a query from the incoming document
 * batch and runs it against the query index to select candidate queries, which are
 * then handed to a {@link CandidateMatcher} for full evaluation.
 */
public class Monitor implements Closeable {

    protected final MonitorQueryParser queryParser;   // parses each MonitorQuery's string form into a lucene Query
    protected final Presearcher presearcher;          // selects candidate queries for a document batch
    protected final QueryDecomposer decomposer;       // splits queries into independently indexable subqueries

    // Wraps the lucene index (and in-memory cache) holding the registered queries
    private final QueryIndex queryIndex;

    // Listeners notified around commits, deletes and cache purges.
    // NOTE(review): plain ArrayList, not synchronized — presumably listeners are only
    // registered during startup; confirm no concurrent add/iterate.
    private final List<QueryIndexUpdateListener> listeners = new ArrayList<>();

    // Queries slower than this many nanoseconds are recorded in the slow log; see setSlowLogLimit()
    protected long slowLogLimit = 2000000;

    private final long commitBatchSize;   // max buffered updates before an intermediate commit during update()
    private final boolean storeQueries;   // whether the serialized MonitorQuery is stored in the query index

    /** Field names used internally in the query index. */
    public static final class FIELDS {
        public static final String id = "_id";      // query id (indexed term + sorted docvalues)
        public static final String del = "_del";    // deletion key: one term per query id
        public static final String hash = "_hash";  // per-subquery hash (binary docvalues)
        public static final String mq = "_mq";      // serialized MonitorQuery (binary docvalues), only if storeQueries
    }

    // Background single-thread executor that periodically rebuilds the query cache
    // to drop entries for deleted queries
    private final ScheduledExecutorService purgeExecutor;

    // System.nanoTime() of the last completed purge; -1 until the first purge runs
    private long lastPurged = -1;

    /**
     * Create a new Monitor instance, using a passed in IndexWriter for its queryindex
     *
     * Note that when the Monitor is closed, both the IndexWriter and its underlying
     * Directory will also be closed.
     *
     * @param queryParser the query parser to use
     * @param presearcher the presearcher to use
     * @param indexWriter an indexWriter for the query index
     * @param configuration the MonitorConfiguration
     * @throws IOException on IO errors
     */
    public Monitor(MonitorQueryParser queryParser, Presearcher presearcher, IndexWriter indexWriter,
            QueryIndexConfiguration configuration) throws IOException {

        this.queryParser = queryParser;
        this.presearcher = presearcher;
        this.decomposer = configuration.getQueryDecomposer();

        this.queryIndex = new QueryIndex(indexWriter);

        this.storeQueries = configuration.storeQueries();
        // Either reload previously stored queries into the cache, or (if queries are
        // not stored) clear the index so it can't contain unloadable entries
        prepareQueryCache(this.storeQueries);

        // Schedule periodic cache purges; errors are routed to listeners rather than
        // killing the scheduled task (a thrown Throwable would cancel future runs)
        long purgeFrequency = configuration.getPurgeFrequency();
        this.purgeExecutor = Executors.newSingleThreadScheduledExecutor();
        this.purgeExecutor.scheduleAtFixedRate(new Runnable() {
            @Override
            public void run() {
                try {
                    purgeCache();
                } catch (Throwable e) {
                    afterPurgeError(e);
                }
            }
        }, purgeFrequency, purgeFrequency, configuration.getPurgeFrequencyUnits());

        this.commitBatchSize = configuration.getQueryUpdateBufferSize();
    }

    /**
     * Create a new Monitor instance, using a RAMDirectory and the default configuration
     * @param queryParser the query parser to use
     * @param presearcher the presearcher to use
     * @throws IOException on IO errors
     */
    public Monitor(MonitorQueryParser queryParser, Presearcher presearcher) throws IOException {
        this(queryParser, presearcher, defaultIndexWriter(new RAMDirectory()), new QueryIndexConfiguration());
    }

    /**
     * Create a new Monitor instance using a RAMDirectory
     * @param queryParser the query parser to use
     * @param presearcher the presearcher to use
     * @param config the monitor configuration
     * @throws IOException on IO errors
     */
    public Monitor(MonitorQueryParser queryParser, Presearcher presearcher, QueryIndexConfiguration config)
            throws IOException {
        this(queryParser, presearcher, defaultIndexWriter(new RAMDirectory()), config);
    }

    /**
     * Create a new Monitor instance, using the default QueryDecomposer and IndexWriter configuration
     * @param queryParser the query parser to use
     * @param presearcher the presearcher to use
     * @param directory the directory where the queryindex is stored
     * @throws IOException on IO errors
     */
    public Monitor(MonitorQueryParser queryParser, Presearcher presearcher, Directory directory)
            throws IOException {
        this(queryParser, presearcher, defaultIndexWriter(directory), new QueryIndexConfiguration());
    }

    /**
     * Create a new Monitor instance
     * @param queryParser the query parser to use
     * @param presearcher the presearcher to use
     * @param directory the directory where the queryindex is to be stored
     * @param config the monitor configuration
     * @throws IOException on IO errors
     */
    public Monitor(MonitorQueryParser queryParser, Presearcher presearcher, Directory directory,
            QueryIndexConfiguration config) throws IOException {
        this(queryParser, presearcher, defaultIndexWriter(directory), config);
    }

    /**
     * Create a new Monitor instance, using the default QueryDecomposer
     * @param queryParser the query parser to use
     * @param presearcher the presearcher to use
     * @param indexWriter a {@link IndexWriter} for the Monitor's query index
     * @throws IOException on IO errors
     */
    public Monitor(MonitorQueryParser queryParser, Presearcher presearcher, IndexWriter indexWriter)
            throws IOException {
        this(queryParser, presearcher, indexWriter, new QueryIndexConfiguration());
    }

    /**
     * Build the default IndexWriter for the query index: a KeywordAnalyzer (presearcher
     * fields are indexed as whole terms) and a TieredMergePolicy with a low
     * segments-per-tier setting to keep the number of segments scanned per match small.
     *
     * @param directory the Directory to write the query index to
     * @return a configured IndexWriter
     * @throws IOException on IO errors
     */
    // package-private for testing
    static IndexWriter defaultIndexWriter(Directory directory) throws IOException {

        IndexWriterConfig iwc = new IndexWriterConfig(new KeywordAnalyzer());
        TieredMergePolicy mergePolicy = new TieredMergePolicy();
        mergePolicy.setSegmentsPerTier(4);
        iwc.setMergePolicy(mergePolicy);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        return new IndexWriter(directory, iwc);

    }

    /**
     * Register a {@link QueryIndexUpdateListener} that will be notified whenever changes
     * are made to the Monitor's queryindex
     *
     * @param listener the listener to register
     */
    public void addQueryIndexUpdateListener(QueryIndexUpdateListener listener) {
        listeners.add(listener);
    }

    /**
     * @return Statistics for the internal query index and cache
     */
    public QueryCacheStats getQueryCacheStats() {
        return new QueryCacheStats(queryIndex.numDocs(), queryIndex.cacheSize(), lastPurged);
    }

    /**
     * Statistics for the query cache and query index
     */
    public static class QueryCacheStats {

        /** Total number of queries in the query index */
        public final int queries;

        /** Total number of queries in the query cache */
        public final int cachedQueries;

        /** Time the query cache was last purged (System.nanoTime(), -1 if never) */
        public final long lastPurged;

        public QueryCacheStats(int queries, int cachedQueries, long lastPurged) {
            this.queries = queries;
            this.cachedQueries = cachedQueries;
            this.lastPurged = lastPurged;
        }
    }

    /**
     * Initialise the query cache at construction time.
     *
     * If queries are stored, re-parses and re-decomposes every stored MonitorQuery
     * into the cache (deduplicating by id and by query hash); if any stored query
     * fails to parse, all failures are collected and reported in a single IOException.
     * If queries are not stored, the index is cleared instead so no unloadable
     * entries can remain.
     *
     * @param storeQueries whether serialized queries are stored in the index
     * @throws IOException on IO errors, or if any stored query could not be parsed
     */
    private void prepareQueryCache(boolean storeQueries) throws IOException {

        if (storeQueries == false) {
            // we're not storing the queries, so ensure that the queryindex is empty
            // before we add any.
            clear();
            return;
        }

        // load any queries that have already been added to the queryindex
        final List<Exception> parseErrors = new LinkedList<>();
        // A query may be decomposed into several index docs sharing one id/hash;
        // these sets make sure each MonitorQuery is deserialized and parsed only once
        final Set<BytesRef> seenHashes = new HashSet<>();
        final Set<String> seenIds = new HashSet<>();

        queryIndex.purgeCache(new QueryIndex.CachePopulator() {
            @Override
            public void populateCacheWithIndex(final Map<BytesRef, QueryCacheEntry> newCache) throws IOException {
                queryIndex.scan(new QueryIndex.QueryCollector() {
                    @Override
                    public void matchQuery(String id, QueryCacheEntry query, QueryIndex.DataValues dataValues)
                            throws IOException {
                        if (seenIds.contains(id)) {
                            return;
                        }
                        seenIds.add(id);

                        BytesRef serializedMQ = dataValues.mq.get(dataValues.doc);
                        MonitorQuery mq = MonitorQuery.deserialize(serializedMQ);

                        BytesRef hash = mq.hash();
                        if (seenHashes.contains(hash)) {
                            return;
                        }
                        seenHashes.add(hash);

                        try {
                            // re-decompose the query; each subquery goes into the
                            // cache under its own derived hash
                            for (QueryCacheEntry ce : decomposeQuery(mq)) {
                                newCache.put(ce.hash, ce);
                            }
                        } catch (Exception e) {
                            // collect rather than abort, so all bad queries are reported together
                            parseErrors.add(e);
                        }
                    }
                });
            }
        });
        if (parseErrors.size() != 0)
            throw new IOException("Error populating cache - some queries couldn't be parsed:" + parseErrors);
    }

    // Commit a batch of updates (or a deletion when updates == null), notifying
    // listeners before and after the commit
    private void commit(List<Indexable> updates) throws IOException {
        beforeCommit(updates);
        queryIndex.commit(updates);
        afterCommit(updates);
    }

    // Notify listeners that a cache purge has completed
    private void afterPurge() {
        for (QueryIndexUpdateListener listener : listeners) {
            listener.onPurge();
        }
    }

    // Notify listeners that a background cache purge failed
    private void afterPurgeError(Throwable t) {
        for (QueryIndexUpdateListener listener : listeners) {
            listener.onPurgeError(t);
        }
    }

    // updates == null signals a delete-only commit (see delete()/deleteById()/clear())
    private void beforeCommit(List<Indexable> updates) {
        if (updates == null) {
            for (QueryIndexUpdateListener listener : listeners) {
                listener.beforeDelete();
            }
        } else {
            for (QueryIndexUpdateListener listener : listeners) {
                listener.beforeUpdate(updates);
            }
        }
    }

    // updates == null signals a delete-only commit (see delete()/deleteById()/clear())
    private void afterCommit(List<Indexable> updates) {
        if (updates == null) {
            for (QueryIndexUpdateListener listener : listeners) {
                listener.afterDelete();
            }
        } else {
            for (QueryIndexUpdateListener listener : listeners) {
                listener.afterUpdate(updates);
            }
        }
    }

    /**
     * Remove unused queries from the query cache.
     *
     * This is normally called from a background thread at a rate set by configurePurgeFrequency().
     *
     * Rebuilds the cache from the entries still present in the query index, so
     * entries whose index documents have been deleted are dropped.
     *
     * @throws IOException on IO errors
     */
    public void purgeCache() throws IOException {
        queryIndex.purgeCache(new QueryIndex.CachePopulator() {
            @Override
            public void populateCacheWithIndex(final Map<BytesRef, QueryCacheEntry> newCache) throws IOException {
                queryIndex.scan(new QueryIndex.QueryCollector() {
                    @Override
                    public void matchQuery(String id, QueryCacheEntry query, QueryIndex.DataValues dataValues)
                            throws IOException {
                        if (query != null)
                            // deep-copy the hash: the scanned BytesRef may be reused by lucene
                            newCache.put(BytesRef.deepCopyOf(query.hash), query);
                    }
                });
            }
        });

        lastPurged = System.nanoTime();
        afterPurge();
    }

    /**
     * Set the slow log limit
     *
     * All queries that take longer than t nanoseconds to run will be recorded in
     * the slow log.  The default is 2,000,000 (2 milliseconds)
     *
     * @param limit the limit in nanoseconds
     *
     * @see Matches#getSlowLog()
     */
    public void setSlowLogLimit(long limit) {
        this.slowLogLimit = limit;
    }

    @Override
    public void close() throws IOException {
        // NOTE(review): shutdown() does not wait for an in-flight purge to finish,
        // so a purge running at close time may race with the index being closed;
        // any resulting error is reported via afterPurgeError()
        purgeExecutor.shutdown();
        queryIndex.closeWhileHandlingException();
    }

    /**
     * Add new queries to the monitor
     *
     * Queries are parsed, decomposed, and buffered; an intermediate commit is made
     * whenever the buffer exceeds the configured commit batch size, and a final
     * commit is always made at the end (even for an empty iterable).
     *
     * @param queries the MonitorQueries to add
     * @return a list of exceptions for queries that could not be added
     * @throws IOException on IO errors
     */
    public List<QueryError> update(Iterable<MonitorQuery> queries) throws IOException {

        List<QueryError> errors = new ArrayList<>();
        List<Indexable> updates = new ArrayList<>();

        for (MonitorQuery query : queries) {
            try {
                for (QueryCacheEntry queryCacheEntry : decomposeQuery(query)) {
                    updates.add(new Indexable(query.getId(), queryCacheEntry,
                            buildIndexableQuery(query.getId(), query, queryCacheEntry)));
                }
            } catch (Exception e) {
                // a bad query is reported, not thrown, so the rest of the batch still indexes
                errors.add(new QueryError(query.getId(), query.getQuery(), e.getMessage()));
            }
            if (updates.size() > commitBatchSize) {
                commit(updates);
                updates.clear();
            }
        }

        commit(updates);
        return errors;
    }

    /**
     * Parse a MonitorQuery and decompose it into cacheable subquery entries.
     *
     * Each subquery gets a hash derived from the parent query's hash plus a
     * positional suffix ("_0", "_1", ...), so subqueries of the same query are
     * distinct cache keys but still traceable to their parent.
     *
     * @param query the MonitorQuery to decompose
     * @return one QueryCacheEntry per subquery
     * @throws Exception if the query cannot be parsed
     */
    private Iterable<QueryCacheEntry> decomposeQuery(MonitorQuery query) throws Exception {

        Query q = queryParser.parse(query.getQuery(), query.getMetadata());

        BytesRef rootHash = query.hash();

        int upto = 0;
        List<QueryCacheEntry> cacheEntries = new LinkedList<>();
        for (Query subquery : decomposer.decompose(q)) {
            BytesRefBuilder subHash = new BytesRefBuilder();
            subHash.append(rootHash);
            subHash.append(new BytesRef("_" + upto++));
            cacheEntries.add(new QueryCacheEntry(subHash.toBytesRef(), subquery, query.getMetadata()));
        }

        return cacheEntries;
    }

    /**
     * Add new queries to the monitor
     * @param queries the MonitorQueries to add
     * @return a list of exceptions for queries that could not be added
     * @throws IOException on IO errors
     */
    public List<QueryError> update(MonitorQuery... queries) throws IOException {
        return update(Arrays.asList(queries));
    }

    /**
     * Delete queries from the monitor
     *
     * Only the query ids are used to select documents for deletion; the query
     * bodies of the passed-in MonitorQueries are ignored.
     *
     * @param queries the queries to remove
     * @throws IOException on IO errors
     */
    public void delete(Iterable<MonitorQuery> queries) throws IOException {
        for (MonitorQuery mq : queries) {
            queryIndex.deleteDocuments(new Term(Monitor.FIELDS.del, mq.getId()));
        }
        commit(null);
    }

    /**
     * Delete queries from the monitor by ID
     * @param queryIds the IDs to delete
     * @throws IOException on IO errors
     */
    public void deleteById(Iterable<String> queryIds) throws IOException {
        for (String queryId : queryIds) {
            queryIndex.deleteDocuments(new Term(FIELDS.del, queryId));
        }
        commit(null);
    }

    /**
     * Delete queries from the monitor by ID
     * @param queryIds the IDs to delete
     * @throws IOException on IO errors
     */
    public void deleteById(String... queryIds) throws IOException {
        deleteById(Arrays.asList(queryIds));
    }

    /**
     * Delete all queries from the monitor
     * @throws IOException on IO errors
     */
    public void clear() throws IOException {
        queryIndex.deleteDocuments(new MatchAllDocsQuery());
        commit(null);
    }

    /**
     * Match a {@link DocumentBatch} against the queryindex, calling a {@link CandidateMatcher} produced by the
     * supplied {@link MatcherFactory} for each possible matching query.
     * @param docs the DocumentBatch to match
     * @param factory a {@link MatcherFactory} to use to create a {@link CandidateMatcher} for the match run
     * @param <T> the type of {@link QueryMatch} to return
     * @return a {@link Matches} object summarizing the match run.
     * @throws IOException on IO errors
     */
    public <T extends QueryMatch> Matches<T> match(DocumentBatch docs, MatcherFactory<T> factory)
            throws IOException {
        CandidateMatcher<T> matcher = factory.createMatcher(docs);
        matcher.setSlowLogLimit(slowLogLimit);
        match(matcher);
        return matcher.getMatches();
    }

    /**
     * Match a single {@link InputDocument} against the queryindex, calling a {@link CandidateMatcher} produced by the
     * supplied {@link MatcherFactory} for each possible matching query.
     * @param doc the InputDocument to match
     * @param factory a {@link MatcherFactory} to use to create a {@link CandidateMatcher} for the match run
     * @param <T> the type of {@link QueryMatch} to return
     * @return a {@link Matches} object summarizing the match run.
     * @throws IOException on IO errors
     */
    public <T extends QueryMatch> Matches<T> match(InputDocument doc, MatcherFactory<T> factory)
            throws IOException {
        return match(DocumentBatch.of(doc), factory);
    }

    // Adapts the presearcher into a QueryIndex.QueryBuilder: builds the candidate-selection
    // query from the batch's index reader, filtered by the terms present in the query index
    private class PresearcherQueryBuilder implements QueryIndex.QueryBuilder {

        final LeafReader batchIndexReader;

        private PresearcherQueryBuilder(LeafReader batchIndexReader) {
            this.batchIndexReader = batchIndexReader;
        }

        @Override
        public Query buildQuery(QueryTermFilter termFilter) throws IOException {
            return presearcher.buildQuery(batchIndexReader, termFilter);
        }
    }

    // Run the presearcher query against the query index, passing each selected query
    // to the matcher; the query-build time and candidate count are recorded on the matcher
    private <T extends QueryMatch> void match(CandidateMatcher<T> matcher) throws IOException {
        StandardQueryCollector<T> collector = new StandardQueryCollector<>(matcher);
        long buildTime = queryIndex.search(new PresearcherQueryBuilder(matcher.getIndexReader()), collector);
        matcher.finish(buildTime, collector.queryCount);
    }

    /**
     * Get the MonitorQuery for a given query id
     * @param queryId the id of the query to get
     * @return the MonitorQuery stored for this id, or null if not found
     * @throws IOException on IO errors
     * @throws IllegalStateException if queries are not stored in the queryindex
     */
    public MonitorQuery getQuery(final String queryId) throws IOException {
        if (storeQueries == false)
            throw new IllegalStateException("Cannot call getQuery() as queries are not stored");
        // single-element array so the anonymous collector can write to an effectively-final holder
        final MonitorQuery[] queryHolder = new MonitorQuery[] { null };
        queryIndex.search(new TermQuery(new Term(FIELDS.id, queryId)), new QueryIndex.QueryCollector() {
            @Override
            public void matchQuery(String id, QueryCacheEntry query, QueryIndex.DataValues dataValues)
                    throws IOException {
                BytesRef serializedMQ = dataValues.mq.get(dataValues.doc);
                queryHolder[0] = MonitorQuery.deserialize(serializedMQ);
            }
        });
        return queryHolder[0];
    }

    /**
     * @return the number of queries (after decomposition) stored in this Monitor
     */
    public int getDisjunctCount() {
        return queryIndex.numDocs();
    }

    /**
     * @return the number of queries stored in this Monitor
     * @throws IOException on IO errors
     */
    public int getQueryCount() throws IOException {
        return getQueryIds().size();
    }

    /**
     * @return the set of query ids of the queries stored in this Monitor
     * @throws IOException on IO errors
     */
    public Set<String> getQueryIds() throws IOException {
        final Set<String> ids = new HashSet<>();
        queryIndex.scan(new QueryIndex.QueryCollector() {
            @Override
            public void matchQuery(String id, QueryCacheEntry query, QueryIndex.DataValues dataValues)
                    throws IOException {
                ids.add(id);
            }
        });
        return ids;
    }

    /**
     * Build a lucene {@link Document} to be stored in the queryindex from a query entry
     * @param id the query id
     * @param mq the MonitorQuery to be indexed
     * @param query the (possibly partial after decomposition) query to be indexed
     * @return a Document that will be indexed in the Monitor's queryindex
     */
    protected Document buildIndexableQuery(String id, MonitorQuery mq, QueryCacheEntry query) {
        // start from the presearcher's term-extraction document, then add bookkeeping fields
        Document doc = presearcher.indexQuery(query.matchQuery, mq.getMetadata());
        doc.add(new StringField(FIELDS.id, id, Field.Store.NO));
        doc.add(new StringField(FIELDS.del, id, Field.Store.NO));   // separate field used as the deletion key
        doc.add(new SortedDocValuesField(FIELDS.id, new BytesRef(id)));
        doc.add(new BinaryDocValuesField(FIELDS.hash, query.hash));
        if (storeQueries)
            doc.add(new BinaryDocValuesField(FIELDS.mq, MonitorQuery.serialize(mq)));
        return doc;
    }

    // For each query selected by the presearcher, pass on to a CandidateMatcher
    private static class StandardQueryCollector<T extends QueryMatch> implements QueryIndex.QueryCollector {

        final CandidateMatcher<T> matcher;
        int queryCount = 0;   // number of candidate queries actually passed to the matcher

        private StandardQueryCollector(CandidateMatcher<T> matcher) {
            this.matcher = matcher;
        }

        @Override
        public void matchQuery(String id, QueryCacheEntry query, QueryIndex.DataValues dataValues)
                throws IOException {
            // query can be null if the cache entry has been purged but the index doc remains
            if (query == null)
                return;
            try {
                queryCount++;
                matcher.matchQuery(id, query.matchQuery, query.metadata);
            } catch (Exception e) {
                // report per-query errors through the matcher rather than aborting the run
                matcher.reportError(new MatchError(id, e));
            }
        }
    }

    /**
     * Match a DocumentBatch against the queries stored in the Monitor, also returning information
     * about which queries were selected by the presearcher, and why.
     * @param docs a DocumentBatch to match against the index
     * @param factory a {@link MatcherFactory} to use to create a {@link CandidateMatcher} for the match run
     * @param <T> the type of QueryMatch produced by the CandidateMatcher
     * @return a {@link PresearcherMatches} object containing debug information
     * @throws IOException on IO errors
     */
    public <T extends QueryMatch> PresearcherMatches<T> debug(final DocumentBatch docs, MatcherFactory<T> factory)
            throws IOException {
        PresearcherQueryCollector<T> collector = new PresearcherQueryCollector<>(factory.createMatcher(docs));
        QueryIndex.QueryBuilder queryBuilder = new PresearcherQueryBuilder(docs.getIndexReader()) {
            @Override
            public Query buildQuery(QueryTermFilter termFilter) throws IOException {
                // rewrite to spans (so matching terms can be collected) and disable bulk
                // scoring (so each hit's scorer is available to the span collector)
                return new ForceNoBulkScoringQuery(SpanRewriter.INSTANCE.rewrite(super.buildQuery(termFilter)));
            }
        };
        queryIndex.search(queryBuilder, collector);
        return collector.getMatches();
    }

    /**
     * Match a single {@link InputDocument} against the queries stored in the Monitor, also returning information
     * about which queries were selected by the presearcher, and why.
     * @param doc an InputDocument to match against the index
     * @param factory a {@link MatcherFactory} to use to create a {@link CandidateMatcher} for the match run
     * @param <T> the type of QueryMatch produced by the CandidateMatcher
     * @return a {@link PresearcherMatches} object containing debug information
     * @throws IOException on IO errors
     */
    public <T extends QueryMatch> PresearcherMatches<T> debug(InputDocument doc, MatcherFactory<T> factory)
            throws IOException {
        return debug(DocumentBatch.of(doc), factory);
    }

    // Extends the standard collector to also record, per query id, which presearcher
    // terms matched the document batch (used by debug())
    private class PresearcherQueryCollector<T extends QueryMatch> extends StandardQueryCollector<T> {

        // query id -> " field:term field:term ..." of the presearcher terms that selected it
        public final Map<String, StringBuilder> matchingTerms = new HashMap<>();

        private PresearcherQueryCollector(CandidateMatcher<T> matcher) {
            super(matcher);
        }

        public PresearcherMatches<T> getMatches() {
            return new PresearcherMatches<>(matchingTerms, matcher.getMatches());
        }

        @Override
        public void matchQuery(final String id, QueryCacheEntry query, QueryIndex.DataValues dataValues)
                throws IOException {

            SpanCollector collector = new SpanCollector() {
                @Override
                public void collectLeaf(PostingsEnum postingsEnum, int position, Term term) throws IOException {
                    if (!matchingTerms.containsKey(id))
                        matchingTerms.put(id, new StringBuilder());
                    matchingTerms.get(id).append(" ").append(term.field()).append(":")
                            .append(term.bytes().utf8ToString());
                }

                @Override
                public void reset() {

                }
            };

            // collect the matching presearcher terms from the current scorer's spans,
            // then delegate to the standard matching path
            SpanExtractor.collect(dataValues.scorer, collector, false);

            super.matchQuery(id, query, dataValues);
        }

    }

}