org.apache.lucene.search.UsageTrackingQueryCachingPolicy.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.lucene.search.UsageTrackingQueryCachingPolicy.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;

import org.apache.lucene.util.FrequencyTrackingRingBuffer;

/**
 * A {@link QueryCachingPolicy} that tracks usage statistics of recently-used
 * filters in order to decide on which filters are worth caching.
 *
 * <p>Only the hash codes of the queries are recorded in the history, never the
 * queries themselves, so distinct queries sharing a hash code are counted
 * together. All accesses to the history are guarded by synchronizing on this
 * instance.
 *
 * @lucene.experimental
 */
public class UsageTrackingQueryCachingPolicy implements QueryCachingPolicy {

    // the hash code that we use as a sentinel in the ring buffer.
    private static final int SENTINEL = Integer.MIN_VALUE;

    /**
     * Returns whether the given query, or one of its super classes below
     * {@link Query}, has a simple class name of the form {@code Point*Query}.
     */
    private static boolean isPointQuery(Query query) {
        // we need to check for super classes because we occasionally use anonymous
        // sub classes of eg. PointRangeQuery
        for (Class<?> clazz = query.getClass(); clazz != Query.class; clazz = clazz.getSuperclass()) {
            final String simpleName = clazz.getSimpleName();
            if (simpleName.startsWith("Point") && simpleName.endsWith("Query")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns whether the given query is considered costly to build: multi-term
     * queries (and their constant-score wrappers), {@link TermInSetQuery} and
     * point-based queries.
     */
    static boolean isCostly(Query query) {
        // This does not measure the cost of iterating over the filter (for this we
        // already have the DocIdSetIterator#cost API) but the cost to build the
        // DocIdSet in the first place
        return query instanceof MultiTermQuery || query instanceof MultiTermQueryConstantScoreWrapper
                || query instanceof TermInSetQuery || isPointQuery(query);
    }

    /**
     * Returns whether the given query should be ignored by this policy
     * altogether: such queries are neither recorded in the history by
     * {@link #onUse} nor ever accepted by {@link #shouldCache}.
     */
    private static boolean shouldNeverCache(Query query) {
        if (query instanceof TermQuery) {
            // We do not bother caching term queries since they are already plenty fast.
            return true;
        }

        if (query instanceof MatchAllDocsQuery) {
            // MatchAllDocsQuery has an iterator that is faster than what a bit set could do.
            return true;
        }

        // For the below queries, it's cheap to notice they cannot match any docs so
        // we do not bother caching them.
        if (query instanceof MatchNoDocsQuery) {
            return true;
        }

        if (query instanceof BooleanQuery) {
            BooleanQuery bq = (BooleanQuery) query;
            if (bq.clauses().isEmpty()) {
                return true;
            }
        }

        if (query instanceof DisjunctionMaxQuery) {
            DisjunctionMaxQuery dmq = (DisjunctionMaxQuery) query;
            if (dmq.getDisjuncts().isEmpty()) {
                return true;
            }
        }

        return false;
    }

    /**
     * Computes the integer under which the given query is tracked in the
     * history. This is the query's hash code, except that a hash code equal to
     * {@link #SENTINEL} is mapped to the neighbouring value.
     */
    private static int historyKey(Query query) {
        final int hashCode = query.hashCode();
        // FrequencyTrackingRingBuffer is documented to initially contain
        // historySize copies of the sentinel. A query hashing to exactly that
        // value would therefore appear to have been used historySize times
        // before it was ever seen, and would get cached on first sight. Folding
        // it onto SENTINEL + 1 (which cannot overflow) leaves only the ordinary,
        // accepted risk of two queries sharing a hash code.
        return hashCode == SENTINEL ? SENTINEL + 1 : hashCode;
    }

    private final FrequencyTrackingRingBuffer recentlyUsedFilters;

    /**
     * Expert: Create a new instance with a configurable history size. Beware of
     * passing too large values for the size of the history, either
     * {@link #minFrequencyToCache} returns low values and this means some filters
     * that are rarely used will be cached, which would hurt performance. Or
     * {@link #minFrequencyToCache} returns high values that are function of the
     * size of the history but then filters will be slow to make it to the cache.
     *
     * @param historySize               the number of recently used filters to track
     */
    public UsageTrackingQueryCachingPolicy(int historySize) {
        this.recentlyUsedFilters = new FrequencyTrackingRingBuffer(historySize, SENTINEL);
    }

    /** Create a new instance with a history size of 256. This should be a good
     *  default for most cases. */
    public UsageTrackingQueryCachingPolicy() {
        this(256);
    }

    /**
     * For a given filter, return how many times it should appear in the history
     * before being cached. The default implementation returns 2 for filters that
     * need to evaluate against the entire index to build a {@link DocIdSetIterator},
     * like {@link MultiTermQuery}, point-based queries or {@link TermInSetQuery},
     * 4 for compound queries ({@link BooleanQuery} and
     * {@link DisjunctionMaxQuery}) and 5 for other filters.
     *
     * @param query the filter whose caching threshold is requested
     * @return the minimum number of occurrences in the history required for
     *         {@link #shouldCache} to accept the filter
     */
    protected int minFrequencyToCache(Query query) {
        if (isCostly(query)) {
            return 2;
        }
        if (query instanceof BooleanQuery || query instanceof DisjunctionMaxQuery) {
            // Say you keep reusing a boolean query that looks like "A OR B" and
            // never use the A and B queries out of that context. 5 times after it
            // has been used, we would cache both A, B and A OR B, which is
            // wasteful. So instead we cache compound queries a bit earlier so that
            // we would only cache "A OR B" in that case.
            return 4;
        }
        // default: cache after the filter has been seen 5 times
        return 5;
    }

    /**
     * Records one usage of the given query in the history, unless it is a query
     * that this policy never caches.
     *
     * @param query the query that was used; must not be a {@link BoostQuery} or
     *              a {@link ConstantScoreQuery} (checked by assertion only)
     */
    @Override
    public void onUse(Query query) {
        assert query instanceof BoostQuery == false;
        assert query instanceof ConstantScoreQuery == false;

        if (shouldNeverCache(query)) {
            return;
        }

        // compute the key (and thus the hash code) outside of the sync block
        // in case it's somewhat expensive:
        final int key = historyKey(query);

        // we only track hash codes to avoid holding references to possible
        // large queries; this may cause rare false positives, but at worse
        // this just means we cache a query that was not in fact used enough:
        synchronized (this) {
            recentlyUsedFilters.add(key);
        }
    }

    /**
     * Returns how many times the given query's key currently appears in the
     * history.
     *
     * @param query the query to look up; must not be a {@link BoostQuery} or a
     *              {@link ConstantScoreQuery} (checked by assertion only)
     */
    int frequency(Query query) {
        assert query instanceof BoostQuery == false;
        assert query instanceof ConstantScoreQuery == false;

        // compute the key (and thus the hash code) outside of the sync block
        // in case it's somewhat expensive:
        final int key = historyKey(query);

        synchronized (this) {
            return recentlyUsedFilters.frequency(key);
        }
    }

    /**
     * Returns whether the given query has been seen often enough in the recent
     * history to be worth caching.
     *
     * @param query the query that is a candidate for caching
     * @return {@code false} for queries that are never cached, otherwise whether
     *         the query's frequency in the history has reached
     *         {@link #minFrequencyToCache}
     */
    @Override
    public boolean shouldCache(Query query) throws IOException {
        if (shouldNeverCache(query)) {
            return false;
        }
        final int frequency = frequency(query);
        final int minFrequency = minFrequencyToCache(query);
        return frequency >= minFrequency;
    }

}