org.sakaiproject.nakamura.search.solr.SparseResultSetFactory.java Source code


Introduction

Here is the source code for org.sakaiproject.nakamura.search.solr.SparseResultSetFactory.java. This factory runs search queries against sparse content storage directly instead of Solr: it parses the query string with Lucene's QueryParser, translates the extracted terms into sparse query properties, executes the query through a ContentManager, and wraps the results with an estimated total count for paging.

Source

/**
 * Licensed to the Sakai Foundation (SF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The SF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package org.sakaiproject.nakamura.search.solr;

import static org.sakaiproject.nakamura.api.search.solr.SolrSearchConstants.PARAMS_ITEMS_PER_PAGE;
import static org.sakaiproject.nakamura.api.search.solr.SolrSearchConstants.PARAMS_PAGE;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.util.Version;
import org.apache.sling.api.SlingHttpServletRequest;
import org.apache.sling.commons.osgi.PropertiesUtil;
import org.apache.solr.schema.TextField;
import org.sakaiproject.nakamura.api.lite.Session;
import org.sakaiproject.nakamura.api.lite.StorageClientException;
import org.sakaiproject.nakamura.api.lite.StorageClientUtils;
import org.sakaiproject.nakamura.api.lite.StorageConstants;
import org.sakaiproject.nakamura.api.lite.accesscontrol.AccessDeniedException;
import org.sakaiproject.nakamura.api.lite.content.Content;
import org.sakaiproject.nakamura.api.lite.content.ContentManager;
import org.sakaiproject.nakamura.api.search.solr.Query;
import org.sakaiproject.nakamura.api.search.solr.ResultSetFactory;
import org.sakaiproject.nakamura.api.search.solr.SolrSearchException;
import org.sakaiproject.nakamura.api.search.solr.SolrSearchResultSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
 * A {@link ResultSetFactory} that processes queries against sparse content
 * storage directly, rather than against Solr. The query string is parsed with
 * Lucene's QueryParser, the extracted terms are translated into sparse query
 * properties, and the returned result set is wrapped with an estimated total
 * count for paging.
 */
@Component(metatype = true)
@Service
@Property(name = "type", value = Query.SPARSE)
public class SparseResultSetFactory implements ResultSetFactory {
    @Property(longValue = 100L)
    private static final String VERY_SLOW_QUERY_TIME = "verySlowQueryTime";
    @Property(longValue = 10L)
    private static final String SLOW_QUERY_TIME = "slowQueryTime";
    @Property(intValue = 100)
    private static final String DEFAULT_MAX_RESULTS = "defaultMaxResults";

    private int defaultMaxResults = 100; // set to 100 to allow testing
    private long slowQueryThreshold;
    private long verySlowQueryThreshold;

    /** only used to mark the logger */
    private final class SlowQueryLogger {
    }

    private static final Logger SLOW_QUERY_LOGGER = LoggerFactory.getLogger(SlowQueryLogger.class);
    private static final Logger LOGGER = LoggerFactory.getLogger(SparseResultSetFactory.class);

    @Activate
    protected void activate(Map<?, ?> props) {
        defaultMaxResults = PropertiesUtil.toInteger(props.get(DEFAULT_MAX_RESULTS), defaultMaxResults);
        slowQueryThreshold = PropertiesUtil.toLong(props.get(SLOW_QUERY_TIME), 10L);
        verySlowQueryThreshold = PropertiesUtil.toLong(props.get(VERY_SLOW_QUERY_TIME), 100L);
    }

    /**
     * Process properties to query sparse content directly.
     *
     * @param request the current request; its resource resolver supplies the storage session
     * @param query the query whose string and options are translated into sparse query properties
     * @param asAnon whether to search as the anonymous user (not used by this implementation)
     * @return the result set for the requested page, wrapped with an estimated total count
     * @throws SolrSearchException if the query string cannot be parsed or the storage layer
     *         reports an error
     */
    public SolrSearchResultSet processQuery(SlingHttpServletRequest request, Query query, boolean asAnon)
            throws SolrSearchException {
        try {
            // use Lucene parsing (with Solr's TextField query analyzer) to get the terms from the query string
            QueryParser parser = new QueryParser(Version.LUCENE_40, "id", new TextField().getQueryAnalyzer());
            org.apache.lucene.search.Query luceneQuery = parser.parse(query.getQueryString());

            Map<String, Object> props = Maps.newHashMap();
            if (luceneQuery instanceof BooleanQuery) {
                BooleanQuery boolLucQuery = (BooleanQuery) luceneQuery;

                int orCount = 0;
                List<BooleanClause> clauses = boolLucQuery.clauses();
                for (BooleanClause clause : clauses) {
                    org.apache.lucene.search.Query clauseQuery = clause.getQuery();
                    Map<String, Object> subOrs = Maps.newHashMap();
                    // we support 1 level of nesting for OR clauses
                    if (clauseQuery instanceof BooleanQuery) {
                        BooleanQuery bq = (BooleanQuery) clauseQuery;
                        List<BooleanClause> subClauses = bq.clauses();
                        for (BooleanClause subclause : subClauses) {
                            org.apache.lucene.search.Query subclauseQuery = subclause.getQuery();
                            extractTerms(subclause, subclauseQuery, props, subOrs);
                        }
                        props.put("orset" + orCount, subOrs);
                        orCount++;
                    } else {
                        extractTerms(clause, clauseQuery, props, subOrs);
                        if (!subOrs.isEmpty()) {
                            props.put("orset" + orCount, subOrs);
                            orCount++;
                        }
                    }
                }
            } else {
                extractTerms(null, luceneQuery, props, null);
            }

            // add the options to the parameters but prepend _ to avoid collision
            for (Entry<String, Object> option : query.getOptions().entrySet()) {
                props.put("_" + option.getKey(), option.getValue());
            }

            String name = query.getName();
            if (name != null) {
                props.put(StorageConstants.CUSTOM_STATEMENT_SET, name);
            }

            Session session = StorageClientUtils
                    .adaptToSession(request.getResourceResolver().adaptTo(javax.jcr.Session.class));
            ContentManager cm = session.getContentManager();
            long tquery = System.currentTimeMillis();
            Iterable<Content> items = cm.find(props);
            tquery = System.currentTimeMillis() - tquery;
            try {
                if (tquery > verySlowQueryThreshold) {
                    SLOW_QUERY_LOGGER.error("Very slow sparse query {} ms {} ", tquery,
                            URLDecoder.decode(query.toString(), "UTF-8"));
                } else if (tquery > slowQueryThreshold) {
                    SLOW_QUERY_LOGGER.warn("Slow sparse query {} ms {} ", tquery,
                            URLDecoder.decode(query.toString(), "UTF-8"));
                }
            } catch (UnsupportedEncodingException e) {
                // a decoding failure here only affects logging; record it at debug and move on
                LOGGER.debug(e.getLocalizedMessage(), e);
            }
            SolrSearchResultSet rs = new SparseSearchResultSet(items, defaultMaxResults);
            return getResultSetWithCount(rs, props, cm);
        } catch (AccessDeniedException e) {
            throw new SolrSearchException(500, e.getMessage());
        } catch (StorageClientException e) {
            throw new SolrSearchException(500, e.getMessage());
        } catch (ParseException e) {
            throw new SolrSearchException(500, e.getMessage());
        }
    }

    /**
     * A standard paged OAE search returns a "total" value in the response. This may not
     * be an exact count of all potential matches, but should be enough to let
     * client-side code display a reasonable paging UX.  With a Solr search, "total" is
     * derived from SolrDocumentList's getNumFound(). But a Sparse query
     * result provides no equivalent functionality, and Sparse currently doesn't support
     * SQL-style "count" queries. As a result, the only way to find out if there are
     * more matches available is to do more searching and iterating.
     *
     * What's supplied here is a very rough negotiation between (A) leaving the client-side
     * completely in the dark and (B) retrieving every single match in the DB on every
     * single query.
     * <ul>
     * <li>If the current page's search returned more than zero results but fewer than the page size,
     * then this is the last page available. Estimate "total" as the current page offset plus
     * the current number of results.
     * <li>If the current page's search returned zero results, then we assume that any earlier pages
     * were full, and we estimate "total" as the current page offset. This may be wildly off, but it
     * should help keep client-side paging from vanishing unexpectedly.
     * <li>If the current page's search returned a full page of results, then a second query is needed
     * to hint at the remaining count. We try to retrieve a maximum-page-size's worth of results
     * as a compromise between speed and accuracy.
     * </ul>
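     *
     * For example (illustrative numbers only): with 25 items per page, a request for
     * page 3 (offset 75) that returns a full page of 25 results triggers a follow-up
     * query starting at offset 100 and retrieving up to defaultMaxResults (100) items.
     * If that follow-up query returns 60 items, the reported total is 100 + 60 = 160.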
     *
     * @param queryResultSet the result set for the requested page
     * @param props the sparse query properties used to produce the result set
     * @param cm the content manager used for the follow-up count query
     * @return the result set, wrapped so that its size reports the estimated total
     * @throws StorageClientException
     * @throws AccessDeniedException
     */
    private SolrSearchResultSet getResultSetWithCount(SolrSearchResultSet queryResultSet, Map<String, Object> props,
            ContentManager cm) throws StorageClientException, AccessDeniedException {
        final SolrSearchResultSet finalResultSet;
        final long queryCount = queryResultSet.getSize();

        if (queryCount < 0) {
            // Negative sizes signal "more than can be counted," and require no further
            // tinkering.
            finalResultSet = queryResultSet;
        } else {
            // Solr search results include both the desired page of results and a fuller count
            // of matches. Sparse queries only return the requested page of results, with no
            // other information.
            final long count;
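            // the paging options were copied into props with a "_" prefix in processQuery()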
            long nitems = Long.valueOf(String.valueOf(props.get("_" + PARAMS_ITEMS_PER_PAGE)));
            long page = Long.valueOf(String.valueOf(props.get("_" + PARAMS_PAGE)));
            long offset = page * nitems;
            if (queryCount == 0) {
                if (page > 0) {
                    // If the current page results are empty, that says nothing about whether
                    // earlier pages would have been full. It's possible that there are no matches
                    // at all, in which case the reported count will be even more misleading than usual.
                    try {
                        LOGGER.info("Empty results from paged sparse query {}",
                                URLDecoder.decode(props.toString(), "UTF-8"));
                    } catch (UnsupportedEncodingException e) {
                        LOGGER.debug(e.getLocalizedMessage(), e);
                    }
                }
                count = offset;
            } else {
                // Currently the only way to count Sparse matches outside the requested
                // page range is to perform a larger paged query. Even so, Sparse query
                // restrictions mean the count is still likely to be very inaccurate
                // (compared to a "count()" query in SQL).
                if (queryCount == nitems) {
                    long nextOffset = offset + nitems;
                    long countStartPage = nextOffset / defaultMaxResults;
                    long countOffset = countStartPage * defaultMaxResults;
                    props.put("_" + PARAMS_PAGE, Long.toString(countStartPage));
                    props.put("_" + PARAMS_ITEMS_PER_PAGE, Integer.toString(defaultMaxResults));
                    long tquery = System.currentTimeMillis();
                    Iterable<Content> countItems = cm.find(props);
                    tquery = System.currentTimeMillis() - tquery;
                    try {
                        if (tquery > verySlowQueryThreshold) {
                            SLOW_QUERY_LOGGER.error("Very slow count retrieval from sparse query {} ms {} ", tquery,
                                    URLDecoder.decode(props.toString(), "UTF-8"));
                        } else if (tquery > slowQueryThreshold) {
                            SLOW_QUERY_LOGGER.warn("Slow count retrieval from sparse query {} ms {} ", tquery,
                                    URLDecoder.decode(props.toString(), "UTF-8"));
                        }
                    } catch (UnsupportedEncodingException e) {
                        LOGGER.debug(e.getLocalizedMessage(), e);
                    }
                    long additionalCount = 0;
                    final Iterator<Content> countIterator = countItems.iterator();
                    while (countIterator.hasNext()) {
                        countIterator.next();
                        additionalCount++;
                    }
                    count = countOffset + additionalCount;
                } else {
                    count = offset + queryCount;
                }
            }
            finalResultSet = new SearchResultSetSizeWrapper(queryResultSet, count);
        }
        return finalResultSet;
    }

    /**
     * Extract the terms from a parsed query and accumulate each one into the map
     * matching its boolean occurrence.
     *
     * @param clause the enclosing boolean clause, or null for a non-boolean query
     * @param clauseQuery the query to extract terms from
     * @param ands accumulates terms that must all match (AND)
     * @param ors accumulates terms where any may match (unused when clause is null)
     */
    private void extractTerms(BooleanClause clause, org.apache.lucene.search.Query clauseQuery,
            Map<String, Object> ands, Map<String, Object> ors) {
        Set<Term> terms = Sets.newHashSet();
        clauseQuery.extractTerms(terms);

        for (Term term : terms) {
            if (clause != null && clause.getOccur() == Occur.SHOULD) {
                accumulateValue(ors, term.field(), term.text());
            } else {
                accumulateValue(ands, term.field(), term.text());
            }
        }
    }

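    /**
     * Add a value to the map under the given key, converting an existing single
     * value into a list once a key accumulates more than one value.
     */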
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private void accumulateValue(Map<String, Object> map, String key, Object val) {
        Object o = map.get(key);
        if (o != null) {
            if (o instanceof Collection) {
                ((Collection) o).add(val);
            } else {
                List<Object> os = Lists.newArrayList(o, val);
                map.put(key, os);
            }
        } else {
            map.put(key, val);
        }
    }
}
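
Example

The sketch below mirrors the term extraction in processQuery as a minimal,
self-contained program. It is a sketch only, assuming Lucene 4.x (lucene-core,
lucene-queryparser, and lucene-analyzers-common) on the classpath; it substitutes
a StandardAnalyzer for Solr's TextField query analyzer, and the class name, field
names, and query string are illustrative rather than taken from the original code.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

public class SparseDecompositionDemo {

    public static void main(String[] args) throws Exception {
        // Parse the query string the same way processQuery() does, but with a
        // plain StandardAnalyzer standing in for Solr's TextField analyzer.
        QueryParser parser = new QueryParser(Version.LUCENE_40, "id",
                new StandardAnalyzer(Version.LUCENE_40));
        Query q = parser.parse("mimeType:document AND (tag:red OR tag:green)");

        Map<String, Object> ands = new HashMap<String, Object>();
        int orCount = 0;
        if (q instanceof BooleanQuery) {
            for (BooleanClause clause : ((BooleanQuery) q).clauses()) {
                Query clauseQuery = clause.getQuery();
                if (clauseQuery instanceof BooleanQuery) {
                    // one level of OR nesting, as in SparseResultSetFactory
                    Map<String, Object> orSet = new HashMap<String, Object>();
                    for (BooleanClause sub : ((BooleanQuery) clauseQuery).clauses()) {
                        extract(sub, sub.getQuery(), ands, orSet);
                    }
                    ands.put("orset" + orCount, orSet);
                    orCount++;
                } else {
                    extract(clause, clauseQuery, ands, new HashMap<String, Object>());
                }
            }
        }
        // e.g. {mimeType=document, orset0={tag=green}}; only one OR value survives
        // here because this demo omits the multi-value accumulation of accumulateValue
        System.out.println(ands);
    }

    // Same shape as SparseResultSetFactory.extractTerms, minus multi-value handling.
    private static void extract(BooleanClause clause, Query q,
            Map<String, Object> ands, Map<String, Object> ors) {
        Set<Term> terms = new HashSet<Term>();
        q.extractTerms(terms);
        for (Term term : terms) {
            if (clause != null && clause.getOccur() == Occur.SHOULD) {
                ors.put(term.field(), term.text());
            } else {
                ands.put(term.field(), term.text());
            }
        }
    }
}

Running it prints the AND terms plus one OR set, which corresponds to the props map
that processQuery hands to ContentManager.find.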