de.unihildesheim.iw.lucene.document.FeedbackQuery.java Source code

Introduction

Here is the source code for de.unihildesheim.iw.lucene.document.FeedbackQuery.java
Source

/*
 * Copyright (C) 2014 Jens Bertram <code@jens-bertram.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.unihildesheim.iw.lucene.document;

import de.unihildesheim.iw.lucene.index.IndexDataProvider;
import de.unihildesheim.iw.lucene.query.RelaxableQuery;
import de.unihildesheim.iw.lucene.util.BitsUtils;
import de.unihildesheim.iw.lucene.util.DocIdSetUtils;
import de.unihildesheim.iw.lucene.util.StreamUtils;
import de.unihildesheim.iw.util.RandomValue;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RoaringDocIdSet.Builder;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Utility class to get feedback documents based on queries. Feedback documents
 * are provided as list of document ids matching the given query.
 *
 * @author Jens Bertram
 */
public final class FeedbackQuery {

    /**
     * Logger instance for this class.
     */
    private static final org.slf4j.Logger LOG = LoggerFactory.getLogger(FeedbackQuery.class);

    /**
     * Private constructor for utility class.
     */
    private FeedbackQuery() {
        // empty private constructor for utility class
    }

    /**
     * Get the maximum number of documents that can be retrieved. The requested
     * amount is capped at {@link IndexReader#maxDoc()}. A warning is logged, if
     * the request had to be capped, except for {@link Integer#MAX_VALUE} which
     * is treated as "as many as possible".
     *
     * @param reader Reader to access the index
     * @param docCount Number of documents that should be retrieved
     * @return Actual number of documents possible to retrieve. Values not
     * exceeding the index size are returned unchanged.
     */
    static int getMaxDocs(@NotNull final IndexReader reader, final int docCount) {
        final int maxIdxDocs = reader.maxDoc();
        if (docCount == Integer.MAX_VALUE) {
            // "unlimited" request: silently use everything the index offers
            return maxIdxDocs;
        }
        if (docCount > maxIdxDocs) {
            LOG.warn("Requested number of feedback documents ({}) "
                    + "is larger than the amount of documents in the index ({}). "
                    + "Returning only {} feedback documents at maximum.", docCount, maxIdxDocs, maxIdxDocs);
            return maxIdxDocs;
        }
        return docCount;
    }

    /**
     * Get a limited number of feedback documents matching a query.
     *
     * @param searcher Searcher for issuing the query
     * @param query Query to get matching documents
     * @param maxDocCount Maximum number of documents to get. The number of
     * results may be lower, if there were less matching documents. If {@code -1}
     * is provided as value, then the maximum number of feedback documents is
     * unlimited. This means, all matching documents will be returned. If
     * {@code 0} is provided, no query is issued and an empty result is returned.
     * @return Ids of the documents matching the query, in the order the
     * searcher returned them. Empty, if nothing matched.
     * @throws IOException Thrown on low-level I/O errors
     * @throws IllegalArgumentException Thrown, if {@code maxDocCount} is lower
     * than {@code -1}
     */
    static int[] getDocs(@NotNull final IndexSearcher searcher, @NotNull final Query query, final int maxDocCount)
            throws IOException {
        if (maxDocCount < -1) {
            throw new IllegalArgumentException(
                    "Maximum number of documents must be -1 (unlimited) or not negative. Got " + maxDocCount + '.');
        }
        if (maxDocCount == 0) {
            // the searcher rejects a hit limit of zero, so answer directly
            return new int[0];
        }

        final boolean unlimited = maxDocCount == -1;
        final TopDocs results;
        final int fbDocCnt;
        if (unlimited) {
            LOG.debug("Feedback doc count is unlimited. Running pre query to get total hits.");
            final TotalHitCountCollector coll = new TotalHitCountCollector();
            searcher.search(query, coll);
            final int expResults = coll.getTotalHits();
            if (expResults == 0) {
                // the searcher rejects a hit limit of zero, so skip the second query
                LOG.debug("Pre query returned no hits. No feedback documents available.");
                return new int[0];
            }
            LOG.debug("Running post query expecting {} results.", expResults);
            results = searcher.search(query, expResults);
            fbDocCnt = results.totalHits;
            LOG.debug("Post query returned {} feedback documents.", fbDocCnt);
        } else {
            results = searcher.search(query, maxDocCount);
            fbDocCnt = Math.min(results.totalHits, maxDocCount);
        }

        if (unlimited) {
            //noinspection HardcodedFileSeparator
            LOG.debug("Getting {}/unlimited feedback documents.", fbDocCnt);
        } else {
            //noinspection HardcodedFileSeparator
            LOG.debug("Getting {}/{} feedback documents ({} requested).", fbDocCnt, results.totalHits,
                    maxDocCount);
        }

        // extract document ids while keeping order
        @SuppressFBWarnings("SUA_SUSPICIOUS_UNINITIALIZED_ARRAY")
        final int[] docIds = new int[results.scoreDocs.length];
        IntStream.range(0, docIds.length).forEach(i -> docIds[i] = results.scoreDocs[i].doc);

        return docIds;
    }

    /**
     * Tries to get the minimum number of document without {@link
     * RelaxableQuery#relax() relaxing} the query. If the minimum number of
     * documents is not reached without relaxing at most the maximum number of
     * documents is returned while relaxing the query.
     *
     * @param searcher Searcher to issue queries
     * @param query Relaxable query to get matching documents
     * @param minDocs Minimum number of documents to get. Must be greater than
     * zero.
     * @param maxDocCount Maximum number of documents to get. {@code -1} for
     * unlimited or greater than zero. Must not be lower than {@code minDocs}.
     * @return List of documents matching the (relaxed) query. Ranking order is
     * not preserved!
     * @throws IOException Thrown on low-level I/O errors
     * @throws IllegalArgumentException Thrown, if {@code minDocs} or {@code
     * maxDocCount} are out of their valid range
     */
    public static DocIdSet getMinMax(@NotNull final IndexSearcher searcher, @NotNull final RelaxableQuery query,
            final int minDocs, final int maxDocCount) throws IOException {
        if (minDocs <= 0) {
            throw new IllegalArgumentException("Minimum number of documents must be greater than zero.");
        }

        final int maxDocs;
        if (maxDocCount == -1) {
            maxDocs = Integer.MAX_VALUE;
        } else if (maxDocCount <= 0) {
            throw new IllegalArgumentException(
                    "Maximum number of documents must be -1 (unlimited) or greater than zero.");
        } else if (maxDocCount < minDocs) {
            throw new IllegalArgumentException(
                    "Maximum number of documents must be greater than or equal to the minimum value.");
        } else {
            maxDocs = maxDocCount;
        }

        final IndexReader reader = searcher.getIndexReader();
        final int maxRetDocs = getMaxDocs(reader, maxDocs);
        final FixedBitSet bits = new FixedBitSet(reader.maxDoc());
        final int[] initialDocs = getDocs(searcher, query.getQueryObj(), maxRetDocs);
        if (initialDocs.length > 0) {
            bits.or(BitsUtils.arrayToBits(initialDocs));
        }

        // build a log-info string
        final String logInfo = "Got {} matching feedback documents. " + "Relaxing query to "
                + (maxDocCount > 0 ? "get additional" : "reach the minimum of") + " {} feedback documents...";

        int bitsCount = bits.cardinality();
        // relax() is only invoked, if the minimum is not reached yet
        while (bitsCount < minDocs && query.relax()) {
            // number of documents that may still be added without exceeding the
            // maximum
            int docsToGet = maxRetDocs - bitsCount;
            LOG.info(logInfo, bitsCount, docsToGet);

            final int[] docs = getDocs(searcher, query.getQueryObj(), maxRetDocs);
            // docs are ordered from best to worst match: walk them front to back,
            // so the best matching documents are kept if the limit cuts off
            for (int i = 0; i < docs.length && docsToGet > 0; i++) {
                if (!bits.getAndSet(docs[i])) {
                    docsToGet--;
                }
            }
            bitsCount = bits.cardinality();
        }

        LOG.info("Returning {} documents.", bitsCount);
        return new BitDocIdSet(bits);
    }

    /**
     * Try to get a fixed number of feedback documents. Random documents that
     * are added to reach the desired amount are not restricted to any fields.
     *
     * @param searcher Searcher to search the Lucene index
     * @param dataProv DataProvider
     * @param query Relaxable query to get matching documents
     * @param docCount Number of documents to try to reach. Results may be lower,
     * if there are less matching documents in the index
     * @return List of documents matching the (relaxed) query. Ranking order is
     * not preserved!
     * @throws IOException Thrown on low-level I/O errors
     * @see #getFixed(IndexSearcher, IndexDataProvider, RelaxableQuery, int,
     * String...)
     */
    public static DocIdSet getFixed(@NotNull final IndexSearcher searcher,
            @NotNull final IndexDataProvider dataProv, @NotNull final RelaxableQuery query, final int docCount)
            throws IOException {
        // explicit cast: pass "no fields" as a null array, not as a single null
        // element
        return getFixed(searcher, dataProv, query, docCount, (String[]) null);
    }

    /**
     * Try to get a fixed number of feedback documents. If the number of feedback
     * documents is not reached by running an initial query the query get relaxed
     * (simplified) to reach a higher number of matching feedback documents. If
     * the desired number of documents could not be reached by relaxing the
     * query, the result is filled up with randomly chosen documents from the
     * index. The number of returned documents may still be lower than desired,
     * if the index does not provide enough documents.
     *
     * @param searcher Searcher to search the Lucene index
     * @param dataProv DataProvider
     * @param query Relaxable query to get matching documents
     * @param docCount Number of documents to try to reach. Results may be lower,
     * if there are less matching documents in the index. Must be greater than
     * zero.
     * @param fields Fields that need to have content, if additional random
     * documents are required
     * @return List of documents matching the (relaxed) query. Ranking order is
     * not preserved!
     * @throws IOException Thrown on low-level I/O errors
     */
    public static DocIdSet getFixed(@NotNull final IndexSearcher searcher,
            @NotNull final IndexDataProvider dataProv, @NotNull final RelaxableQuery query, final int docCount,
            @Nullable final String... fields) throws IOException {
        final DocIdSet matching = getMinMax(searcher, query, docCount, docCount);
        if (DocIdSetUtils.cardinality(matching) < docCount) {
            return getRandom(dataProv, docCount, matching, fields);
        }
        return matching;
    }

    /**
     * Get random documents from the index. The randomly chosen documents are
     * not restricted to any fields.
     *
     * @param dataProv DataProvider to request documents from
     * @param amount Amount of documents to get at total
     * @param docIds Document-ids already collected
     * @return DocIdSet containing all ids from {@code docIds} and a number of
     * random chosen documents. The size will be {@code amount}, if enough
     * additional documents are present in the index. Otherwise it may be lower.
     * @throws IOException Thrown on low-level i/o-errors
     * @see #getRandom(IndexDataProvider, int, DocIdSet, String...)
     */
    public static DocIdSet getRandom(@NotNull final IndexDataProvider dataProv, final int amount,
            @NotNull final DocIdSet docIds) throws IOException {
        // explicit cast: pass "no fields" as a null array, not as a single null
        // element
        return getRandom(dataProv, amount, docIds, (String[]) null);
    }

    /**
     * Get random documents from the index.
     *
     * @param dataProv DataProvider to request documents from
     * @param amount Amount of documents to get at total. Must not be negative.
     * @param docIds Document-ids already collected. If these are already
     * {@code amount} or more, they are returned without adding random ones.
     * @param fields Fields that need to have content. Documents without any
     * term in these fields are skipped. {@code null} or empty to accept any
     * document.
     * @return DocIdSet containing all ids from {@code docIds} and a number of
     * random chosen documents. The size will be {@code amount}, if enough
     * additional documents are present in the index. Otherwise it may be lower.
     * @throws IOException Thrown on low-level i/o-errors
     * @throws IllegalArgumentException Thrown, if {@code amount} is negative
     */
    public static DocIdSet getRandom(@NotNull final IndexDataProvider dataProv, final int amount,
            @NotNull final DocIdSet docIds, @Nullable final String... fields) throws IOException {
        if (amount < 0) {
            throw new IllegalArgumentException("Amount of documents must not be negative. Got " + amount + '.');
        }

        // collect all provided document-ids, to skip them while choosing random
        // ones (sorted, as required by the binary search below)
        final int[] docIdsArr = StreamUtils.stream(docIds).sorted().toArray();
        final int provided = docIdsArr.length;
        final int missing = Math.max(0, amount - provided);

        // results start with the provided ids, followed by room for random ones
        final int[] results = Arrays.copyOf(docIdsArr, provided + missing);
        // count collected documents
        int currentAmount = provided;

        LOG.info("Getting {}/{} random feedback documents. {} documents provided.", missing, amount, provided);

        if (missing > 0) {
            final List<Integer> haveDocs = dataProv.getDocumentIds()
                    // skip ids already provided
                    .filter(id -> Arrays.binarySearch(docIdsArr, id) < 0).boxed().collect(Collectors.toList());

            if (haveDocs.isEmpty()) {
                LOG.warn("Giving up. No random documents. Got {} documents.", currentAmount);
            } else {
                final boolean checkFields = fields != null && fields.length > 0;
                // keep drawing until the amount is reached or no candidates are
                // left: a rejected candidate must not use up one of the free slots
                while (currentAmount < results.length && !haveDocs.isEmpty()) {
                    final int docId = haveDocs.remove(RandomValue.getInteger(0, haveDocs.size() - 1));
                    if (!checkFields || dataProv.getDocumentTerms(docId, fields).findFirst().isPresent()) {
                        results[currentAmount++] = docId;
                    }
                }
                if (currentAmount < amount) {
                    LOG.warn("Giving up searching for random documents. Got {} documents.", currentAmount);
                }
            }
        }

        // only the filled part of the array is used; the builder requires
        // strictly increasing ids
        final int[] ids = Arrays.stream(results, 0, currentAmount).filter(id -> id >= 0).sorted().distinct()
                .toArray();
        // the builder expects an exclusive upper bound, not the largest id
        final int maxDoc = ids.length == 0 ? 0 : ids[ids.length - 1] + 1;
        final Builder result = new Builder(maxDoc);
        for (final int id : ids) {
            result.add(id);
        }
        return result.build();
    }
}