nl.inl.blacklab.perdocument.DocResults.java Source code

Introduction

Here is the source code for nl.inl.blacklab.perdocument.DocResults.java
Source

/*******************************************************************************
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package nl.inl.blacklab.perdocument;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import nl.inl.blacklab.search.Hit;
import nl.inl.blacklab.search.Hits;
import nl.inl.blacklab.search.Searcher;
import nl.inl.blacklab.search.grouping.HitPropValueInt;
import nl.inl.util.ReverseComparator;
import nl.inl.util.ThreadPriority.Level;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.spans.SpanQuery;

/**
 * A list of DocResult objects (document-level query results). The list may be sorted by calling
 * DocResults.sort().
 */
public class DocResults implements Iterable<DocResult> {
    /**
     * The document results gathered so far. When this object is backed by a
     * Hits object (sourceHits), this may be only part of the results until
     * the remaining hits have been read.
     */
    protected List<DocResult> results = new ArrayList<DocResult>();

    /**
     * Our searcher object (as returned by getSearcher())
     */
    Searcher searcher;

    /**
     * Our source hits object, or null if these results are not backed by hits
     * (only documents).
     */
    private Hits sourceHits;

    /**
     * Iterator in our source hits object. Only assigned by the constructor
     * that takes a Hits object; also used as the lock while reading hits.
     */
    private Iterator<Hit> sourceHitsIterator;

    /**
     * A partial list of hits in a doc, because we stopped iterating through the Hits.
     * (or null if we don't have partial doc hits)
     * Pick this up when we continue iterating through it.
     */
    private List<Hit> partialDocHits = null;

    /**
     * Id of the partial doc we've done (because we stopped iterating through the Hits),
     * or -1 for no partial doc.
     */
    private int partialDocId = -1;

    /**
     * Get the searcher object these results belong to.
     *
     * @return the searcher
     */
    public Searcher getSearcher() {
        return this.searcher;
    }

    /**
     * Append a document result to this list.
     *
     * Any results still pending in the source hits are read first, so the new
     * result ends up after them. If the thread is interrupted while doing so,
     * the result is NOT added and the thread's interrupt status is restored
     * so the caller can detect the interruption.
     *
     * @param r the document result to add
     * @deprecated use constructor that takes a list of results instead
     */
    @Deprecated
    public void add(DocResult r) {
        try {
            ensureAllResultsRead();
        } catch (InterruptedException e) {
            // Don't complete the operation. Restore the interrupt status:
            // catching InterruptedException may have cleared it, and without
            // it the caller has no way to detect the interruption.
            Thread.currentThread().interrupt();
            return;
        }
        results.add(r);
    }

    /**
     * Check whether there is nothing left to read from the source hits.
     *
     * @return true if there are no source hits at all, or if the iterator
     *         over the source hits has no more hits to offer
     */
    boolean sourceHitsFullyRead() {
        if (sourceHits != null) {
            synchronized (sourceHitsIterator) {
                boolean moreHitsAvailable = sourceHitsIterator.hasNext();
                return !moreHitsAvailable;
            }
        }
        // Not backed by hits: nothing to read
        return true;
    }

    /**
     * Construct per-document results objects from a Hits object
     * @param searcher search object
     * @param hits the hits to view per-document
     * @deprecated use Hits.perDocResults()
     */
    @Deprecated
    public DocResults(Searcher searcher, Hits hits) {
        this.searcher = searcher;
        try {
            sourceHits = hits;
            sourceHitsIterator = hits.iterator();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Construct per-document results for a span query.
     *
     * Creates a Hits object from the arguments and delegates to the
     * constructor that takes a Hits object.
     *
     * @param searcher search object
     * @param field passed on to the Hits constructor (presumably the name of
     *            the field the query searches - TODO confirm)
     * @param query the query whose hits should be viewed per-document
     * @deprecated use Hits.perDocResults()
     */
    @Deprecated
    public DocResults(Searcher searcher, String field, SpanQuery query) {
        this(searcher, new Hits(searcher, field, query));
    }

    /**
     * Present an existing list of DocResult objects as a DocResults object.
     *
     * NOTE: the list is referenced, not copied, so later changes to it show
     * up in this object as well.
     *
     * Used by DocGroups constructor.
     *
     * @param searcher the searcher that generated the results
     * @param results the list of results
     */
    DocResults(Searcher searcher, List<DocResult> results) {
        this.results = results;
        this.searcher = searcher;
    }

    /**
     * Construct DocResults from a Scorer (Lucene document results).
     *
     * Every document the scorer produces is loaded from the index reader and
     * stored, together with its score, as a DocResult without hits.
     *
     * @param searcher the searcher that generated the results
     * @param scorer the scorer to read document results from, or null for an
     *            empty result set
     * @throws RuntimeException wrapping any checked exception (e.g.
     *         IOException) raised while reading from the scorer or the index;
     *         runtime exceptions are passed on as they are
     */
    DocResults(Searcher searcher, Scorer scorer) {
        this.searcher = searcher;
        if (scorer == null) {
            return; // no matches, empty result set
        }
        try {
            IndexReader indexReader = searcher.getIndexReader();
            int docId = scorer.nextDoc();
            while (docId != DocIdSetIterator.NO_MORE_DOCS) {
                Document d = indexReader.document(docId);
                results.add(new DocResult(searcher, null, docId, d, scorer.score()));
                docId = scorer.nextDoc();
            }
        } catch (RuntimeException e) {
            // Already unchecked: don't bury it under another RuntimeException
            throw e;
        } catch (Exception e) {
            // Wrap checked exceptions exactly once
            throw new RuntimeException(e);
        }
    }

    /**
     * Collect all documents matching a (metadata) query.
     *
     * Every matching document is loaded and stored as a DocResult without hits.
     *
     * @param searcher searcher object
     * @param query metadata query, or null to match all documents
     * @deprecated use Searcher.queryDocuments(); this constructor will be made package-private eventually
     */
    @Deprecated
    public DocResults(Searcher searcher, Query query) {
        this.searcher = searcher;

        // FIXME: a better approach is to only read documents we're actually interested in
        //    instead of all of them; compare with Hits.
        //    Even better: make DocResults abstract and provide two implementations,
        //    DocResultsFromHits and DocResultsFromQuery.
        Collector documentCollector = new Collector() {
            /** Index segment the doc ids passed to collect() are relative to */
            AtomicReaderContext currentSegment = null;

            @Override
            public void setNextReader(AtomicReaderContext reader) throws IOException {
                currentSegment = reader;
            }

            @Override
            public void setScorer(Scorer scorer) throws IOException {
                // (ignore)
            }

            @Override
            public boolean acceptsDocsOutOfOrder() {
                return true;
            }

            @Override
            public void collect(int docId) throws IOException {
                // docId is segment-relative; add the segment's base for the index-wide id
                int globalDocId = currentSegment.docBase + docId;
                results.add(new DocResult(DocResults.this.searcher, null, globalDocId,
                        currentSegment.reader().document(docId)));
            }
        };
        searcher.collectDocuments(query, documentCollector);
    }

    /**
     * Construct an empty DocResults object that is not backed by hits.
     *
     * @param searcher the searcher object to associate with the results
     */
    DocResults(Searcher searcher) {
        this.searcher = searcher;
    }

    /**
     * Get the list of results.
     *
     * Reads all remaining results first. If the thread is interrupted while
     * doing so, the results gathered so far are returned and the thread's
     * interrupt status is restored so the caller can detect the interruption.
     *
     * NOTE: the internal list itself is returned, not a copy.
     *
     * @return the list of results
     * @deprecated Breaks optimizations. Use iterator or subList() instead.
     */
    @Deprecated
    public List<DocResult> getResults() {
        try {
            ensureAllResultsRead();
        } catch (InterruptedException e) {
            // Return the results we have, but restore the interrupt status:
            // otherwise the caller cannot detect the interruption.
            Thread.currentThread().interrupt();
        }
        return results;
    }

    /**
     * Sort the results using the given comparator.
     *
     * Reads all remaining results first. If the thread is interrupted while
     * doing so, only the results gathered so far are sorted and the thread's
     * interrupt status is restored so the caller can detect the interruption.
     *
     * @param comparator
     *            how to sort the results
     */
    void sort(Comparator<DocResult> comparator) {
        try {
            ensureAllResultsRead();
        } catch (InterruptedException e) {
            // Just sort the results we have, but restore the interrupt status:
            // otherwise the caller cannot detect the interruption.
            Thread.currentThread().interrupt();
        }
        Collections.sort(results, comparator);
    }

    /**
     * Determines if there are at least a certain number of results
     *
     * This may be used if we don't want to process all results (which
     * may be a lot) but we do need to know something about the size
     * of the result set (such as for paging).
     *
     * Only as many results as needed to answer the question are read. If the
     * thread is interrupted while reading, the answer is based on the results
     * gathered so far and the thread's interrupt status is restored so the
     * caller can detect the interruption.
     *
     * @param lowerBound the number we're testing against
     *
     * @return true if the size of this set is at least lowerBound, false otherwise.
     */
    public boolean sizeAtLeast(int lowerBound) {
        if (lowerBound <= 0) {
            // Trivially true. Also avoids passing a negative index to
            // ensureResultsRead(), which would mean "read ALL results".
            return true;
        }
        try {
            // A result at index lowerBound - 1 exists iff there are at least
            // lowerBound results, so there's no need to read any further.
            ensureResultsRead(lowerBound - 1);
        } catch (InterruptedException e) {
            // Abort reading and answer based on what we have; restore the
            // interrupt status so the client can decide what to do.
            Thread.currentThread().interrupt();
        }

        return results.size() >= lowerBound;
    }

    /**
     * Get the number of documents in this results set.
     *
     * Note that this returns the number of document results available;
     * if there were so many hits that not all were retrieved (call
     * maxHitsRetrieved()), you can find the grand total of documents
     * by calling totalSize().
     *
     * All remaining results are read first. If the thread is interrupted
     * while doing so, the number of results gathered so far is returned and
     * the thread's interrupt status is restored so the caller can detect the
     * interruption.
     *
     * @return the number of documents.
     */
    public int size() {
        // Make sure we've collected all results and return the size of our result list.
        try {
            ensureAllResultsRead();
        } catch (InterruptedException e) {
            // Return size of the results we have, but restore the interrupt
            // status: otherwise the caller cannot detect the interruption.
            Thread.currentThread().interrupt();
        }
        return results.size();
    }

    /**
     * Get the total number of documents, including documents that weren't
     * retrieved because the set of hits was too large.
     *
     * If these results are not backed by hits, this is the same as size().
     *
     * @return the total number of documents.
     */
    public int totalSize() {
        // Without source hits there are just documents, all of them in our list
        return sourceHits != null ? sourceHits.totalNumberOfDocs() : size();
    }

    /**
     * Sort the document results by the value of a document property.
     *
     * @param prop the property to sort on
     * @param sortReverse true iff the order should be reversed
     */
    public void sort(DocProperty prop, boolean sortReverse) {
        final Comparator<DocResult> byProperty = new ComparatorDocProperty(prop);
        final Comparator<DocResult> ordering;
        if (sortReverse) {
            ordering = new ReverseComparator<DocResult>(byProperty);
        } else {
            ordering = byProperty;
        }
        sort(ordering);
    }

    /**
     * Retrieve a sublist of document results.
     *
     * Only as many results as needed for the requested range are read. If the
     * thread is interrupted while reading, an empty list is returned and the
     * thread's interrupt status is restored so the caller can detect the
     * interruption.
     *
     * @param fromIndex first result to include in the resulting list
     * @param toIndex first result not to include in the resulting list
     * @return the sublist (a view on the internal result list)
     * @throws IndexOutOfBoundsException if the range is invalid or extends
     *         beyond the available results (see List.subList())
     */
    public List<DocResult> subList(int fromIndex, int toIndex) {
        // Only read when the range can contain something. For toIndex == 0,
        // toIndex - 1 is negative, which ensureResultsRead() interprets as
        // "read ALL results" - exactly what this method is meant to avoid.
        if (toIndex > 0) {
            try {
                ensureResultsRead(toIndex - 1);
            } catch (InterruptedException e) {
                // We may not even have read the first result in the sublist,
                // so just return an empty list. Restore the interrupt status
                // so the caller can detect the interruption.
                Thread.currentThread().interrupt();
                return Collections.emptyList();
            }
        }
        return results.subList(fromIndex, toIndex);
    }

    /**
     * Read whatever is left of our source Hits object (if anything) and add
     * the corresponding document results.
     *
     * @throws InterruptedException passed on from ensureResultsRead()
     */
    private void ensureAllResultsRead() throws InterruptedException {
        // A negative index tells ensureResultsRead() to read everything
        final int readEverything = -1;
        ensureResultsRead(readEverything);
    }

    /**
     * If we still have only partially read our Hits object,
     * read some more of it and add the corresponding document results.
     *
     * Consecutive hits with the same document id are gathered into one
     * DocResult. Reading continues until the result list holds more than
     * index entries (i.e. a result exists at position index) or the source
     * hits are exhausted. If reading stops while hits remain, the hits
     * gathered for the current document are stored in partialDocHits /
     * partialDocId and picked up again by the next call.
     *
     * Does nothing if there are no source hits or they have been fully read.
     *
     * @param index index of the result that should be available afterwards
     *            (if it exists), or negative to read all results
     * @throws InterruptedException declared for the callers to handle
     *             (NOTE(review): presumably raised by the Hits code called
     *             from here when the thread is interrupted - confirm)
     * @throws RuntimeException wrapping an IOException raised while reading
     *             from the index
     */
    synchronized void ensureResultsRead(int index) throws InterruptedException {
        if (sourceHitsFullyRead())
            return;

        try {
            synchronized (sourceHitsIterator) {
                // Fill list of document results.
                // Resume with the document we were working on when we stopped
                // last time (if any) and clear the saved state.
                int doc = partialDocId;
                List<Hit> docHits = partialDocHits;
                partialDocId = -1;
                partialDocHits = null;

                IndexReader indexReader = searcher == null ? null : searcher.getIndexReader();
                // Keep reading while we want all results (negative index) or
                // don't have a result at position index yet
                while ((index < 0 || results.size() <= index) && sourceHitsIterator.hasNext()) {

                    Hit hit = sourceHitsIterator.next();
                    if (hit.doc != doc) {
                        // First hit of a different document: the previous
                        // document (if any) is complete, so store its result
                        if (docHits != null) {
                            Hits hits = new Hits(searcher, docHits);
                            hits.copySettingsFrom(sourceHits); // concordance type, etc.
                            addDocResultToList(doc, hits, indexReader);
                        }
                        doc = hit.doc;
                        docHits = new ArrayList<Hit>();
                    }
                    docHits.add(hit);
                }
                // Deal with the hits of the document we were working on last
                if (docHits != null) {
                    if (sourceHitsIterator.hasNext()) {
                        // More hits may follow for this document
                        partialDocId = doc;
                        partialDocHits = docHits; // not done, continue from here later
                    } else {
                        // No more hits: add the final document result to the list
                        Hits hits = new Hits(searcher, docHits);
                        hits.copySettingsFrom(sourceHits); // concordance type, etc.
                        addDocResultToList(doc, hits, indexReader);
                    }
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Create the result for one document and append it to the result list.
     *
     * @param doc id of the document
     * @param docHits the hits found in the document
     * @param indexReader reader to load the document from, or null to create
     *            the result without a document
     * @throws IOException if loading the document fails
     */
    private void addDocResultToList(int doc, Hits docHits, IndexReader indexReader) throws IOException {
        Document document = null;
        if (indexReader != null) {
            document = indexReader.document(doc);
        }
        DocResult newResult = new DocResult(searcher, sourceHits.getConcordanceFieldName(), doc,
                document, docHits);
        // Make sure we remember what kind of context we have, if any
        String contextField = sourceHits.getContextFieldPropName();
        newResult.setContextField(contextField);
        results.add(newResult);
    }

    /**
     * Did we stop retrieving hits because there were too many?
     *
     * Simply returns the value of maxHitsRetrieved().
     *
     * @return true if we reached the maximum and stopped retrieving hits
     *         (so NOT all hits were retrieved)
     * @deprecated renamed to maxHitsRetrieved()
     */
    @Deprecated
    public boolean tooManyHits() {
        return maxHitsRetrieved();
    }

    /**
     * Tells whether retrieving hits was cut short because the maximum number
     * of hits to retrieve was reached.
     *
     * @return true if we reached the maximum and stopped retrieving hits;
     *         always false if these results are not backed by hits
     */
    public boolean maxHitsRetrieved() {
        // no source hits means: only docs, so no maximum can have been reached
        return sourceHits != null && sourceHits.maxHitsRetrieved();
    }

    /**
     * Tells whether counting hits was cut short because the maximum number
     * of hits to count was reached.
     *
     * @return true if we reached the maximum and stopped counting hits;
     *         always false if these results are not backed by hits
     */
    public boolean maxHitsCounted() {
        if (sourceHits != null) {
            return sourceHits.maxHitsCounted();
        }
        // no hits, only docs
        return false;
    }

    /**
     * Return an iterator over these document results.
     *
     * The iterator reads results from the source hits on demand, so iterating
     * over just the first few results does not require processing all hits.
     * If the thread is interrupted while reading, the iterator acts as if
     * there are no more results and restores the thread's interrupt status so
     * the caller can detect the interruption. Removing is not supported.
     *
     * @return the iterator
     */
    @Override
    public Iterator<DocResult> iterator() {
        // Construct a custom iterator that iterates over the result list,
        // but also takes into account that the source hits may not have
        // been fully read yet.
        return new Iterator<DocResult>() {

            /** Index of the result most recently returned by next(); -1 if none yet */
            int index = -1;

            @Override
            public boolean hasNext() {
                // Make sure the next result has been read, if there is one
                try {
                    ensureResultsRead(index + 1);
                } catch (InterruptedException e) {
                    // Act like we're done, but restore the interrupt status:
                    // otherwise the caller cannot tell an interruption from
                    // the regular end of the results.
                    Thread.currentThread().interrupt();
                    return false;
                }
                return index + 1 < results.size();
            }

            @Override
            public DocResult next() {
                // Check if there is a next, taking unread source hits into account
                if (hasNext()) {
                    index++;
                    return results.get(index);
                }
                throw new NoSuchElementException();
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }

        };
    }

    /**
     * Get the document result at the specified position.
     *
     * Only as many results as needed are read. If the thread is interrupted
     * while reading and the requested result hasn't been gathered yet, null
     * is returned; the thread's interrupt status is restored so the caller
     * can detect the interruption.
     *
     * @param i index of the desired result (0-based)
     * @return the result, or null if there is no result at that position
     * @throws IndexOutOfBoundsException if i is negative
     */
    public DocResult get(int i) {
        // A negative index can never be valid, and ensureResultsRead() would
        // interpret it as "read ALL results", so don't read anything for it;
        // results.get(i) below rejects it.
        if (i >= 0) {
            try {
                ensureResultsRead(i);
            } catch (InterruptedException e) {
                // The required result may not have been gathered, in which
                // case we return null below. Restore the interrupt status so
                // the caller can detect the interruption.
                Thread.currentThread().interrupt();
            }
        }
        if (i >= results.size())
            return null;
        return results.get(i);
    }

    /**
     * Group these document results on a document property.
     *
     * @param docProp the document property to group on (i.e. number of hits in doc, value of metadata field, etc.)
     * @return the grouped results
     */
    @SuppressWarnings("deprecation") // DocGroups constructor will be made package private eventually
    public DocGroups groupedBy(DocProperty docProp) {
        DocGroups groups = new DocGroups(this, docProp);
        return groups;
    }

    /**
     * Create a window (a limited view) into these document results.
     *
     * @param first first document result to include
     * @param number maximum number of document results to include
     * @return the window
     */
    @SuppressWarnings("deprecation") // DocResultsWindow constructor will be made package private eventually
    public DocResultsWindow window(int first, int number) {
        DocResultsWindow resultsWindow = new DocResultsWindow(this, first, number);
        return resultsWindow;
    }

    /**
     * Get the Hits object these document results are based on.
     *
     * @return the source hits, or null if these results are not backed by hits
     */
    public Hits getOriginalHits() {
        return this.sourceHits;
    }

    /**
     * Count how many results share each value of the specified property.
     * Basically a grouping operation without storing the results.
     * Used for e.g. faceted search.
     *
     * @param countBy property to count
     * @return the counts
     */
    public DocCounts countBy(DocProperty countBy) {
        DocCounts counts = new DocCounts(this, countBy);
        return counts;
    }

    /**
     * Sum a property for all the documents.
     *
     * Can be used to calculate the total number of tokens in a subcorpus, for example.
     * Note that this does retrieve all results, so it may be slow for large sets.
     * In particular, you should try to call this method only for DocResults created with
     * Searcher.queryDocuments() (and not ones created with Hits.perDocResults()) to avoid
     * the overhead of fetching hits.
     *
     * If the thread is interrupted while retrieving the results, only the
     * results gathered so far are summed and the thread's interrupt status is
     * restored so the caller can detect the interruption.
     *
     * @param numProp a numeric property to sum; its values must be
     *            HitPropValueInt instances
     * @return the sum
     * @throws ClassCastException if the property yields a value that is not a
     *             HitPropValueInt
     */
    public int intSum(DocProperty numProp) {
        try {
            ensureAllResultsRead();
        } catch (InterruptedException e) {
            // Just process the results we have, but restore the interrupt
            // status: otherwise the caller cannot detect the interruption.
            Thread.currentThread().interrupt();
        }
        int sum = 0;
        for (DocResult result : results) {
            sum += ((HitPropValueInt) numProp.get(result)).getValue();
        }
        return sum;
    }

    /**
     * Set the priority level on the source hits.
     *
     * Does nothing if these results are not backed by hits.
     *
     * @param level the priority level to pass on to the source hits
     */
    public void setPriorityLevel(Level level) {
        if (sourceHits == null) {
            return; // no hits, only docs: nothing to set the level on
        }
        sourceHits.setPriorityLevel(level);
    }

    /**
     * Get the priority level of the source hits.
     *
     * @return the priority level reported by the source hits, or null if
     *         these results are not backed by hits (in which case there is
     *         no level to report)
     */
    public Level getPriorityLevel() {
        // Like setPriorityLevel() and the other delegating methods, take
        // into account that there may be no source hits (only docs) instead
        // of failing with a NullPointerException.
        if (sourceHits == null) {
            return null;
        }
        return sourceHits.getPriorityLevel();
    }

    /**
     * Get the number of documents counted so far, without reading any
     * further results.
     *
     * @return the count reported by the source hits, or the current size of
     *         the result list if these results are not backed by hits
     */
    public int countSoFarDocsCounted() {
        if (sourceHits != null) {
            return sourceHits.countSoFarDocsCounted();
        }
        return results.size();
    }

    /**
     * Get the number of documents retrieved so far, without reading any
     * further results.
     *
     * @return the count reported by the source hits, or the current size of
     *         the result list if these results are not backed by hits
     */
    public int countSoFarDocsRetrieved() {
        if (sourceHits != null) {
            return sourceHits.countSoFarDocsRetrieved();
        }
        return results.size();
    }
}