nl.inl.blacklab.search.lucene.SpansInBucketsAbstract.java Source code

Introduction

Here is the source code for nl.inl.blacklab.search.lucene.SpansInBucketsAbstract.java
Source

/*******************************************************************************
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package nl.inl.blacklab.search.lucene;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.spans.Spans;
import nl.inl.blacklab.search.Hit;
import nl.inl.blacklab.search.Span;

/**
 * Wrap a Spans to retrieve sequences of certain matches (in "buckets"), so we can process the
 * sequence efficiently.
 *
 * Examples of sequences of hits might be:
 * * all hits in a document
 * * all consecutive hits in a document
 *
 * This way we can retrieve hits and perform some operation on them (like sorting or retrieving some
 * extra information).
 *
 * Note that with this class, "bucketing" is only possible with consecutive hits from the Spans
 * object. If you want other kinds of hit buckets (containing non-consecutive spans), you should
 * just implement the SpansInBuckets interface, not extend SpansInBucketsAbstract.
 *
 * Also, SpansInBuckets assumes all hits in a bucket are from a single document.
 *
 */
abstract class SpansInBucketsAbstract implements SpansInBuckets {
    protected BLSpans source;

    protected int currentDoc = -1;

    private List<Hit> bucket = new ArrayList<>();

    /**
     * For each hit we fetched, store the captured groups, so we don't
     * lose this information.
     */
    private Map<Hit, Span[]> capturedGroupsPerHit = new HashMap<>();

    /**
     * Size of the current bucket, or -1 if we're not at a valid bucket.
     */
    private int bucketSize = -1;

    private HitQueryContext hitQueryContext;

    /** Is there captured group information for each hit that we need to store? */
    private boolean doCapturedGroups;

    /** Does our clause capture any groups? If not, we don't need to mess with those */
    protected boolean clauseCapturesGroups = true;

    protected void addHitFromSource() {
        Hit hit = source.getHit();
        bucket.add(hit);
        if (doCapturedGroups) {
            // Store captured group information
            Span[] capturedGroups = new Span[hitQueryContext.numberOfCapturedGroups()];
            source.getCapturedGroups(capturedGroups);
            capturedGroupsPerHit.put(hit, capturedGroups);
        }
        bucketSize++;
    }

    protected void sortHits(Comparator<Hit> hitComparator) {
        Collections.sort(bucket, hitComparator);
    }

    @Override
    public int bucketSize() {
        return bucketSize;
    }

    @Override
    public int startPosition(int indexInBucket) {
        return bucket.get(indexInBucket).start;
    }

    @Override
    public int endPosition(int indexInBucket) {
        return bucket.get(indexInBucket).end;
    }

    @Override
    public Hit getHit(int indexInBucket) {
        return bucket.get(indexInBucket);
    }

    public SpansInBucketsAbstract(BLSpans source) {
        this.source = source;
    }

    @Override
    public int nextDoc() throws IOException {
        bucketSize = -1; // not at a valid bucket anymore
        if (currentDoc != DocIdSetIterator.NO_MORE_DOCS) {
            currentDoc = source.nextDoc();
            if (currentDoc != DocIdSetIterator.NO_MORE_DOCS) {
                source.nextStartPosition(); // start gathering at the first hit
                //gatherHitsInternal();
            }
        }
        return currentDoc;
    }

    @Override
    public int nextBucket() throws IOException {
        if (currentDoc < 0) {
            // Not nexted yet, no bucket
            return -1;
        }
        if (currentDoc == DocIdSetIterator.NO_MORE_DOCS || source.startPosition() == Spans.NO_MORE_POSITIONS)
            return NO_MORE_BUCKETS;
        return gatherHitsInternal();
    }

    /**
     * Subclasses should override this to gather the hits they wish to put in the next bucket.
     *
     * Upon entering this method, the source spans is at the last unused hit (or the first hit in a
     * new document). At the end, it should be at the first hit that doesn't fit in the bucket (or
     * beyond the last hit, i.e. Spans.NO_MORE_POSITIONS).
     *
     * @throws IOException
     */
    protected abstract void gatherHits() throws IOException;

    @Override
    public int advance(int target) throws IOException {
        bucketSize = -1; // not at a valid bucket anymore
        if (currentDoc != DocIdSetIterator.NO_MORE_DOCS) {
            if (currentDoc >= target)
                nextDoc();
            else {
                currentDoc = source.advance(target);
                if (currentDoc != DocIdSetIterator.NO_MORE_DOCS) {
                    source.nextStartPosition(); // start gathering at the first hit
                    //gatherHitsInternal();
                }
            }
        }
        return currentDoc;
    }

    private int gatherHitsInternal() throws IOException {
        // NOTE: we could call .clear() here, but we don't want to hold on to
        // a lot of memory indefinitely after encountering one huge bucket.
        if (bucket.size() < ARRAYLIST_REALLOC_THRESHOLD) {
            // Not a huge amount of memory, so don't reallocate
            bucket.clear();
            capturedGroupsPerHit.clear();
        } else {
            // Reallocate in this case to avoid holding on to a lot of memory
            bucket = new ArrayList<>();
            capturedGroupsPerHit = new HashMap<>();
        }

        bucketSize = 0;
        doCapturedGroups = clauseCapturesGroups && hitQueryContext != null
                && hitQueryContext.numberOfCapturedGroups() > 0;
        gatherHits();
        return currentDoc;
    }

    @Override
    public int docID() {
        return currentDoc;
    }

    @Override
    public String toString() {
        return source.toString();
    }

    @Override
    public void setHitQueryContext(HitQueryContext context) {
        this.hitQueryContext = context;
        int before = context.getCaptureRegisterNumber();
        source.setHitQueryContext(context);
        if (context.getCaptureRegisterNumber() == before) {
            // Our clause doesn't capture any groups; optimize
            clauseCapturesGroups = false;
        }
    }

    @Override
    public void getCapturedGroups(int indexInBucket, Span[] capturedGroups) {
        if (!doCapturedGroups)
            return;
        Span[] previouslyCapturedGroups = capturedGroupsPerHit.get(bucket.get(indexInBucket));
        if (previouslyCapturedGroups != null) {
            for (int i = 0; i < capturedGroups.length; i++) {
                if (previouslyCapturedGroups[i] != null)
                    capturedGroups[i] = previouslyCapturedGroups[i];
            }
        }
    }

}