nl.inl.blacklab.search.lucene.SpansNGrams.java Source code

Introduction

Here is the source code for nl.inl.blacklab.search.lucene.SpansNGrams.java
Source

/*******************************************************************************
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package nl.inl.blacklab.search.lucene;

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.Bits;

import nl.inl.blacklab.search.Span;

/**
 * Return all n-grams of certain lengths.
 */
class SpansNGrams extends BLSpans {
    /** Current document */
    private int currentDoc = -1;

    /** Current document length */
    private long currentDocLength = -1;

    /** Current hit start position */
    private int currentStart = -1;

    /** Current hit end position */
    private int currentEnd = -1;

    /** For testing, we don't have an IndexReader available, so we use test values */
    private boolean useTestValues = false;

    /** Used to get the field length in tokens for a document */
    DocFieldLengthGetter lengthGetter;

    /** How much to subtract from length (for ignoring closing token) */
    private int subtractFromLength;

    /** Highest document id plus one */
    private int maxDoc;

    /** Documents that haven't been deleted */
    private Bits liveDocs;

    private boolean alreadyAtFirstMatch = false;

    private int min;

    private int max;

    /** For testing, we don't have an IndexReader available, so we use test values.
     *
     *  The test values are: there are 3 documents (0, 1 and 2) and each is 5 tokens long.
     *
     * @param test whether or not we want to use test values
     * @param maxDoc number of docs in the (mock) test set
     */
    void setTest(boolean test, int maxDoc) {
        useTestValues = test;
        if (useTestValues)
            this.maxDoc = maxDoc;
        lengthGetter.setTest(test);
    }

    /**
     * Constructs a SpansNGrams
     * @param ignoreLastToken if true, we assume the last token is always a special closing token and ignore it
     * @param reader the index reader, for getting field lengths
     * @param fieldName the field name, for getting field lengths
     * @param min minimum n-gram length
     * @param max maximum n-gram length
     */
    public SpansNGrams(boolean ignoreLastToken, LeafReader reader, String fieldName, int min, int max) {
        maxDoc = reader == null ? -1 : reader.maxDoc();
        liveDocs = reader == null ? null : MultiFields.getLiveDocs(reader);
        subtractFromLength = ignoreLastToken ? 1 : 0;
        this.lengthGetter = new DocFieldLengthGetter(reader, fieldName);
        this.min = min;
        this.max = max;
    }

    /**
     * @return the Lucene document id of the current hit
     */
    @Override
    public int docID() {
        return currentDoc;
    }

    /**
     * @return end position of current hit
     */
    @Override
    public int endPosition() {
        if (alreadyAtFirstMatch)
            return -1; // .nextStartPosition() not called yet by client
        return currentEnd;
    }

    @Override
    public int nextDoc() throws IOException {
        alreadyAtFirstMatch = false;
        do {
            if (currentDoc >= maxDoc) {
                currentDoc = NO_MORE_DOCS;
                currentStart = currentEnd = NO_MORE_POSITIONS;
                return NO_MORE_DOCS;
            }
            boolean currentDocIsDeletedDoc;
            do {
                currentDoc++;
                currentDocIsDeletedDoc = liveDocs != null && !liveDocs.get(currentDoc);
            } while (currentDoc < maxDoc && currentDocIsDeletedDoc);
            if (currentDoc > maxDoc)
                throw new RuntimeException("currentDoc > maxDoc!!");
            if (currentDoc == maxDoc) {
                currentDoc = NO_MORE_DOCS;
                currentStart = currentEnd = NO_MORE_POSITIONS;
                return NO_MORE_DOCS; // no more docs; we're done
            }
            currentDocLength = lengthGetter.getFieldLength(currentDoc) - subtractFromLength;
            currentStart = currentEnd = -1;
        } while (nextStartPosition() == NO_MORE_POSITIONS);
        alreadyAtFirstMatch = true;

        return currentDoc;
    }

    /**
     * Go to next span.
     *
     * @return true if we're at the next span, false if we're done
     * @throws IOException
     */
    @Override
    public int nextStartPosition() throws IOException {
        if (alreadyAtFirstMatch) {
            alreadyAtFirstMatch = false;
            return currentStart;
        }

        if (currentDoc == NO_MORE_DOCS || currentStart == NO_MORE_POSITIONS) {
            return NO_MORE_POSITIONS;
        }

        if (currentDoc < 0)
            return -1; // haven't started

        // Next N-gram
        if (currentStart < 0 || currentEnd - currentStart >= max || currentEnd >= currentDocLength) {
            currentStart++;
            currentEnd = currentStart + min;
            if (currentEnd > currentDocLength) {
                currentStart = currentEnd = NO_MORE_POSITIONS;
                return NO_MORE_POSITIONS;
            }
        } else {
            currentEnd++;
        }
        return currentStart;
    }

    @Override
    public int advanceStartPosition(int target) throws IOException {
        if (alreadyAtFirstMatch) {
            alreadyAtFirstMatch = false;
            if (currentStart >= target)
                return currentStart;
        }
        if (target >= currentDocLength) {
            currentStart = currentEnd = NO_MORE_POSITIONS;
            return NO_MORE_POSITIONS;
        }
        // Advance us to just before the requested start point, then call nextStartPosition().
        currentStart = target - 1;
        currentEnd = currentStart + max;
        return nextStartPosition();
    }

    /**
     * Skip to the specified document (or the first document after it containing hits).
     *
     * @param doc
     *            the doc number to skip to (or past)
     * @return true if we're still pointing to a valid hit, false if we're done
     * @throws IOException
     */
    @Override
    public int advance(int doc) throws IOException {
        alreadyAtFirstMatch = false;
        if (currentDoc == NO_MORE_DOCS)
            return NO_MORE_DOCS;
        if (doc >= maxDoc) {
            currentDoc = NO_MORE_DOCS;
            currentStart = currentEnd = NO_MORE_POSITIONS;
            return NO_MORE_DOCS;
        }

        if (currentDoc >= doc) {
            // We can't skip to it because we're already there or beyond.
            // But, as per spec, advance always at least advances to the next document.
            return nextDoc();
        }

        // Advance to first livedoc containing matches at or after requested docID
        currentDoc = doc - 1;
        nextDoc();
        return currentDoc;
    }

    /**
     * @return start of current span
     */
    @Override
    public int startPosition() {
        if (alreadyAtFirstMatch)
            return -1; // .nextStartPosition() not called yet by client
        return currentStart;
    }

    @Override
    public String toString() {
        return "AnyToken()";
    }

    @Override
    public boolean hitsEndPointSorted() {
        return min == max;
    }

    @Override
    public boolean hitsStartPointSorted() {
        return true;
    }

    @Override
    public boolean hitsAllSameLength() {
        return min == max;
    }

    @Override
    public int hitsLength() {
        return min;
    }

    @Override
    public boolean hitsHaveUniqueStart() {
        return min == max;
    }

    @Override
    public boolean hitsHaveUniqueEnd() {
        return min == max;
    }

    @Override
    public boolean hitsAreUnique() {
        return true;
    }

    @Override
    protected void passHitQueryContextToClauses(HitQueryContext context) {
        // no clause, nothing to do
    }

    @Override
    public void getCapturedGroups(Span[] capturedGroups) {
        // no clause, no groups
    }

}