Source code for org.apache.lucene.analysis.ngram.NGramTokenizer.java

Java tutorial

Introduction

Below is the complete source code of the NGramTokenizer class from the org.apache.lucene.analysis.ngram package.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ngram;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;

/**
 * Tokenizes the input into n-grams of the given size(s).
 * <p>On the contrary to {@link NGramTokenFilter}, this class sets offsets so
 * that characters between startOffset and endOffset in the original stream are
 * the same as the term chars.
 * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
 * <table summary="ngram tokens example">
 * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
 * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
 * </table>
 * <a name="version"></a>
 * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
 * <li>tokenize in a streaming fashion to support streams which are larger
 * than 1024 chars (limit of the previous version),
 * <li>count grams based on unicode code points instead of java chars (and
 * never split in the middle of surrogate pairs),
 * <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
 * before computing n-grams.</ul>
 * <p>Additionally, this class doesn't trim trailing whitespaces and emits
 * tokens in a different order, tokens are now emitted by increasing start
 * offsets while they used to be emitted by increasing lengths (which prevented
 * from supporting large input streams).
 */
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class NGramTokenizer extends Tokenizer {
    /** Default smallest n-gram size, used by the no-arg constructor. */
    public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
    /** Default largest n-gram size, used by the no-arg constructor. */
    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

    private CharacterUtils.CharacterBuffer charBuffer; // raw chars read from the input Reader
    private int[] buffer; // like charBuffer, but converted to code points
    private int bufferStart, bufferEnd; // remaining slice in buffer
    private int offset; // char (not code point) offset of buffer[bufferStart] in the original stream
    private int gramSize; // length, in code points, of the next gram to emit at bufferStart
    private int minGram, maxGram;
    private boolean exhausted; // true once the underlying Reader has been fully drained
    private int lastCheckedChar; // last offset in the buffer that we checked
    private int lastNonTokenChar; // last offset that we found to not be a token char
    private boolean edgesOnly; // leading edges n-grams only

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    // Package-private: also controls edge-only behavior. Presumably reserved
    // for an edge-n-gram subclass — not visible in this file.
    NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) {
        init(minGram, maxGram, edgesOnly);
    }

    /**
     * Creates NGramTokenizer with given min and max n-grams.
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public NGramTokenizer(int minGram, int maxGram) {
        this(minGram, maxGram, false);
    }

    // Package-private variant of the factory-based constructor that also
    // controls edge-only behavior (see the three-arg constructor above).
    NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
        super(factory);
        init(minGram, maxGram, edgesOnly);
    }

    /**
     * Creates NGramTokenizer with given min and max n-grams.
     * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public NGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
        this(factory, minGram, maxGram, false);
    }

    /**
     * Creates NGramTokenizer with default min and max n-grams.
     */
    public NGramTokenizer() {
        this(DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
    }

    /**
     * Shared constructor logic: validates the gram range and sizes the
     * internal buffers so that a full maxGram window always fits.
     *
     * @throws IllegalArgumentException if {@code minGram < 1} or
     *         {@code minGram > maxGram}
     */
    private void init(int minGram, int maxGram, boolean edgesOnly) {
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        this.edgesOnly = edgesOnly;
        charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
        buffer = new int[charBuffer.getBuffer().length];
        // Make the term att large enough (2 chars per code point in the worst
        // case, i.e. when every code point is a surrogate pair)
        termAtt.resizeBuffer(2 * maxGram);
    }

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        // termination of this loop is guaranteed by the fact that every iteration
        // either advances the buffer (calls consumes()) or increases gramSize
        while (true) {
            // compact: slide the unconsumed slice to the front of the buffer and
            // refill from the Reader, so a full maxGram-sized window is available
            // until the input is exhausted
            if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
                System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
                bufferEnd -= bufferStart;
                // keep the token-char bookkeeping in sync with the shifted slice
                lastCheckedChar -= bufferStart;
                lastNonTokenChar -= bufferStart;
                bufferStart = 0;

                // fill in remaining space
                exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
                // convert to code points
                bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer,
                        bufferEnd);
            }

            // should we go to the next offset? (either all gram sizes for the
            // current start were emitted, or the window is too short)
            if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
                if (bufferStart + 1 + minGram > bufferEnd) {
                    // not even a minGram-sized gram fits after advancing: done
                    assert exhausted;
                    return false;
                }
                consume();
                gramSize = minGram;
            }

            updateLastNonTokenChar();

            // retry if the token to be emitted was going to not only contain token chars
            final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart
                    && lastNonTokenChar < (bufferStart + gramSize);
            // for edge-only grams, the gram must start right after a non-token
            // char (or at the very beginning, thanks to the sentinel set in reset())
            final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
            if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
                consume();
                gramSize = minGram;
                continue;
            }

            // emit the gram; length is in chars, which is also the unit offset
            // is tracked in, so [offset, offset + length) matches the original stream
            final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
            termAtt.setLength(length);
            posIncAtt.setPositionIncrement(1);
            posLenAtt.setPositionLength(1);
            offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
            ++gramSize;
            return true;
        }
    }

    // Scans only the not-yet-checked tail of the current gram (everything up to
    // lastCheckedChar was examined on a previous call) and records the highest
    // offset holding a non-token char.
    private void updateLastNonTokenChar() {
        final int termEnd = bufferStart + gramSize - 1;
        if (termEnd > lastCheckedChar) {
            // scan backwards so the first hit is the highest such offset
            for (int i = termEnd; i > lastCheckedChar; --i) {
                if (!isTokenChar(buffer[i])) {
                    lastNonTokenChar = i;
                    break;
                }
            }
            lastCheckedChar = termEnd;
        }
    }

    /** Consume one code point. */
    private void consume() {
        // advance offset by the number of Java chars this code point occupies (1 or 2)
        offset += Character.charCount(buffer[bufferStart++]);
    }

    /**
     * Only collect characters which satisfy this condition.
     * Default accepts every code point; subclasses override to pre-tokenize.
     */
    protected boolean isTokenChar(int chr) {
        return true;
    }

    /** Sets the final offset to the (corrected) char length of the whole input. */
    @Override
    public final void end() throws IOException {
        super.end();
        assert bufferStart <= bufferEnd;
        // account for any buffered-but-unemitted code points past offset
        int endOffset = offset;
        for (int i = bufferStart; i < bufferEnd; ++i) {
            endOffset += Character.charCount(buffer[i]);
        }
        endOffset = correctOffset(endOffset);
        // set final offset
        offsetAtt.setOffset(endOffset, endOffset);
    }

    @Override
    public final void reset() throws IOException {
        super.reset();
        // mark the buffer empty by pointing both ends past it, so the first
        // incrementToken() call triggers a compact-and-refill
        bufferStart = bufferEnd = buffer.length;
        // sentinel: pretend the char just before the window is a non-token char,
        // which lets the edgesOnly check accept a gram at the very start
        lastNonTokenChar = lastCheckedChar = bufferStart - 1;
        offset = 0;
        gramSize = minGram;
        exhausted = false;
        charBuffer.reset();
    }
}