nl.inl.blacklab.index.complex.TokenStreamWithOffsets.java Source code

Java tutorial

Introduction

Here is the source code for nl.inl.blacklab.index.complex.TokenStreamWithOffsets.java

Source

/*******************************************************************************
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
/**
 *
 */
package nl.inl.blacklab.index.complex;

import java.util.Iterator;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * A {@link TokenStream} that replays pre-computed tokens held in parallel lists.
 *
 * Takes a {@code List<String>} of terms plus three {@code List<Integer>}'s (position
 * increments, start character offsets and end character offsets) and iterates through
 * them in lockstep: the n-th token gets the n-th entry of each list.
 *
 * The stream ends when the term list is exhausted. The three integer lists must each
 * supply at least as many entries as the term list; otherwise
 * {@link #incrementToken()} fails with a {@link java.util.NoSuchElementException}
 * when the shorter list runs out.
 */
class TokenStreamWithOffsets extends TokenStream {
    /**
     * Term text of the current token
     */
    protected CharTermAttribute termAttr;

    /**
     * Position increment of the current token
     */
    protected PositionIncrementAttribute positionIncrementAttr;

    /**
     * Character offsets of the current token
     */
    private final OffsetAttribute offsetAttr;

    /**
     * Iterates over the term texts; drives the stream (no more terms means no more tokens)
     */
    protected Iterator<String> iterator;

    /**
     * Iterates over the position increments, in step with {@link #iterator}
     */
    protected Iterator<Integer> incrementIt;

    /**
     * Iterates over the start character offsets, in step with {@link #iterator}
     */
    private final Iterator<Integer> startCharIt;

    /**
     * Iterates over the end character offsets, in step with {@link #iterator}
     */
    private final Iterator<Integer> endCharIt;

    /**
     * Creates a stream over the given parallel lists.
     *
     * The lists are not copied; iterators over them are obtained here and consumed by
     * {@link #incrementToken()}.
     *
     * @param tokens the term text of each token
     * @param increments the position increment of each token
     * @param startChar the start character offset of each token
     * @param endChar the end character offset of each token
     */
    public TokenStreamWithOffsets(List<String> tokens, List<Integer> increments, List<Integer> startChar,
            List<Integer> endChar) {
        termAttr = addAttribute(CharTermAttribute.class);
        offsetAttr = addAttribute(OffsetAttribute.class);
        positionIncrementAttr = addAttribute(PositionIncrementAttribute.class);
        positionIncrementAttr.setPositionIncrement(1);

        iterator = tokens.iterator();
        incrementIt = increments.iterator();
        startCharIt = startChar.iterator();
        endCharIt = endChar.iterator();
    }

    /**
     * Advances to the next token, filling in its term text, position increment and
     * character offsets from the next entry of each list.
     *
     * @return true if a token was produced, false if the term list is exhausted
     * @throws NullPointerException if the next term is null
     * @throws java.util.NoSuchElementException if one of the integer lists has fewer
     *         entries than the term list
     */
    @Override
    public boolean incrementToken() {
        if (!iterator.hasNext()) {
            return false;
        }

        String term = iterator.next();
        if (term == null) {
            // Fail with an explicit message instead of an anonymous NPE on dereference.
            throw new NullPointerException("Token list contains a null term");
        }

        // Reset all attributes to their defaults before filling in the new token, so
        // no state from the previous token leaks into this one.
        clearAttributes();
        termAttr.setEmpty().append(term);
        positionIncrementAttr.setPositionIncrement(incrementIt.next());
        offsetAttr.setOffset(startCharIt.next(), endCharIt.next());
        return true;
    }

}