org.apache.lucene.analysis.opennlp.OpenNLPTokenizer.java Source code

Introduction

Here is the source code for org.apache.lucene.analysis.opennlp.OpenNLPTokenizer.java.
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.opennlp;

import java.io.IOException;

import opennlp.tools.util.Span;

import org.apache.lucene.analysis.opennlp.tools.NLPSentenceDetectorOp;
import org.apache.lucene.analysis.opennlp.tools.NLPTokenizerOp;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
import org.apache.lucene.util.AttributeFactory;

/**
 * Tokenizer that runs an OpenNLP sentence detector followed by an OpenNLP tokenizer.
 *
 * <p>Sentence boundaries come from the {@link OpenNLPSentenceBreakIterator} handed to the
 * superclass; each sentence is then split into terms by the supplied {@link NLPTokenizerOp}.
 * The last token in each sentence is marked by setting the {@link #EOS_FLAG_BIT} in the
 * {@link FlagsAttribute}; following filters can use this information to apply operations to
 * tokens one sentence at a time.
 */
public final class OpenNLPTokenizer extends SegmentingTokenizerBase {
    /** Flag bit OR-ed into the {@link FlagsAttribute} of the last token of every sentence. */
    public static final int EOS_FLAG_BIT = 1;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /** Term spans of the current sentence, relative to the sentence start; {@code null} before the first sentence. */
    private Span[] termSpans = null;
    /** Index into {@link #termSpans} of the next term to emit. */
    private int termNum = 0;
    /** Start of the current sentence within {@code buffer}. */
    private int sentenceStart = 0;

    /** Sentence detector; also wrapped by the break iterator passed to the superclass. */
    private final NLPSentenceDetectorOp sentenceOp;
    /** Tokenizer applied to the text of each sentence. */
    private final NLPTokenizerOp tokenizerOp;

    /**
     * Creates a tokenizer backed by the given OpenNLP operations.
     *
     * @param factory     attribute factory forwarded to the superclass
     * @param sentenceOp  sentence detector; must not be {@code null}
     * @param tokenizerOp tokenizer used on each sentence; must not be {@code null}
     * @throws IllegalArgumentException if {@code sentenceOp} or {@code tokenizerOp} is {@code null}
     * @throws IOException declared for callers; not thrown by the code in this constructor itself
     */
    public OpenNLPTokenizer(AttributeFactory factory, NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp)
            throws IOException {
        // Validate inside the super(...) argument so that a null operation is rejected
        // before the break iterator is ever constructed from it.
        super(factory, new OpenNLPSentenceBreakIterator(requireOps(sentenceOp, tokenizerOp)));
        this.sentenceOp = sentenceOp;
        this.tokenizerOp = tokenizerOp;
    }

    /**
     * Checks that both operations are present.
     *
     * @return {@code sentenceOp}, so the call can be used inline as a constructor argument
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    private static NLPSentenceDetectorOp requireOps(NLPSentenceDetectorOp sentenceOp, NLPTokenizerOp tokenizerOp) {
        if (sentenceOp == null || tokenizerOp == null) {
            throw new IllegalArgumentException(
                    "OpenNLPTokenizer: both a Sentence Detector and a Tokenizer are required");
        }
        return sentenceOp;
    }

    /** Closes the superclass and discards the per-sentence state. */
    @Override
    public void close() throws IOException {
        super.close();
        clearSentenceState();
    }

    /**
     * Records the start of the next sentence and tokenizes its text.
     *
     * @param sentenceStart index in {@code buffer} of the first character of the sentence
     * @param sentenceEnd   index in {@code buffer} one past the last character of the sentence
     */
    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
        this.sentenceStart = sentenceStart;
        String sentenceText = new String(buffer, sentenceStart, sentenceEnd - sentenceStart);
        termSpans = tokenizerOp.getTerms(sentenceText);
        termNum = 0;
    }

    /**
     * Emits the next term of the current sentence, if any.
     *
     * @return {@code true} if a term was written to the attributes, {@code false} when the
     *         current sentence has no more terms (or no sentence has been set yet)
     */
    @Override
    protected boolean incrementWord() {
        if (termSpans == null || termNum >= termSpans.length) {
            return false;
        }
        clearAttributes();
        Span term = termSpans[termNum];
        // Spans are relative to the sentence text, so shift them by the sentence start.
        int termStart = sentenceStart + term.getStart();
        int termEnd = sentenceStart + term.getEnd();
        termAtt.copyBuffer(buffer, termStart, term.length());
        offsetAtt.setOffset(correctOffset(offset + termStart), correctOffset(offset + termEnd));
        if (termNum == termSpans.length - 1) {
            // mark the last token in the sentence with EOS_FLAG_BIT
            flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT);
        }
        ++termNum;
        return true;
    }

    /** Resets the superclass and discards the per-sentence state. */
    @Override
    public void reset() throws IOException {
        super.reset();
        clearSentenceState();
    }

    /** Forgets the current sentence so that nothing is emitted until a new sentence is set. */
    private void clearSentenceState() {
        termSpans = null;
        termNum = 0;
        sentenceStart = 0;
    }
}