com.shaie.LemmatizingTokenizerDemo.java Source code

Introduction

Here is the source code for com.shaie.LemmatizingTokenizerDemo.java.
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.shaie;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;

import com.google.common.base.Strings;

public class LemmatizingTokenizerDemo {

    /**
     * Runs the demo: analyzes a fixed text with each analyzer and prints the tokens they produce.
     *
     * <p>Each analyzer is opened in a try-with-resources statement so it is closed once its tokens have
     * been printed. The token streams themselves are handed to {@code printTokens} and are not managed
     * in this method, hence the {@code resource} warning suppression.
     *
     * @param args ignored
     * @throws Exception if analysis fails
     */
    @SuppressWarnings("resource")
    public static void main(String[] args) throws Exception {
        final String text = "cars";

        System.out.println("Stem-only analyzer");
        try (Analyzer stemOnly = new StemOnlyAnalyzer()) {
            printTokens(stemOnly.tokenStream("", new StringReader(text)));
        }

        System.out.println("--------------------");

        System.out.println("Stem-and-original analyzer");
        try (Analyzer stemAndOrig = new StemAndOrigAnalyzer()) {
            printTokens(stemAndOrig.tokenStream("", new StringReader(text)));
        }
    }

    /**
     * Consumes the given token stream to exhaustion, printing the stream's attribute state after every
     * successful {@code incrementToken()} call.
     *
     * <p>Follows the full consumer workflow: {@code reset()}, repeated {@code incrementToken()},
     * {@code end()}, and finally {@code close()}. The stream is closed even if consumption throws.
     *
     * @param tokenStream the stream to consume; it is closed by this method
     * @throws IOException if the stream fails while being consumed or closed
     */
    private static void printTokens(TokenStream tokenStream) throws IOException {
        try (TokenStream stream = tokenStream) {
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(stream);
            }
            stream.end(); // let the chain perform its end-of-stream operations before closing
        }
    }

    /**
     * Analyzer whose chain is {@link LemmatizingTokenizer} -> {@link LowerCaseFilter} ->
     * {@link LemmaTokenFilter} configured to preserve the original token, so both the lemma and the
     * original token are returned.
     */
    public static final class StemAndOrigAnalyzer extends Analyzer {

        @SuppressWarnings("resource")
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new LemmatizingTokenizer();
            final TokenStream lowerCased = new LowerCaseFilter(source);
            // 'true' => the filter also emits the original token in addition to the lemma.
            final TokenStream lemmatized = new LemmaTokenFilter(lowerCased, true);
            return new TokenStreamComponents(source, lemmatized);
        }

    }

    /**
     * Analyzer whose chain is {@link LemmatizingTokenizer} -> {@link LowerCaseFilter} ->
     * {@link LemmaTokenFilter} configured not to preserve the original token, so a token that has a
     * lemma is replaced by that lemma.
     */
    public static final class StemOnlyAnalyzer extends Analyzer {

        @SuppressWarnings("resource")
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new LemmatizingTokenizer();
            final TokenStream lowerCased = new LowerCaseFilter(source);
            // 'false' => the original token is dropped whenever a lemma is available.
            final TokenStream lemmatized = new LemmaTokenFilter(lowerCased, false);
            return new TokenStreamComponents(source, lemmatized);
        }

    }

    /**
     * Demo tokenizer that never reads its input and instead emits a fixed sequence of tokens.
     *
     * <p>For every token it sets the term text, a position increment of 1 and the offsets, and publishes
     * the token text together with its lemma (possibly {@code null}) through {@link DocRefAttribute}.
     * Emitting the lemma as a term is left to downstream filters such as {@link LemmaTokenFilter}.
     */
    public static final class LemmatizingTokenizer extends Tokenizer {

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final DocRefAttribute docRefAtt = addAttribute(DocRefAttribute.class);
        private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        /** The fixed tokens to emit, in order. */
        private final String[] tokens = { "cars", "not_stemmed" };
        /** Lemma for the token at the same index; {@code null} when the token has no lemma. */
        private final String[] lemmas = { "car", null };
        /** Start offset for the token at the same index. */
        private final int[] startOffsets = { 0, 4 };

        /** Index of the next token to emit. */
        private int idx = 0;

        /**
         * Advances to the next fixed token.
         *
         * @return {@code true} if a token was produced, {@code false} once all tokens were emitted
         */
        @Override
        public boolean incrementToken() throws IOException {
            // Start every token from a clean slate so nothing leaks over from the previous token.
            clearAttributes();

            // The bounds check must come before any array access: previously lemmas[idx] was read
            // first, which threw ArrayIndexOutOfBoundsException at end of stream instead of
            // returning false.
            if (idx >= tokens.length) { // no more tokens
                return false;
            }

            final String token = tokens[idx];
            final int start = startOffsets[idx];

            posIncrAtt.setPositionIncrement(1);
            termAtt.setEmpty().append(token);
            offsetAtt.setOffset(start, start + token.length());

            docRefAtt.setToken(token);
            docRefAtt.setLemma(lemmas[idx]); // may be null: some tokens do not have a lemma

            ++idx;
            return true;
        }

        /** Rewinds the tokenizer so the fixed token sequence is emitted again from the start. */
        @Override
        public void reset() throws IOException {
            super.reset();
            idx = 0;
        }
    }

    /**
     * Filter that replaces the term of each incoming token with the lemma found in
     * {@link DocRefAttribute}, when there is one.
     *
     * <ul>
     * <li>Token with a lemma: the lemma is returned first; if {@code preserveOriginalToken} is set, the
     * original token follows with a position increment of 0.</li>
     * <li>Token without a lemma (null or empty): the original token, as stored in
     * {@link DocRefAttribute#token()}, is returned.</li>
     * </ul>
     */
    static final class LemmaTokenFilter extends TokenFilter {

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
        private final DocRefAttribute docRefAtt = addAttribute(DocRefAttribute.class);

        private final boolean preserveOriginalToken;

        /** Whether the lemma of the current input token has been returned. */
        private boolean lemmaEmitted = false;
        /** Whether the original form of the current input token has been returned (or must be skipped). */
        private boolean originalEmitted = false;
        /** Whether at least one token has been consumed from the input since the last reset. */
        private boolean tokenConsumed = false;

        /**
         * @param input the stream to filter
         * @param preserveOriginalToken whether the original token is also returned after its lemma
         */
        public LemmaTokenFilter(TokenStream input, boolean preserveOriginalToken) {
            super(input);
            this.preserveOriginalToken = preserveOriginalToken;
        }

        @Override
        public boolean incrementToken() throws IOException {
            // An original token is still owed for the current input token: emit it before advancing.
            final boolean originalPending = tokenConsumed && !originalEmitted;
            if (originalPending) {
                return emitOriginalToken();
            }

            // Advance the underlying stream; stop when it is exhausted.
            if (!input.incrementToken()) {
                return false;
            }
            tokenConsumed = true;

            final String lemma = docRefAtt.lemma();
            if (Strings.isNullOrEmpty(lemma)) {
                // No lemma for this token: fall straight through to the original token.
                lemmaEmitted = false;
                originalEmitted = false;
                return emitOriginalToken();
            }

            // Lemma goes out first; the original token stays pending only if it must be preserved.
            termAtt.setEmpty().append(lemma);
            lemmaEmitted = true;
            originalEmitted = !preserveOriginalToken;
            return true;
        }

        /**
         * Sets the term to the original token text; when the lemma was already returned for this token,
         * the original is placed at the same position (position increment 0).
         */
        private boolean emitOriginalToken() {
            termAtt.setEmpty().append(docRefAtt.token());
            if (lemmaEmitted) {
                posIncrAtt.setPositionIncrement(0);
            }
            originalEmitted = true;
            return true;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            tokenConsumed = false;
            lemmaEmitted = false;
            originalEmitted = false;
        }
    }

    public static interface DocRefAttribute extends Attribute {
        public String token();

        public void setToken(String token);

        public String lemma();

        public void setLemma(String lemma);

        public void clear();
    }

    /** Field-backed implementation of {@link DocRefAttribute} holding a token and its lemma. */
    public static final class DocRefAttributeImpl extends AttributeImpl implements DocRefAttribute {

        private String token;
        private String lemma;

        // --- DocRefAttribute accessors ---

        @Override
        public String token() {
            return this.token;
        }

        @Override
        public void setToken(String token) {
            this.token = token;
        }

        @Override
        public String lemma() {
            return this.lemma;
        }

        @Override
        public void setLemma(String lemma) {
            this.lemma = lemma;
        }

        // --- AttributeImpl plumbing ---

        /** Resets both the token and the lemma to {@code null}. */
        @Override
        public void clear() {
            this.token = null;
            this.lemma = null;
        }

        /** Copies the token and lemma into {@code target}, which is cast to {@link DocRefAttribute}. */
        @Override
        public void copyTo(AttributeImpl target) {
            final DocRefAttribute destination = (DocRefAttribute) target;
            destination.setToken(this.token);
            destination.setLemma(this.lemma);
        }

        /** Reports the token and lemma to the reflector under the keys "token" and "lemma". */
        @Override
        public void reflectWith(AttributeReflector reflector) {
            reflector.reflect(DocRefAttribute.class, "token", this.token);
            reflector.reflect(DocRefAttribute.class, "lemma", this.lemma);
        }

        @Override
        public String toString() {
            return new StringBuilder("DocRefAttribute token=")
                    .append(this.token)
                    .append(", lemma=")
                    .append(this.lemma)
                    .toString();
        }
    }

}