nl.inl.blacklab.filter.AbstractSynonymFilter.java Source code

Introduction

Here is the source code for nl.inl.blacklab.filter.AbstractSynonymFilter.java
Source

/*******************************************************************************
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package nl.inl.blacklab.filter;

import java.io.IOException;
import java.io.StringReader;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;

/**
 * Abstract base class for implementing synonym filters.
 *
 * This can be used for true synonyms, but also for alternative spellings for certain words, such as
 * a version with accented characters transcribed to unaccented versions.
 *
 * This filter could be applied at either index time or search time.
 *
 * Subclasses should override the getSynonyms() method, and for example use a database to find the
 * appropriate synonyms.
 *
 * (Adapted from sample code from Lucene in Action, 2nd ed.)
 */
public abstract class AbstractSynonymFilter extends TokenFilter {
    /**
     * Token type for synonyms
     */
    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    /** Include the original token, or just the synonyms? */
    private final boolean includeOriginalTokens;

    /**
     * Captured states of the synonyms of the most recently read input token that still have to be
     * emitted.
     */
    private final Stack<State> synonymStack = new Stack<State>();

    /** Term text of the current token. */
    private final CharTermAttribute termAttr;

    /**
     * Construct a synonym filter.
     *
     * @param input
     *            the input tokens to find synonyms for
     * @param includeOriginalTokens
     *            Include the original tokens, or just the synonyms?
     */
    public AbstractSynonymFilter(TokenStream input, boolean includeOriginalTokens) {
        super(input);
        this.includeOriginalTokens = includeOriginalTokens;
        termAttr = addAttribute(CharTermAttribute.class);
        // Make sure the attributes we modify for synonym tokens exist on this stream.
        addAttribute(PositionIncrementAttribute.class);
        addAttribute(TypeAttribute.class);
    }

    /**
     * Construct a synonym filter that includes the original tokens.
     *
     * @param input
     *            the input tokens to find synonyms for
     */
    public AbstractSynonymFilter(TokenStream input) {
        this(input, true);
    }

    /**
     * Small demo: tokenizes a fixed sentence on whitespace, runs it through an anonymous synonym
     * filter and prints every resulting term on its own line.
     *
     * @param args
     *            ignored
     * @throws IOException
     *             if the token stream fails
     */
    public static void main(String[] args) throws IOException {
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42, new StringReader("Dit is een test"));
        try {
            ts = new AbstractSynonymFilter(ts) {
                @Override
                public String[] getSynonyms(String s) {
                    if (s.equals("test"))
                        return new String[] { "testje" };
                    if (s.equals("is"))
                        return new String[] { "zijn" };
                    return null;
                }
            };

            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            // TokenStream workflow: reset() before the first incrementToken(), end() after the last.
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(new String(term.buffer(), 0, term.length()));
            }
            ts.end();
        } finally {
            ts.close();
        }
    }

    /**
     * Get a list of synonyms
     *
     * @param s
     *            word to get synonyms for
     * @return the synonyms; null or an empty array means there are none
     */
    public abstract String[] getSynonyms(String s);

    /**
     * Resets the input and discards any synonyms that are still waiting to be emitted, so that a
     * reused stream does not start with tokens left over from a previous run.
     *
     * @throws IOException
     *             if resetting the input fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonymStack.clear();
    }

    /**
     * Advances to the next token: a pending synonym if there is one, otherwise the next input
     * token (or, if original tokens are excluded, the first synonym of the next input token that
     * has any).
     *
     * NOTE(review): with includeOriginalTokens == false every emitted token is a synonym with
     * position increment 0 and input tokens without synonyms are dropped, so the position
     * information of the input is not carried over. Confirm this is intended.
     *
     * @return true if the attributes now describe a token, false if the input is exhausted
     * @throws IOException
     *             if reading from the input fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            // Emit pending synonyms first; a synonym is always a desired token.
            if (!synonymStack.isEmpty()) {
                restoreState(synonymStack.pop());
                return true;
            }

            if (!input.incrementToken()) {
                // We're done.
                return false;
            }

            addAliasesToStack();

            // We're at the original token now. Emit it only if originals are wanted;
            // otherwise go round again to pick up its first synonym (or the next input token).
            if (includeOriginalTokens) {
                return true;
            }
        }
    }

    /**
     * Looks up the synonyms of the current token and pushes one captured state per synonym onto
     * the stack. Each state is a copy of the current token with the term replaced by the synonym,
     * the type set to {@link #TOKEN_TYPE_SYNONYM} and a position increment of 0. The current token
     * itself is left unchanged.
     */
    private void addAliasesToStack() {
        String[] synonyms = getSynonyms(new String(termAttr.buffer(), 0, termAttr.length()));
        if (synonyms == null || synonyms.length == 0)
            return;

        // Build the synonym states on our own attributes rather than on a copy of the attributes
        // made at construction time: attributes added to the stream afterwards (e.g. by a filter
        // further down the chain) would be missing from such a copy, and restoring a captured
        // state into it would fail.
        State original = captureState();
        for (String synonym : synonyms) {
            // Only term, type and position increment are overwritten, so all other attributes
            // still hold the original token's values for every synonym.
            setTerm(this, synonym);
            setType(this, TOKEN_TYPE_SYNONYM);
            setPositionIncrement(this, 0);
            synonymStack.push(captureState());
        }
        // Put the original token back so it can be emitted as-is.
        restoreState(original);
    }

    /**
     * Sets the position increment on the given attribute source.
     *
     * @param source
     *            the attribute source to modify
     * @param posIncr
     *            the position increment to set
     */
    static void setPositionIncrement(AttributeSource source, int posIncr) {
        PositionIncrementAttribute attr = source.addAttribute(PositionIncrementAttribute.class);
        attr.setPositionIncrement(posIncr);
    }

    /**
     * Sets the term text on the given attribute source.
     *
     * @param source
     *            the attribute source to modify
     * @param term
     *            the new term text
     */
    static void setTerm(AttributeSource source, String term) {
        CharTermAttribute attr = source.addAttribute(CharTermAttribute.class);
        attr.copyBuffer(term.toCharArray(), 0, term.length());
    }

    /**
     * Sets the token type on the given attribute source.
     *
     * @param source
     *            the attribute source to modify
     * @param type
     *            the new token type
     */
    static void setType(AttributeSource source, String type) {
        TypeAttribute attr = source.addAttribute(TypeAttribute.class);
        attr.setType(type);
    }

}