com.twitter.common.text.token.TwitterTokenStream.java Source code

Java tutorial

Introduction

Here is the source code for com.twitter.common.text.token.TwitterTokenStream.java

Source

// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================

package com.twitter.common.text.token;

import java.util.List;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeSource;

import com.twitter.common.text.example.TokenizerUsageExample;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;

/**
 * Abstraction to enumerate a sequence of tokens. This class represents the central abstraction in
 * Twitter's text processing library, and is similar to Lucene's TokenStream, with the following
 * exceptions:
 *
 * <ul>
 * <li>This class assumes that the input text is a {@link CharSequence}.
 * <li>Calls support chaining.
 * <li>Instances are reusable.
 * </ul>
 *
 * For an annotated example of how this class is used in practice, refer to
 * {@link TokenizerUsageExample}.
 */
public abstract class TwitterTokenStream extends TokenStream {
    private final CharSequenceTermAttribute termAttribute = addAttribute(CharSequenceTermAttribute.class);
    private final TokenTypeAttribute typeAttribute = addAttribute(TokenTypeAttribute.class);

    /**
     * Constructs a {@code TwitterTokenStream} using the default attribute factory.
     */
    public TwitterTokenStream() {
        super();
    }

    /**
     * Constructs a {@code TwitterTokenStream} using the supplied {@code AttributeFactory} for creating new
     * {@code Attribute} instances.
     *
     * @param factory attribute factory
     */
    protected TwitterTokenStream(AttributeSource.AttributeFactory factory) {
        super(factory);
    }

    /**
     * Constructs a {@code TwitterTokenStream} that uses the same attributes as the supplied one.
     *
     * @param input attribute source
     */
    protected TwitterTokenStream(AttributeSource input) {
        super(input);
    }

    /**
     * Consumers call this method to advance the stream to the next token.
     *
     * @return false for end of stream; true otherwise
     */
    public abstract boolean incrementToken();

    /**
     * Resets this {@code TwitterTokenStream} (and also downstream tokens if they exist) to parse a new
     * input.
     */
    public void reset(CharSequence input) {
        updateInputCharSequence(input);
        reset();
    };

    /**
     * Subclasses should implement reset() to reinitiate the processing.
     * Input CharSequence is available as inputCharSequence().
     */
    public abstract void reset();

    /**
     * Converts this token stream into a list of {@code Strings}.
     *
     * @return the contents of the token stream as a list of {@code Strings}.
     */
    public List<String> toStringList() {
        List<String> tokens = Lists.newArrayList();

        while (incrementToken()) {
            tokens.add(term().toString());
        }

        return tokens;
    }

    /**
     * Searches and returns an instance of a specified class in this TwitterTokenStream chain.
     *
     * @param cls class to search for
     * @return instance of the class {@code cls} if found or {@code null} if not found
     */
    public <T extends TwitterTokenStream> T getInstanceOf(Class<T> cls) {
        Preconditions.checkNotNull(cls);
        if (cls.isInstance(this)) {
            return cls.cast(this);
        }
        return null;
    }

    /**
     * Returns the offset of the current token.
     *
     * @return offset of the current token.
     */
    public int offset() {
        return termAttribute.getOffset();
    }

    /**
     * Returns the length of the current token.
     *
     * @return length of the current token.
     */
    public int length() {
        return termAttribute.getLength();
    }

    /**
     * Returns the {@code CharSequence} of the current token.
     *
     * @return {@code CharSequence} of the current token
     */
    public CharSequence term() {
        return termAttribute.getTermCharSequence();
    }

    /**
     * Returns the input {@code CharSequence}.
     *
     * @return input {@code CharSequence}
     */
    public CharSequence inputCharSequence() {
        return termAttribute.getCharSequence();
    }

    /**
     * Returns the type of the current token.
     *
     * @return type of the current token.
     */
    public TokenType type() {
        return typeAttribute.getType();
    }

    /**
     * Sets the input {@code CharSequence}.
     *
     * @param inputCharSequence {@code CharSequence} analyzed by this
     *     {@code TwitterTokenStream}
     */
    protected void updateInputCharSequence(CharSequence inputCharSequence) {
        termAttribute.setCharSequence(inputCharSequence);
    }

    /**
     * Updates the offset and length of the current token.
     *
     * @param offset new offset
     * @param length new length
     */
    protected void updateOffsetAndLength(int offset, int length) {
        termAttribute.setOffset(offset);
        termAttribute.setLength(length);
    }

    /**
     * Updates the type of the current token.
     *
     * @param type new type
     */
    protected void updateType(TokenType type) {
        typeAttribute.setType(type);
    }

    @Override
    public boolean equals(Object target) {
        // Lucene's AttributeSource.equals() returns true if this has the same
        // set of attributes as the target one. Let's make it more strict.
        return this == target;
    }

    @Override
    public int hashCode() {
        return System.identityHashCode(this);
    }
}