Example usage for the org.apache.lucene.analysis.Token constructor

List of usage examples for the org.apache.lucene.analysis.Token constructor

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.Token constructor.

Prototype

public Token(CharSequence text, int posInc, int start, int end) 

Source Link

Document

Constructs a Token with the given term text, position increment, start and end offsets

Usage

From source file:brazilianStemmer.BrazilianStemFilter.java

License:Apache License

/**
 * Pulls the next token from the wrapped input and replaces its term with
 * the stemmed form when stemming changes it.
 *
 * @return the next token in the stream (stemmed unless it is excluded or
 *         stemming leaves it unchanged), or null at EOS.
 * @throws IOException propagated from the wrapped input stream
 */
public final Token next() throws IOException {
    token = input.next();
    if (token == null) {
        return null;
    }

    // Terms listed in the exclusion table pass through untouched.
    if (exclusions != null && exclusions.contains(token.termText())) {
        return token;
    }

    final String stemmed = stemmer.stem(token.termText());

    // Only allocate a replacement token when stemming produced a different term.
    final boolean changed = (stemmed != null) && !stemmed.equals(token.termText());
    return changed
            ? new Token(stemmed, token.startOffset(), token.endOffset(), token.type())
            : token;
}

From source file:com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticFilter.java

License:Open Source License

/**
 * Returns the next token in the stream, or <code>null</code> at EOF.
 *
 * <p>With no encoder array the input is passed through unchanged. With a
 * single encoder each input token yields one token carrying the generated
 * key. With several encoders each input token is emitted once per encoder
 * slot, the key being prefixed with the encoder's string form and a colon;
 * a <code>null</code> encoder slot yields the unmodified input token.
 *
 * @throws IOException propagated from the wrapped input stream
 */
public Token next() throws IOException {
    // No encoders configured: plain pass-through.
    if (encoder == null) {
        return input.next();
    }

    // Fast path: exactly one encoder, so one output token per input token.
    if (encoder.length == 1) {
        final Token source = input.next();
        if (source == null) {
            return null;
        }
        if (encoder[0] == null) {
            return source;
        }
        return new Token(encoder[0].generateKey(source.termText()), source.startOffset(),
                source.endOffset(), source.type());
    }

    // General case: step to the next encoder slot for the current token.
    actualIndex++;

    // All slots used for the current token: fetch the next one.
    if (actualIndex >= encoder.length) {
        actualToken = input.next();
        if (actualToken == null) {
            // Keep the index past the end so a later call reads the input again.
            actualIndex = encoder.length;
            return null;
        }
        actualIndex = 0;
    }

    if (encoder[actualIndex] == null) {
        return actualToken;
    }

    final String key = encoder[actualIndex].toString() + ":"
            + encoder[actualIndex].generateKey(actualToken.termText());
    return new Token(key, actualToken.startOffset(), actualToken.endOffset(), actualToken.type());
}

From source file:com.globalsight.ling.lucene.analysis.cjk.CJKTokenizer.java

License:Apache License

/**
 * Returns the next token in the stream, or null at EOS.
 * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
 * for detail.
 *
 * <p>Runs of BASIC_LATIN / full-width letters, digits and '_', '+', '#'
 * are emitted lower-cased as one "single" token (capped at MAX_WORD_LEN
 * characters). Other letters are emitted as overlapping two-character
 * "double" tokens: after a pair is emitted the reader steps back one
 * character so that the second character starts the next pair.
 *
 * <p>An empty token is never returned: when the only buffered character
 * was already emitted as the tail of the previous pair, it is dropped and
 * tokenizing continues (or null is returned at end of input).
 *
 * @return Token the next token, or null when the input is exhausted
 *
 * @throws IOException - throw IOException when read error
 * happened in the InputStream
 */
public final Token next() throws IOException {
    // how many character(s) have been stored in buffer
    int length = 0;

    // the position used to create the Token
    int start = offset;

    while (true) {
        // current character
        char c;

        // unicode block of the current character
        Character.UnicodeBlock ub;

        offset++;

        // refill the I/O buffer once it has been consumed
        if (bufferIndex >= dataLen) {
            dataLen = input.read(ioBuffer);
            bufferIndex = 0;
        }

        if (dataLen == -1) {
            // end of input
            if (length > 0) {
                if (preIsTokened) {
                    // the buffered character was already emitted as the
                    // second half of the previous pair; drop it
                    length = 0;
                    preIsTokened = false;
                }

                break;
            } else {
                return null;
            }
        } else {
            // get current character
            c = ioBuffer[bufferIndex++];

            // get the UnicodeBlock of the current character
            ub = Character.UnicodeBlock.of(c);
        }

        // if the current character is ASCII or Extend ASCII
        if ((ub == Character.UnicodeBlock.BASIC_LATIN)
                || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)) {
            if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                // convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN;
                // 65248 (0xFEE0) is the distance from U+FF01 to U+0021
                int i = (int) c;
                i = i - 65248;
                c = (char) i;
            }

            // if the current character is a letter, a digit or "_" "+" "#"
            if (Character.isLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#'))) {
                if (length == 0) {
                    // "javaC1C2C3C4linux"
                    //      ^--: the current character begins an ASCII token
                    start = offset - 1;
                } else if ("double".equals(tokenType)) {
                    // "javaC1C2C3C4linux"
                    //              ^--: the previous character is non-ASCII,
                    // the current one is ASCII: push it back and flush
                    offset--;
                    bufferIndex--;
                    tokenType = "single";

                    if (preIsTokened) {
                        // only one non-ASCII character is buffered and it
                        // has already been emitted; discard it
                        length = 0;
                        preIsTokened = false;
                    }

                    break;
                }

                // store the LowerCase(c) in the buffer
                buffer[length++] = Character.toLowerCase(c);
                tokenType = "single";

                // break the procedure if the buffer is full
                if (length == MAX_WORD_LEN) {
                    break;
                }
            } else if (length > 0) {
                if (preIsTokened) {
                    length = 0;
                    preIsTokened = false;
                } else {
                    break;
                }
            }
        } else {
            // non-ASCII letter, eg."C1C2C3C4"
            if (Character.isLetter(c)) {
                if (length == 0) {
                    start = offset - 1;
                    buffer[length++] = c;
                    tokenType = "double";
                } else {
                    if ("single".equals(tokenType)) {
                        offset--;
                        bufferIndex--;

                        // return the previous ASCII characters
                        break;
                    } else {
                        buffer[length++] = c;
                        tokenType = "double";

                        if (length == 2) {
                            // step back so the second character also
                            // starts the next pair
                            offset--;
                            bufferIndex--;
                            preIsTokened = true;

                            break;
                        }
                    }
                }
            } else if (length > 0) {
                if (preIsTokened) {
                    // empty the buffer
                    length = 0;
                    preIsTokened = false;
                } else {
                    break;
                }
            }
        }
    }

    if (length > 0) {
        return new Token(new String(buffer, 0, length), start, start + length, tokenType);
    }

    if (dataLen != -1) {
        // The buffered character was discarded but input remains: do not
        // emit an empty token, continue with the next one instead.
        return next();
    }

    // The buffered character was discarded at end of input.
    return null;
}

From source file:com.globalsight.ling.lucene.analysis.de.GermanStemFilter.java

License:Apache License

/**
 * Fetches the next token and swaps its text for the stem when stemming
 * changes it. Tokens whose text is in the exclusion set are returned as is.
 *
 * @return the next token in the stream, or null at EOS
 * @throws IOException declared for the underlying token source
 */
public final Token next() throws IOException {
    token = getNextToken();
    if (token == null) {
        return null;
    }

    // Excluded terms bypass the stemmer entirely.
    final boolean excluded = exclusionSet != null && exclusionSet.contains(token.toString());
    if (excluded) {
        return token;
    }

    final String stem = stemmer.stem(token.toString());
    if (stem.equals(token.toString())) {
        // Stemming was a no-op, so reuse the existing token.
        return token;
    }

    return new Token(stem, token.startOffset(), token.endOffset(), token.type());
}

From source file:com.globalsight.ling.lucene.analysis.fr.FrenchStemFilter.java

License:Apache License

/**
 * Fetches the next token and replaces its text with the stem when stemming
 * changes it. Tokens whose text is in the exclusion table are returned
 * unchanged.
 *
 * <p>A stemmed token keeps the start and end offsets of the token it
 * replaces, so it still refers to the original span of the source text.
 *
 * @return the next token in the stream, or null at EOS
 * @throws IOException declared for the underlying token source
 */
public final Token next() throws IOException {
    token = getNextToken();

    if (token == null) {
        return null;
    }
    // Check the exclusion table
    else if (exclusions != null && exclusions.contains(token.toString())) {
        return token;
    } else {
        String s = stemmer.stem(token.toString());
        // If not stemmed, don't waste the time creating a new token
        if (!s.equals(token.toString())) {
            // Offsets describe the position in the source text, not in the
            // stem, so they are copied from the original token.
            return new Token(s, token.startOffset(), token.endOffset(), token.type());
        }

        return token;
    }
}

From source file:com.globalsight.ling.lucene.analysis.ngram.NgramTokenizer.java

License:Apache License

/**
 * Returns the next token in the stream, or null at EOS.
 *
 * <p>On the first call the whole input is read into a buffer. Each call
 * then emits the next window of m_ngram characters, advancing the window
 * by one character. Tokens have type "ngram"; the start offset is the
 * window position and the end offset is one past the window's last
 * character, so that end - start equals the n-gram length.
 *
 * @return Token the next n-gram, or null when the input is shorter than
 * one n-gram or no full window remains
 * @throws IOException - throw IOException when read error
 * happened in the InputStream
 */
public final Token next() throws IOException {
    // First time around, read the entire input.
    if (m_buffer == null) {
        m_buffer = fillBuffer();

        // If input is too short for a full ngram, return null.
        if (m_buffer.length() < m_ngram) {
            m_offset = m_buffer.length();
            return null;
        }
    }

    // Exclusive end of the current window.
    final int end = m_offset + m_ngram;
    if (end > m_buffer.length()) {
        return null;
    }

    // The end offset is exclusive, matching the substring bounds.
    final Token result = new Token(m_buffer.substring(m_offset, end), m_offset, end, "ngram");

    ++m_offset;

    return result;
}

From source file:com.globalsight.ling.lucene.analysis.nl.DutchStemFilter.java

License:Apache License

/**
 * Produces the next token, stemmed when the stemmer alters its text and
 * the text is not listed in the exclusion table.
 *
 * @return Returns the next token in the stream, or null at EOS
 * @throws IOException declared for the underlying token source
 */
public Token next() throws IOException {
    token = getNextToken();
    if (token == null) {
        return null;
    }

    // Only terms outside the exclusion table are candidates for stemming.
    if (exclusions == null || !exclusions.contains(token.toString())) {
        final String stemmedText = stemmer.stem(token.toString());

        // A new token is only built when the stem differs from the input.
        if (!stemmedText.equals(token.toString())) {
            return new Token(stemmedText, token.startOffset(), token.endOffset(), token.type());
        }
    }

    return token;
}

From source file:com.globalsight.ling.lucene.analysis.pl.PolishFilter.java

License:Apache License

/**
 * Returns the next input Token, after being stemmed.
 *
 * @return the stemmed token (or the input token itself when stemming does
 *         not change its text), or null when there is no further token
 * @throws IOException declared for the underlying token source
 */
public final Token next() throws IOException {
    final Token source = getNextToken();
    if (source == null) {
        return null;
    }

    final String stem = stemmer.stem(source.toString(), true);
    if (stem.equals(source.toString())) {
        return source;
    }

    // Rebuild the token around the stemmed text, carrying over offsets,
    // type and position increment from the input token.
    final Token stemmedToken = new Token(stem, source.startOffset(), source.endOffset(), source.type());
    stemmedToken.setPositionIncrement(source.getPositionIncrement());
    return stemmedToken;
}

From source file:com.globalsight.ling.lucene.analysis.pt_br.BrazilianStemFilter.java

License:Apache License

/**
 * Fetches the next token and replaces its text with the stem when the
 * stemmer returns a non-null, different term. Tokens whose text is in the
 * exclusion table are returned unchanged.
 *
 * <p>A stemmed token keeps the start and end offsets of the token it
 * replaces, so it still refers to the original span of the source text.
 *
 * @return Returns the next token in the stream, or null at EOS.
 * @throws IOException declared for the underlying token source
 */
public final Token next() throws IOException {
    token = getNextToken();

    if (token == null) {
        return null;
    }
    // Check the exclusion table.
    else if (exclusions != null && exclusions.contains(token.toString())) {
        return token;
    } else {
        String s = stemmer.stem(token.toString());

        // If not stemmed, don't waste the time creating a new token.
        if (s != null && !s.equals(token.toString())) {
            // Offsets describe the position in the source text, not in the
            // stem, so they are copied from the original token.
            return new Token(s, token.startOffset(), token.endOffset(), token.type());
        }

        return token;
    }
}

From source file:com.globalsight.ling.lucene.analysis.ru.RussianStemFilter.java

License:Apache License

/**
 * Fetches the next token and, when stemming changes its text, returns a
 * new token holding the stem with the original offsets and type.
 *
 * @return Returns the next token in the stream, or null at EOS
 * @throws IOException declared for the underlying token source
 */
public final Token next() throws IOException {
    token = getNextToken();
    if (token == null) {
        return null;
    }

    final String stemmedTerm = stemmer.stem(token.toString());
    final boolean unchanged = stemmedTerm.equals(token.toString());

    // Reuse the incoming token when the stemmer left the text as it was.
    return unchanged
            ? token
            : new Token(stemmedTerm, token.startOffset(), token.endOffset(), token.type());
}