Example usage for org.apache.lucene.analysis Token clear

List of usage examples for org.apache.lucene.analysis Token clear

Introduction

In this page you can find the example usage for org.apache.lucene.analysis Token clear.

Prototype

@Override
public void clear() 

Source Link

Document

Resets the term text, payload, flags, positionIncrement, positionLength, startOffset, endOffset and token type to default.

Usage

From source file: analysis.StandardTokenizer.java

License: Apache License

/**
 * Returns the next token from the underlying scanner, or {@code null} at
 * end of input.
 *
 * <p>Tokens of type HOST, NUM or ALPHANUM are additionally split by a
 * {@code LetterDigitBreakTokenizer}; the resulting sub-tokens are queued in
 * {@code tokenList} and drained one per call before the scanner is advanced
 * again.
 *
 * @param reusableToken token instance to (re)populate; must not be null
 * @return the next token, or {@code null} when the scanner is exhausted
 * @throws IOException if the underlying reader fails
 */
@SuppressWarnings("deprecation")
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
    Token result = reusableToken; // default result; may be replaced by a queued sub-token below
    // Drain sub-tokens queued by a previous call before scanning further.
    if (tokenList.size() > 0)
        return tokenList.remove();
    while (true) {
        int tokenType = scanner.getNextToken();

        if (tokenType == StandardTokenizerImpl.YYEOF) {
            return null;
        }

        if (scanner.yylength() <= maxTokenLength) {
            reusableToken.clear();
            reusableToken.setPositionIncrement(posIncr);
            scanner.getText(reusableToken);
            final int start = scanner.yychar();
            reusableToken.setStartOffset(start);
            reusableToken.setEndOffset(start + reusableToken.termLength());
            // This 'if' should be removed in the next release. For now, it
            // converts invalid acronyms to HOST. When removed, only the
            // 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
                if (replaceInvalidAcronym) {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    reusableToken.setTermLength(reusableToken.termLength() - 1); // remove the extra '.'
                    tokenType = StandardTokenizerImpl.HOST;
                } else {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                    tokenType = StandardTokenizerImpl.ACRONYM;
                }
            } else {

                reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }

            // Split compound alphanumeric terms into sub-tokens; offsets are
            // shifted back into the coordinate space of the original input.
            if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM
                    || tokenType == StandardTokenizerImpl.ALPHANUM) {

                Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
                Token tk = null;
                int st = reusableToken.startOffset();
                final Token token = new Token();
                while ((tk = lt.next(token)) != null) {
                    tk.setStartOffset(tk.startOffset() + st);
                    tk.setEndOffset(tk.endOffset() + st);
                    tk.setType(reusableToken.type());
                    tokenList.add((Token) tk.clone());
                }
            }
            // Return the first queued sub-token (if any); the remainder are
            // served by later calls via the tokenList fast path above.
            if (tokenList.size() > 0)
                result = tokenList.remove();

            return result;
        } else
            // When we skip a too-long term, we still increment the
            // position increment
            posIncr++;
    }
}

From source file: au.edu.unimelb.csse.analyser.String2NodesParser.java

License: Apache License

/**
 * Advances to the next parsed node and fills the reusable token with that
 * node's name and payload.
 *
 * @param reusableToken token instance to populate; must not be null
 * @return the populated token, or null once every node has been emitted
 * @throws IOException declared by the tokenizer contract; not thrown here
 */
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    nodesPosition++;
    if (nodesPosition >= nodes.size()) {
        return null; // all nodes have been consumed
    }
    final Node current = nodes.get(nodesPosition);
    reusableToken.clear();
    reusableToken.setTermBuffer(current.name);
    reusableToken.setPayload(current.getPayload());
    return reusableToken;
}

From source file: com.zb.mmseg.analysis.TokenUtils.java

License: Open Source License

/**
 * @param input/*from  w w  w  . j av a  2  s.  co m*/
 * @param reusableToken is null well new one auto.
 * @return null - if not next token or input is null.
 * @throws IOException
 */
public static Token nextToken(TokenStream input, Token reusableToken) throws IOException {
    if (input == null) {
        return null;
    }
    if (!input.incrementToken()) {
        return null;
    }

    CharTermAttribute termAtt = input.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = input.getAttribute(TypeAttribute.class);

    if (reusableToken == null) {
        reusableToken = new Token();
    }

    reusableToken.clear();
    if (termAtt != null) {
        // lucene 3.0
        // reusableToken.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
        // lucene 3.1
        reusableToken.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    }
    if (offsetAtt != null) {
        // lucene 3.1
        // reusableToken.setStartOffset(offsetAtt.startOffset());
        // reusableToken.setEndOffset(offsetAtt.endOffset());
        // lucene 4.0
        reusableToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    }

    if (typeAtt != null) {
        reusableToken.setType(typeAtt.type());
    }

    return reusableToken;
}

From source file: org.apache.jackrabbit.core.query.lucene.SingletonTokenStream.java

License: Apache License

/**
 * Emits the single stored value as a token exactly once; every subsequent
 * call returns null.
 *
 * {@inheritDoc}
 */
public Token next(Token reusableToken) throws IOException {
    final String term = value;
    if (term == null) {
        return null; // the one token was already emitted
    }
    // Consume the value so this stream yields exactly one token.
    value = null;
    reusableToken.clear();
    reusableToken.setTermBuffer(term);
    reusableToken.setStartOffset(0);
    reusableToken.setEndOffset(term.length());
    reusableToken.setPayload(payload);
    return reusableToken;
}

From source file: org.dutir.tokenizer.CharTokenizer.java

License: Apache License

/**
 * Scans the input buffer for the next run of token characters and fills
 * {@code token} with the normalized term text and its source offsets.
 *
 * @param token token instance to (re)populate
 * @return the populated token, or {@code null} when the buffer is exhausted
 * @throws IOException declared by the tokenizer contract
 */
public final Token next(Token token) throws IOException {
    if (bufferIndex >= dataLen) {
        return null;
    } // input buffer exhausted
    token.clear();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = token.termBuffer();
    while (true) {
        if (bufferIndex >= dataLen) {
            break;
        }
        final char c = ioBuffer[bufferIndex++];

        if (isTokenChar(c)) { // if it's a token char

            if (length == 0) // start of token
                start = offset + bufferIndex - 1;
            else if (length == buffer.length)
                // Grow the term buffer when the current run outruns it.
                buffer = token.resizeTermBuffer(1 + length);

            buffer[length++] = normalize(c); // buffer it, normalized

            if (length == MAX_WORD_LEN) // buffer overflow!
                break;

        } else if (length > 0) // at non-Letter w/ chars
            break; // return 'em
    }

    token.setTermLength(length);
    token.setStartOffset(start);
    token.setEndOffset(start + length);
    if (stemTag) {
        // Stemming hook left disabled in the original source:
        //         token.setTermText(PorterStemmer.stem(token.termText()));

    }
    //      System.out.println("entoken:" + token);
    return token;
}

From source file: org.sindice.solr.plugins.analysis.CustomStandardTokenizer.java

License: Apache License

/**
 * Returns the next token produced by the scanner, or null at end of input.
 * Terms longer than {@code maxTokenLength} are skipped, but each skipped
 * term still advances the position increment of the next emitted token.
 *
 * @param result token instance to (re)populate
 * @return the populated token, or null when the scanner is exhausted
 * @throws IOException if the underlying reader fails
 */
public Token next(Token result) throws IOException {
    int positionIncrement = 1;

    while (true) {
        final int tokenType = scanner.getNextToken();
        if (tokenType == CustomStandardTokenizerImpl.YYEOF) {
            return null;
        }

        if (scanner.yylength() > maxTokenLength) {
            // Skip the over-long term but remember the hole it leaves.
            positionIncrement++;
            continue;
        }

        result.clear();
        result.setPositionIncrement(positionIncrement);
        scanner.getText(result);
        final int start = scanner.yychar();
        result.setStartOffset(start);
        result.setEndOffset(start + result.termLength());
        result.setType(CustomStandardTokenizerImpl.TOKEN_TYPES[tokenType]);
        return result;
    }
}

From source file: org.xerela.provider.configstore.ZLuceneTokenizer.java

License: Mozilla Public License

/**
 * Reads the next token from the reader, treating any character flagged in
 * {@code IGNORE_CHAR} as a delimiter, and fills {@code token} with the
 * accumulated text and its source offsets.
 *
 * {@inheritDoc}
 */
@Override
public Token next(Token token) throws IOException {
    // Reuse the shared StringBuilder across calls.
    currentToken.setLength(0);

    int startOffset = offset;
    int endOffset = offset;
    boolean tokenStarted = false;
    while (true) {
        int c = reader.read();
        if (c == -1) {
            // EOF: the previous character was the last one of the token.
            endOffset = offset - 1;
            break;
        }

        // NOTE(review): c indexes IGNORE_CHAR directly; assumes the table
        // covers every character the reader can produce — verify its size.
        if (IGNORE_CHAR[c]) {
            if (tokenStarted) {
                // Delimiter terminates the in-progress token.
                endOffset = offset;
                ++offset;
                break;
            }

            // Still skipping leading delimiters.
            ++offset;
            continue;
        }

        if (!tokenStarted) {
            startOffset = offset;
            tokenStarted = true;
        }

        currentToken.append((char) c);
        ++offset;
    }

    if (currentToken.length() == 0) {
        return null; // nothing but delimiters (or EOF) remained
    }

    token.clear();
    token.setTermText(currentToken.toString());
    token.setTermLength(currentToken.length());
    token.setStartOffset(startOffset);
    token.setEndOffset(endOffset);

    return token;
}