Example usage for org.apache.lucene.analysis Token endOffset

List of usage examples for org.apache.lucene.analysis Token endOffset

Introduction

On this page you can find example usage for org.apache.lucene.analysis Token endOffset.

Prototype

@Override
public final int endOffset() 

Source Link

Usage

From source file: analysis.StandardTokenizer.java

License: Apache License

/**
 * Returns the next token from the stream, reusing {@code reusableToken}
 * where possible. Tokens of type HOST, NUM or ALPHANUM are additionally
 * split on letter/digit boundaries by a LetterDigitBreakTokenizer; the
 * resulting sub-tokens are queued in {@code tokenList} and handed out one
 * per call before the scanner is advanced again.
 *
 * @param reusableToken a non-null token to (re)populate
 * @return the next token, or null at end of input
 * @throws IOException if the underlying scanner fails to read
 */
@SuppressWarnings("deprecation")
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
    Token result = reusableToken;
    // Drain sub-tokens queued by a previous call before scanning further.
    if (tokenList.size() > 0)
        return tokenList.remove();
    while (true) {
        int tokenType = scanner.getNextToken();

        if (tokenType == StandardTokenizerImpl.YYEOF) {
            return null;
        }

        if (scanner.yylength() <= maxTokenLength) {
            reusableToken.clear();
            reusableToken.setPositionIncrement(posIncr);
            scanner.getText(reusableToken);
            final int start = scanner.yychar();
            reusableToken.setStartOffset(start);
            reusableToken.setEndOffset(start + reusableToken.termLength());
            // This 'if' should be removed in the next release. For now, it
            // converts invalid acronyms to HOST. When removed, only the
            // 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
                if (replaceInvalidAcronym) {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    reusableToken.setTermLength(reusableToken.termLength() - 1); // drop the trailing '.'
                    tokenType = StandardTokenizerImpl.HOST;
                } else {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                    tokenType = StandardTokenizerImpl.ACRONYM;
                }
            } else {

                reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }

            // For HOST/NUM/ALPHANUM tokens, re-tokenize the term text on
            // letter/digit boundaries and queue the sub-tokens (offsets
            // shifted back into the original text) for subsequent calls.
            if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM
                    || tokenType == StandardTokenizerImpl.ALPHANUM) {

                Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
                Token tk = null;
                int st = reusableToken.startOffset();
                final Token token = new Token();
                while ((tk = lt.next(token)) != null) {
                    tk.setStartOffset(tk.startOffset() + st);
                    tk.setEndOffset(tk.endOffset() + st);
                    tk.setType(reusableToken.type());
                    tokenList.add((Token) tk.clone());
                }
            }
            // If sub-tokens were produced, return the first of them instead
            // of the whole token; the rest are served by later calls.
            if (tokenList.size() > 0)
                result = tokenList.remove();

            return result;
        } else
            // When we skip a too-long term, we still increment the
            // position increment
            posIncr++;
    }
}

From source file: cc.pp.analyzer.paoding.analyzer.impl.MaxWordLengthTokenCollector.java

License: Apache License

/**
 * Collects a word span [offset, end), keeping only maximal-length words:
 * a word contained inside the current reference span is discarded, and
 * previously selected tokens that fall entirely inside a new, wider span
 * are removed from {@code tokens}.
 *
 * @param word   the matched word text
 * @param offset the word's start offset
 * @param end    the word's end offset
 */
@Override
public void collect(String word, int offset, int end) {
    // Reference span: the pending candidate if any, else the last token.
    Token c = candidate != null ? candidate : last;
    if (c == null) {
        // First word seen: it becomes the candidate.
        candidate = new Token(word, offset, end);
    } else if (offset == c.startOffset()) {
        // Same start as the reference: keep the longer span as candidate.
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        }
    } else if (offset > c.startOffset()) {
        // Moved past the reference's start: commit the pending candidate.
        if (candidate != null) {
            select(candidate);
        }
        // The new word stays a candidate only if it extends beyond c;
        // otherwise it is contained in c and dropped.
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        } else {
            candidate = null;
        }
    } else if (end >= c.endOffset()) {
        // New word starts earlier and covers c: purge already-selected
        // tokens that lie entirely inside [offset, end), then make the
        // new word the candidate.
        if (last != null && last.startOffset() >= offset && last.endOffset() <= end) {
            for (Iterator/* <Token> */<Token> iter = tokens.iterator(); iter.hasNext();) {
                last = iter.next();
                if (last.startOffset() >= offset && last.endOffset() <= end) {
                    iter.remove();
                }
            }
        }
        last = null;
        candidate = new Token(word, offset, end);
    }
}

From source file: com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticFilter.java

License: Open Source License

/**
 * Returns the next token in the stream, or <code>null</code> at EOF.
 * With no encoders, tokens pass through unchanged. With a single encoder,
 * each token's text is replaced by its phonetic key. With multiple
 * encoders, each input token is emitted once per encoder (prefixed with
 * the encoder's name and a colon) before the next input token is fetched;
 * a null encoder slot passes the token through unchanged.
 */
public Token next() throws IOException {
    if (encoder == null) {
        // No encoders configured: plain pass-through.
        return input.next();
    } else if (encoder.length == 1) { // optimize, if one encoder only
        final Token t = input.next();
        if (t == null)
            return null;
        return (encoder[0] != null)
                ? new Token(encoder[0].generateKey(t.termText()), t.startOffset(), t.endOffset(), t.type())
                : t;
    } else {
        ++actualIndex;

        // get next token, if necessary
        if (actualIndex >= encoder.length) {
            actualToken = input.next();
            if (actualToken == null) {
                // Saturate the index so later calls re-poll the input.
                actualIndex = encoder.length;
                return null;
            }
            actualIndex = 0;
        }

        if (encoder[actualIndex] == null)
            return actualToken;
        else
            return new Token(
                    encoder[actualIndex].toString() + ":"
                            + encoder[actualIndex].generateKey(actualToken.termText()),
                    actualToken.startOffset(), actualToken.endOffset(), actualToken.type());
    }
}

From source file: com.flaptor.hounder.classifier.util.TupleTokenizer.java

License: Apache License

/**
 * Merges two consecutive tokens into one whose text is "&lt;t1&gt;_&lt;t2&gt;",
 * spanning from t1's start offset to t2's end offset.
 *
 * @param t1 the first token; if null, t2 is returned as-is
 * @param t2 the second token; if null, t1 is returned as-is
 * @return the merged token, or the non-null argument when the other is null
 */
private Token mergeTokens(Token t1, Token t2) {
    if (null == t1) {
        return t2;
    }
    if (null == t2) {
        // Symmetric guard: previously a null t2 triggered a NullPointerException.
        return t1;
    }
    // Allocate the result only after the guards (the original allocated it
    // unconditionally, wasting an object on the early-return path).
    Token res = new Token();
    char[] text = (TokenUtil.termText(t1) + "_" + TokenUtil.termText(t2)).toCharArray();
    res.reinit(text, 0, text.length, t1.startOffset(), t2.endOffset());
    return res;
}

From source file: com.flaptor.hounder.searcher.PhraseMatchingFragmenter.java

License: Apache License

/**
 * Decides whether {@code token} starts a new highlight fragment, based on
 * whether a line break falls between the previous token's end offset and
 * this token's start offset. Updates {@code lastOffset} as a side effect.
 *
 * @param token the token under consideration
 * @return true when a new fragment should begin at this token
 */
public boolean isNewFragment(Token token) {
    final boolean startsFragment = lineBreaker(lastOffset, token.startOffset());
    logger.debug("token: " + TokenUtil.termText(token));
    if (startsFragment) {
        logger.debug("BREAK!");
    }
    // Remember where this token ends for the next call.
    lastOffset = token.endOffset();
    return startsFragment;
}

From source file: com.globalsight.ling.lucene.analysis.GSTokenizer.java

License: Apache License

/**
 * Adapts the legacy {@code next()} token producer to the attribute-based
 * TokenStream API: copies the produced token's text, offsets and position
 * increment into the corresponding attributes.
 *
 * @return true if a token was produced, false at end of stream
 * @throws IOException if the underlying tokenizer fails
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    final Token produced = next();
    if (produced == null) {
        return false;
    }
    gsAtt.setToken(produced);
    termAtt.append(produced.toString());
    offsetAtt.setOffset(produced.startOffset(), produced.endOffset());
    posIncrAtt.setPositionIncrement(produced.getPositionIncrement());
    return true;
}

From source file: com.globalsight.ling.lucene.analysis.ngram.NgramAnalyzer.java

License: Apache License

/**
 * Ad-hoc smoke test: tokenizes {@code p_text} into 3-grams and prints each
 * token with its (start:end) offsets to stdout.
 *
 * @param p_text the text to tokenize
 * @throws java.io.IOException if reading the text fails
 */
static void test(String p_text) throws java.io.IOException {
    // NOTE: the original also constructed an unused NgramAnalyzer(3); removed.
    NgramTokenizer y = new NgramTokenizer(new java.io.StringReader(p_text), 3);

    System.out.println("Text = " + p_text);

    Token t;
    while ((t = y.next()) != null) {
        System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")");
    }
}

From source file: com.globalsight.ling.lucene.analysis.ngram.NgramNoPunctuationAnalyzer.java

License: Apache License

/**
 * Ad-hoc smoke test: tokenizes {@code p_text} into punctuation-free 3-grams
 * and prints each token with its (start:end) offsets to stdout.
 *
 * @param p_text the text to tokenize
 * @throws java.io.IOException if reading the text fails
 */
static void test(String p_text) throws java.io.IOException {
    // NOTE: the original also constructed an unused NgramNoPunctuationAnalyzer(3); removed.
    NgramNoPunctuationTokenizer y = new NgramNoPunctuationTokenizer(new java.io.StringReader(p_text), 3);

    System.out.println("Text = " + p_text);

    Token t;
    while ((t = y.next()) != null) {
        System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")");
    }
}

From source file: com.globalsight.ling.lucene.analysis.pl.PolishFilter.java

License: Apache License

/**
 * Returns the next input token after stemming its text, or null at end of
 * stream. When stemming leaves the text unchanged, the input token is
 * returned as-is; otherwise a new token carrying the stemmed text is built
 * with the original offsets, type, and position increment.
 */
public final Token next() throws IOException {
    final Token token = getNextToken();
    if (token == null) {
        return null;
    }

    final String original = token.toString();
    final String stemmed = stemmer.stem(original, true);
    if (stemmed.equals(original)) {
        // Stemming changed nothing; pass the token through untouched.
        return token;
    }

    // Rebuild the token around the stemmed text, preserving offsets,
    // type, and position increment from the input token.
    final Token stemmedToken = new Token(stemmed, token.startOffset(), token.endOffset(), token.type());
    stemmedToken.setPositionIncrement(token.getPositionIncrement());
    return stemmedToken;
}

From source file: com.globalsight.ling.lucene.analysis.ru.RussianLowerCaseFilter.java

License: Apache License

/**
 * Returns the next token with its text lower-cased character by character
 * via {@code RussianCharsets.toLowerCase} using the configured charset,
 * or null at end of stream.
 *
 * @return the lower-cased token, or null at EOF
 * @throws java.io.IOException if the underlying stream fails
 */
public final Token next() throws java.io.IOException {
    Token t = getNextToken();

    if (t == null)
        return null;

    String txt = t.toString();

    char[] chArray = txt.toCharArray();
    for (int i = 0; i < chArray.length; i++) {
        chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
    }

    String newTxt = new String(chArray);
    // Rebuild the token preserving type and position increment; the
    // original dropped both, which loses information downstream filters
    // may rely on (the sibling PolishFilter in this file preserves them).
    Token newToken = new Token(newTxt, t.startOffset(), t.endOffset(), t.type());
    newToken.setPositionIncrement(t.getPositionIncrement());

    return newToken;
}