Example usage for org.apache.lucene.analysis Token startOffset

List of usage examples for org.apache.lucene.analysis Token startOffset

Introduction

On this page you can find example usages of org.apache.lucene.analysis Token#startOffset.

Prototype

@Override
public final int startOffset() 

Source Link

Usage

From source file:analysis.StandardTokenizer.java

License:Apache License

/**
 * Returns the next token from the scanner, or null at end of input.
 * Tokens of type HOST, NUM or ALPHANUM are additionally split on
 * letter/digit boundaries by LetterDigitBreakTokenizer; the first
 * sub-token is returned immediately and the remainder are queued in
 * {@code tokenList} to be drained by subsequent calls.
 *
 * @param reusableToken token instance to (re)populate; must not be null
 * @return the next token, or null when the scanner is exhausted
 * @throws IOException if the underlying scanner fails
 */
@SuppressWarnings("deprecation")
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
    Token result = reusableToken;
    // Drain sub-tokens queued by a previous call before scanning new input.
    if (tokenList.size() > 0)
        return tokenList.remove();
    while (true) {
        int tokenType = scanner.getNextToken();

        if (tokenType == StandardTokenizerImpl.YYEOF) {
            // End of input.
            return null;
        }

        if (scanner.yylength() <= maxTokenLength) {
            reusableToken.clear();
            reusableToken.setPositionIncrement(posIncr);
            scanner.getText(reusableToken);
            final int start = scanner.yychar();
            reusableToken.setStartOffset(start);
            reusableToken.setEndOffset(start + reusableToken.termLength());
            // This 'if' should be removed in the next release. For now, it
            // converts invalid acronyms to HOST. When removed, only the
            // 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
                if (replaceInvalidAcronym) {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
                    tokenType = StandardTokenizerImpl.HOST;
                } else {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                    tokenType = StandardTokenizerImpl.ACRONYM;
                }
            } else {

                reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }

            // Split hosts, numbers and alphanumerics on letter/digit
            // boundaries; queue every piece, then return the first one.
            if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM
                    || tokenType == StandardTokenizerImpl.ALPHANUM) {

                Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
                Token tk = null;
                int st = reusableToken.startOffset();
                final Token token = new Token();
                while ((tk = lt.next(token)) != null) {
                    // Sub-token offsets are relative to the parent term text;
                    // shift them back into the original input's coordinates.
                    tk.setStartOffset(tk.startOffset() + st);
                    tk.setEndOffset(tk.endOffset() + st);
                    tk.setType(reusableToken.type());
                    tokenList.add((Token) tk.clone());
                }
            }
            if (tokenList.size() > 0)
                result = tokenList.remove();

            return result;
        } else
            // When we skip a too-long term, we still increment the
            // position increment
            posIncr++;
    }
}

From source file:cc.pp.analyzer.imdict.core.WordSegmenter.java

License:Apache License

/**
 * HHMMSegment??sentence Token???Token List
 *
 * @param sentenceToken ??Token/* w w  w. jav  a2 s.co m*/
 * @param shortPathCount HHMM????
 * @return ?Token List
 */
public List<Token> segmentSentence(Token sentenceToken, int shortPathCount) {

    String sentence = sentenceToken.term();

    List<SegToken> segTokenList = hhmmSegmenter.process(sentence);

    List<Token> result = new ArrayList<Token>();

    // i1rawTokens.length-2##?##?RawToken
    for (int i = 1; i < segTokenList.size() - 1; i++) {
        result.add(convertSegToken(segTokenList.get(i), sentence, sentenceToken.startOffset(), "word"));
    }
    return result;

}

From source file:cc.pp.analyzer.paoding.analyzer.impl.MaxWordLengthTokenCollector.java

License:Apache License

/**
 * Collects a candidate word covering [offset, end) of the input, keeping
 * only maximal-length words: a word that starts at the same offset as, or
 * is fully contained in, a longer word is discarded.
 *
 * @param word   the matched word text
 * @param offset start offset of the word in the input
 * @param end    end offset (exclusive) of the word in the input
 */
@Override
public void collect(String word, int offset, int end) {
    // Compare against the pending candidate if any, else the last seen token.
    Token c = candidate != null ? candidate : last;
    if (c == null) {
        // First word seen: it becomes the candidate.
        candidate = new Token(word, offset, end);
    } else if (offset == c.startOffset()) {
        // Same start as the reference token: keep only the longer word.
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        }
    } else if (offset > c.startOffset()) {
        // Word starts after the reference: the pending candidate is final.
        if (candidate != null) {
            select(candidate);
        }
        if (end > c.endOffset()) {
            candidate = new Token(word, offset, end);
        } else {
            // Fully contained in the reference token; drop it.
            candidate = null;
        }
    } else if (end >= c.endOffset()) {
        // Word starts earlier and covers the reference: purge previously
        // collected tokens that this word subsumes, then make it the candidate.
        if (last != null && last.startOffset() >= offset && last.endOffset() <= end) {
            for (Iterator/* <Token> */<Token> iter = tokens.iterator(); iter.hasNext();) {
                last = iter.next();
                if (last.startOffset() >= offset && last.endOffset() <= end) {
                    iter.remove();
                }
            }
        }
        last = null;
        candidate = new Token(word, offset, end);
    }
}

From source file:ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SeparatorSplitterTokenFilter.java

License:Apache License

/**
 * Appends a sub-token of {@code token} covering [startPos, endPos) of its
 * term buffer, with offsets shifted by the parent token's start offset.
 * Empty or inverted ranges are ignored.
 *
 * @param token    the parent token whose buffer and offsets are used
 * @param startPos start position within the parent's term buffer
 * @param endPos   end position (exclusive) within the parent's term buffer
 */
private void addToken(Token token, int startPos, int endPos) {
    if (startPos >= endPos) {
        return; // nothing to add for an empty range
    }
    final int base = token.startOffset();
    Token newToken =
            new Token(token.termBuffer(), startPos, endPos - startPos, base + startPos, base + endPos);
    tokens.add(newToken);
}

From source file:com.fdt.sdl.core.analyzer.phonetix.lucene.PhoneticFilter.java

License:Open Source License

/**
 * Returns the next token in the stream, or <code>null</code> at EOF.
 * With a single encoder the input token is transformed directly; with
 * multiple encoders each input token is emitted once per encoder, the
 * generated key prefixed with the encoder's name ("encoder:key"). A null
 * entry in the encoder array passes the token through unchanged.
 */
public Token next() throws IOException {
    if (encoder == null) {
        // No encoders configured: act as a pass-through filter.
        return input.next();
    } else if (encoder.length == 1) { // optimize, if one encoder only
        final Token t = input.next();
        if (t == null)
            return null;
        return (encoder[0] != null)
                ? new Token(encoder[0].generateKey(t.termText()), t.startOffset(), t.endOffset(), t.type())
                : t;
    } else {
        ++actualIndex;

        // get next token, if necessary
        if (actualIndex >= encoder.length) {
            actualToken = input.next();
            if (actualToken == null) {
                // Park the index at the end so repeated calls keep returning null.
                actualIndex = encoder.length;
                return null;
            }
            actualIndex = 0;
        }

        if (encoder[actualIndex] == null)
            return actualToken;
        else
            return new Token(
                    encoder[actualIndex].toString() + ":"
                            + encoder[actualIndex].generateKey(actualToken.termText()),
                    actualToken.startOffset(), actualToken.endOffset(), actualToken.type());
    }
}

From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java

License:Apache License

/**
 * Merges two tokens into a new token whose text is "&lt;t1&gt;_&lt;t2&gt;"
 * and whose offsets span from t1's start offset to t2's end offset.
 * If {@code t1} is null, {@code t2} is returned unchanged.
 *
 * @param t1 first token, may be null
 * @param t2 second token; assumed non-null when t1 is non-null — TODO confirm callers
 * @return the merged token, or {@code t2} when {@code t1} is null
 */
private Token mergeTokens(Token t1, Token t2) {
    if (null == t1) {
        return t2;
    }
    // Allocate only after the null check so the short-circuit path
    // does not create a Token that is immediately discarded.
    Token res = new Token();
    char[] text = (TokenUtil.termText(t1) + "_" + TokenUtil.termText(t2)).toCharArray();
    res.reinit(text, 0, text.length, t1.startOffset(), t2.endOffset());
    return res;
}

From source file:com.flaptor.hounder.searcher.PhraseMatchingFragmenter.java

License:Apache License

/**
 * Decides whether {@code token} starts a new fragment: true when
 * {@code lineBreaker} reports a break between the previous token's end
 * offset and this token's start offset. Updates {@code lastOffset} as a
 * side effect.
 *
 * @param token the token being examined
 * @return true if a new fragment should begin at this token
 */
public boolean isNewFragment(Token token) {
    final boolean startsNewFragment = lineBreaker(lastOffset, token.startOffset());
    logger.debug("token: " + TokenUtil.termText(token));
    if (startsNewFragment) {
        logger.debug("BREAK!");
    }
    lastOffset = token.endOffset();
    return startsNewFragment;
}

From source file:com.globalsight.ling.lucene.analysis.GSTokenizer.java

License:Apache License

/**
 * Advances to the next token: pulls a legacy Token from {@code next()} and
 * copies its term text, offsets and position increment into the
 * attribute-based API.
 *
 * @return false when the stream is exhausted, true otherwise
 * @throws IOException if reading the next token fails
 */
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    final Token nextToken = next();
    if (nextToken == null) {
        return false;
    }
    gsAtt.setToken(nextToken);
    termAtt.append(nextToken.toString());
    offsetAtt.setOffset(nextToken.startOffset(), nextToken.endOffset());
    posIncrAtt.setPositionIncrement(nextToken.getPositionIncrement());
    return true;
}

From source file:com.globalsight.ling.lucene.analysis.ngram.NgramAnalyzer.java

License:Apache License

/**
 * Smoke test: prints each trigram produced by NgramTokenizer for
 * {@code p_text}, together with its start/end offsets.
 *
 * @param p_text text to tokenize
 * @throws java.io.IOException if tokenization fails
 */
static void test(String p_text) throws java.io.IOException {
    // Removed an unused NgramAnalyzer local that was constructed and
    // never referenced.
    NgramTokenizer y = new NgramTokenizer(new java.io.StringReader(p_text), 3);

    System.out.println("Text = " + p_text);

    Token t;
    while ((t = y.next()) != null) {
        System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")");
    }
}

From source file:com.globalsight.ling.lucene.analysis.ngram.NgramNoPunctuationAnalyzer.java

License:Apache License

/**
 * Smoke test: prints each trigram produced by NgramNoPunctuationTokenizer
 * for {@code p_text}, together with its start/end offsets.
 *
 * @param p_text text to tokenize
 * @throws java.io.IOException if tokenization fails
 */
static void test(String p_text) throws java.io.IOException {
    // Removed an unused Analyzer local that was constructed and never
    // referenced.
    NgramNoPunctuationTokenizer y = new NgramNoPunctuationTokenizer(new java.io.StringReader(p_text), 3);

    System.out.println("Text = " + p_text);

    Token t;
    while ((t = y.next()) != null) {
        System.out.println(t.toString() + " (" + t.startOffset() + ":" + t.endOffset() + ")");
    }
}