Example usage for org.apache.lucene.analysis Token Token

List of usage examples for org.apache.lucene.analysis Token Token

Introduction

On this page you can find example usages of the org.apache.lucene.analysis Token constructor shown below.

Prototype

public Token(CharSequence text, int posInc, int start, int end, int posLength) 

Source Link

Usage

From source file:cc.pp.analyzer.imdict.core.WordSegmenter.java

License:Apache License

/**
 * Converts a {@code SegToken} produced by the word segmenter into a Lucene
 * {@code Token}, shifting its offsets from sentence-relative to
 * document-relative coordinates.
 *
 * For STRING/NUMBER word types (and their full-width variants) the token
 * text is re-extracted from the original sentence so the surface form is
 * preserved; other word types keep the char array already on the SegToken.
 *
 * @param st the segmented token to convert
 * @param sentence the sentence the token was cut from
 * @param sentenceStartOffset offset of the sentence within the whole document
 * @param type token type label — NOTE(review): currently unused; confirm
 *        whether it should be set on the resulting Token
 * @return a Lucene Token with the same text and document-level offsets
 */
public Token convertSegToken(SegToken st, String sentence, int sentenceStartOffset, String type) {
    Token result;
    switch (st.wordType) {
    case STRING:
    case NUMBER:
    case FULLWIDTH_NUMBER:
    case FULLWIDTH_STRING:
        // Re-slice the original sentence text for these word types.
        st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
        break;
    default:
        break;
    }

    // Apply the configured token filter (may normalize or replace the token).
    st = tokenFilter.filter(st);

    result = new Token(st.charArray, 0, st.charArray.length, st.startOffset + sentenceStartOffset,
            st.endOffset + sentenceStartOffset);
    return result;
}

From source file:ch.systemsx.cisd.openbis.generic.server.dataaccess.db.search.SeparatorSplitterTokenFilter.java

License:Apache License

/**
 * Emits the sub-token of {@code token} covering {@code [startPos, endPos)} of
 * its term buffer, translating the slice into absolute document offsets.
 * Empty slices (startPos &gt;= endPos) are silently ignored.
 */
private void addToken(Token token, int startPos, int endPos) {
    if (startPos >= endPos) {
        return; // nothing to emit for an empty slice
    }
    final int base = token.startOffset();
    tokens.add(new Token(token.termBuffer(), startPos, endPos - startPos, base + startPos, base + endPos));
}

From source file:com.mhs.qsol.proximity.ProximityVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate span query.
 *
 * The token text is run through the configured analyzer; the resulting terms
 * determine the query shape: no terms -> {@code null}, one term -> a single
 * SpanTermQuery, several terms stacked at one position (synonyms) -> a
 * SpanOrQuery, otherwise an in-order SpanNearQuery whose per-position
 * alternatives are OR-grouped.
 *
 * @param token raw token text (escape characters still present)
 * @return the resulting query, or {@code null} if analysis produced no terms
 */
protected Query tokenToQuery(String token) {
    if (logger.isLoggable(Level.FINE)) {
        logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    // Drain the analyzer's stream, tracking how many distinct positions exist
    // and whether any terms share a position (position increment of 0).
    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            // Treat an analysis failure as end-of-stream.
            t = null;
        }

        if (t == null) {
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // best-effort close; tokens have already been collected
    }

    if (v.size() == 0) {
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        stq.setBoost(this.boost);
        return stq;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // All terms share one position (synonyms): OR them together,
                // no phrase semantics needed.
                SpanQuery[] spanQueries = new SpanQuery[v.size()];

                for (int i = 0; i < v.size(); i++) {
                    // BUGFIX: previously every clause was built from an empty
                    // StringBuilder ("regex") that was never appended to; use
                    // the analyzed term text instead (matches the sibling
                    // implementation in QsolToQueryVisitor).
                    Token cur = v.get(i);
                    SpanTermQuery stq = new SpanTermQuery(
                            new Term(field, new String(cur.buffer(), 0, cur.length())));
                    stq.setBoost(this.boost);
                    spanQueries[i] = stq;
                }

                return new SpanOrQuery(spanQueries);
            } else {
                // Group tokens into sub-lists; all the Tokens in each
                // sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                // Each position contributes one SpanNear subclause: a single
                // term, or a SpanOr over the alternatives at that position.
                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                // In-order phrase match across the per-position subclauses.
                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);

                return query;
            }
        } else {
            // Distinct positions, one term each: a straightforward ordered phrase.
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length())));
            }

            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.mhs.qsol.QsolToQueryVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query.
 *
 * The token text is run through the configured analyzer; the resulting terms
 * determine the query shape: no terms -> {@code null}, one term -> TermQuery,
 * several terms stacked at one position (synonyms) -> a BooleanQuery of
 * SHOULD clauses, otherwise an in-order SpanNearQuery whose per-position
 * alternatives are OR-grouped.
 *
 * @param token raw token text (escape characters still present)
 * @return the resulting query, or {@code null} if analysis produced no terms
 */
protected Query tokenToQuery(String token) {

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    // Drain the analyzer's stream into v, tracking how many distinct positions
    // exist and whether any terms share a position (position increment of 0).
    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            // Treat an analysis failure as end-of-stream.
            t = null;
        }

        if (t == null) {
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // best-effort close; tokens have already been collected
    }

    if (v.size() == 0) {
        // nulls will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {

        t = v.get(0);

        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);

        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // All terms share one position (synonyms): OR them together,
                // no phrase query needed.
                BooleanQuery q = new BooleanQuery(true);

                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);

                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);

                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }

                return q;
            } else {
                // Group tokens into sub-lists; all the Tokens in each
                // sub-list are positioned at the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                // Each position contributes one SpanNear subclause: a single
                // term, or a SpanOr over the alternatives at that position.
                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                // In-order phrase match across the per-position subclauses.
                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);

                return query;
            }
        } else {
            // Distinct positions, one term each: a straightforward ordered phrase.
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }

            // Note: There's a bug here (not by me) where term offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.zb.mmseg.analysis.CutLetterDigitFilter.java

License:Open Source License

/**
 * Queues a sub-token cut out of {@code oriToken}'s term buffer, translating
 * the buffer-relative slice into absolute document offsets and tagging it as
 * a digit or letter token based on the supplied character type.
 */
private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    final int absoluteStart = oriToken.startOffset() + termBufferOffset;
    final Token token = new Token(oriToken.buffer(), termBufferOffset, termBufferLength,
            absoluteStart, absoluteStart + termBufferLength);

    token.setType(type == Character.DECIMAL_DIGIT_NUMBER ? MMSegWord.TYPE_DIGIT : MMSegWord.TYPE_LETTER);

    tokenQueue.offer(token);
}

From source file:edu.mit.ll.vizlinc.highlight.TokenStreamFromTermPositionVector.java

License:Apache License

/**
 * Constructor.
 *
 * @param termPositionVector TermPositionVector that contains the data for
 *        creating the TokenStream. Must have positions and offsets.
 */
public TokenStreamFromTermPositionVector(final TermPositionVector termPositionVector) {
    termAttribute = addAttribute(CharTermAttribute.class);
    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = addAttribute(OffsetAttribute.class);

    final String[] terms = termPositionVector.getTerms();
    for (int termIndex = 0; termIndex < terms.length; termIndex++) {
        final String term = terms[termIndex];
        final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(termIndex);
        final int[] positions = termPositionVector.getTermPositions(termIndex);
        for (int posIndex = 0; posIndex < positions.length; posIndex++) {
            final Token token;
            if (offsets == null) {
                token = new Token();
                token.setEmpty().append(term);
            } else {
                token = new Token(term.toCharArray(), 0, term.length(), offsets[posIndex].getStartOffset(),
                        offsets[posIndex].getEndOffset());
            }
            // Temporarily store the absolute position (not the increment!) so
            // the tokens can be sorted; it is rewritten as an increment below.
            token.setPositionIncrement(positions[posIndex]);
            this.positionedTokens.add(token);
        }
    }

    CollectionUtil.mergeSort(this.positionedTokens, tokenComparator);

    // Convert the stored absolute positions into real position increments.
    int previousPosition = -1;
    for (final Token token : this.positionedTokens) {
        final int absolutePosition = token.getPositionIncrement();
        token.setPositionIncrement(absolutePosition - previousPosition);
        previousPosition = absolutePosition;
    }
    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
}

From source file:edu.uci.ics.sourcerer.search.analysis.NoTokenizer.java

License:Open Source License

/**
 * Returns the entire buffered character list as a single token, or
 * {@code null} once the buffer is empty. The buffer is cleared after the
 * token is emitted, so at most one token is ever produced per fill.
 */
public Token next() throws IOException {
    if (charArr.size() == 0) {
        return null;
    }

    // Unbox the buffered Characters into a primitive array.
    final char[] chars = new char[charSize];
    for (int index = 0; index < charSize; index++) {
        chars[index] = charArr.get(index);
    }

    final Token token = new Token(chars, 0, charSize, 0, charSize);
    charArr.clear();
    return token;
}

From source file:edu.uci.ics.sourcerer.search.analysis.SingleSpaceTokenizer.java

License:Open Source License

/**
 * Emits the next run of token characters from the buffered input, starting at
 * {@code start} and stopping at the first non-token character. Returns
 * {@code null} when the buffer is empty; clears the buffer once the final
 * token has been consumed and advances {@code start} past the separator.
 */
public Token next() throws IOException {
    if (charArr.size() == 0) {
        return null;
    }

    final char[] buf = new char[charSize];
    int length = 0;
    for (int i = 0; i + start < charSize; i++) {
        final char c = charArr.get(i + start);
        if (!isTokenChar(c)) {
            break;
        }
        buf[i] = c;
        length++;
    }

    // Token(buffer, offset, length, startOffset, endOffset)
    final Token token = new Token(buf, 0, length, start, start + length);
    if (start + length == charSize) {
        charArr.clear();
    }
    start = start + length + 1; // skip the single separator character
    return token;
}

From source file:magoffin.matt.lucene.DigitTokenizer.java

License:Open Source License

/**
 * Emits a single token containing only the digit characters of the entire
 * input; subsequent calls return {@code null}. When {@code maxLength} is
 * positive, only the trailing {@code maxLength} digits are kept.
 */
@Override
public Token next() throws IOException {
    if (complete) {
        return null;
    }

    // Read the whole input, then strip everything that is not a digit.
    final StringWriter out = new StringWriter();
    FileCopyUtils.copy(this.input, out);
    final int end = out.getBuffer().length();
    String digits = out.toString().replaceAll("\\D", "");

    if (maxLength > 0 && digits.length() > maxLength) {
        digits = digits.substring(digits.length() - maxLength);
    }
    complete = true;

    final char[] digitChars = digits.toCharArray();
    // NOTE: the end offset is the length of the raw input, not of the digits.
    return new Token(digitChars, 0, digitChars.length, 0, end);
}

From source file:magoffin.matt.lucene.KeyTokenizer.java

License:Open Source License

/**
 * Reads the key from the input and returns it as a single token; subsequent
 * calls return {@code null} once {@code complete} is set.
 *
 * When {@code trim} is enabled, leading whitespace is skipped by shifting the
 * buffer and refilling it from the stream before the key is extracted.
 */
@Override
public Token next() throws IOException {
    if (complete) {
        return null;
    }

    int numRead = input.read(buffer);
    String key = "";
    if (numRead > 0) {
        if (!trim) {
            // No trimming: the raw bytes read are the key.
            key = new String(buffer, 0, numRead);
        } else {
            if (buffer.length == 1) {
                // Single-char buffer: keep reading until a non-whitespace
                // character lands in buffer[0] (or the stream ends).
                while (Character.isWhitespace(buffer[0])) {
                    numRead = input.read(buffer);
                    if (numRead < 1) {
                        break;
                    }
                }
            } else {
                // Pad any unread tail with spaces so trim() can discard it.
                if (numRead < buffer.length) {
                    Arrays.fill(buffer, numRead, buffer.length, ' ');
                }
                int i = 0;
                while (i < buffer.length && numRead > 0) {
                    int start = i;
                    for (; i < buffer.length && Character.isWhitespace(buffer[i]); i++) {
                        // skip
                    }
                    if (i > start) {
                        // found whitespace at beginning, so discard them and 
                        // read more from stream
                        System.arraycopy(buffer, i, buffer, start, buffer.length - i);
                        numRead = input.read(buffer, buffer.length - i, i);
                    } else {
                        break;
                    }
                }
            }
            key = new String(buffer).trim();
            if (key.length() < 1) {
                // NOTE(review): complete is NOT set here, so the next call
                // re-reads from the stream — confirm that is intended.
                return null;
            }
        }
    }
    complete = true;
    char[] keyChar = key.toCharArray();
    // NOTE(review): endOffset is numRead - 1, which can be negative when the
    // stream is empty (numRead <= 0) — looks suspicious; confirm intended.
    return new Token(keyChar, 0, keyChar.length, 0, numRead - 1);
}