Example usage for org.apache.lucene.analysis Token setPositionIncrement

List of usage examples for org.apache.lucene.analysis Token setPositionIncrement

Introduction

On this page you can find example usages of org.apache.lucene.analysis.Token.setPositionIncrement.

Prototype

@Override
public void setPositionIncrement(int positionIncrement) 

Source Link

Usage

From source file:analysis.StandardTokenizer.java

License:Apache License

/**
 * Returns the next token, splitting HOST, NUM and ALPHANUM tokens further with a
 * {@code LetterDigitBreakTokenizer}.
 *
 * <p>Sub-tokens produced by a split are queued in {@code tokenList}; queued tokens
 * are handed out first on subsequent calls. Scanner tokens longer than
 * {@code maxTokenLength} are skipped, but each skipped token still increases the
 * position increment applied to the next token written into {@code reusableToken}.
 *
 * @param reusableToken token instance that is cleared and refilled; must not be null
 * @return a queued sub-token, the refilled {@code reusableToken}, or {@code null}
 *         once the scanner reports end of input
 * @throws IOException if the underlying scanner or the splitting tokenizer fails
 */
@SuppressWarnings("deprecation")
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;

    // Hand out sub-tokens left over from a previous split before scanning again.
    if (!tokenList.isEmpty()) {
        return tokenList.remove();
    }

    int positionIncrement = 1;
    for (int scannedType = scanner.getNextToken();
            scannedType != StandardTokenizerImpl.YYEOF;
            scannedType = scanner.getNextToken()) {

        if (scanner.yylength() > maxTokenLength) {
            // A too-long term is dropped, but it still occupies a position.
            positionIncrement++;
            continue;
        }

        reusableToken.clear();
        reusableToken.setPositionIncrement(positionIncrement);
        scanner.getText(reusableToken);
        final int startOffset = scanner.yychar();
        reusableToken.setStartOffset(startOffset);
        reusableToken.setEndOffset(startOffset + reusableToken.termLength());

        // The deprecated-acronym handling should be removed in the next release.
        // For now it converts invalid acronyms to HOST; once removed, only the
        // plain "type = scanned type" path should remain.
        final boolean deprecatedAcronym = scannedType == StandardTokenizerImpl.ACRONYM_DEP;
        int resolvedType = scannedType;
        if (deprecatedAcronym) {
            resolvedType = replaceInvalidAcronym ? StandardTokenizerImpl.HOST : StandardTokenizerImpl.ACRONYM;
        }
        reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[resolvedType]);
        if (deprecatedAcronym && replaceInvalidAcronym) {
            // Remove the extra trailing '.'.
            reusableToken.setTermLength(reusableToken.termLength() - 1);
        }

        final boolean splittable = resolvedType == StandardTokenizerImpl.HOST
                || resolvedType == StandardTokenizerImpl.NUM
                || resolvedType == StandardTokenizerImpl.ALPHANUM;
        if (splittable) {
            final Tokenizer splitter = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
            final int baseOffset = reusableToken.startOffset();
            final Token scratch = new Token();
            for (Token part = splitter.next(scratch); part != null; part = splitter.next(scratch)) {
                // Sub-token offsets are relative to the term; shift them into the input.
                part.setStartOffset(part.startOffset() + baseOffset);
                part.setEndOffset(part.endOffset() + baseOffset);
                part.setType(reusableToken.type());
                // The scratch token is reused by the splitter, so queue a copy.
                tokenList.add((Token) part.clone());
            }
        }

        return tokenList.isEmpty() ? reusableToken : tokenList.remove();
    }

    return null;
}

From source file:com.globalsight.ling.lucene.analysis.pl.PolishFilter.java

License:Apache License

/**
 * Returns the next input token after stemming it.
 *
 * @return {@code null} when {@code getNextToken()} yields no token; the input token
 *         itself when stemming leaves its text unchanged; otherwise a new token
 *         holding the stemmed text with the input token's offsets, type and
 *         position increment
 * @throws IOException if reading the next input token fails
 */
public final Token next() throws IOException {
    final Token source = getNextToken();
    if (source == null) {
        return null;
    }

    final String stemmed = stemmer.stem(source.toString(), true);
    if (stemmed.equals(source.toString())) {
        return source;
    }

    // Rebuild the token around the stemmed text, carrying over offsets, type
    // and position increment from the input token.
    final Token replacement = new Token(stemmed, source.startOffset(), source.endOffset(), source.type());
    replacement.setPositionIncrement(source.getPositionIncrement());
    return replacement;
}

From source file:com.mhs.qsol.proximity.ProximityVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query./*from w w  w . ja  v  a2  s.c o m*/
 * 
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {
    if (logger.isLoggable(Level.FINE)) {
        // logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    if (logger.isLoggable(Level.FINE)) {
        logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }

        if (t == null) {
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        stq.setBoost(this.boost);
        return stq;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                SpanQuery[] spanQueries = new SpanQuery[v.size()];

                StringBuilder regex = new StringBuilder();

                for (int i = 0; i < v.size(); i++) {
                    spanQueries[i] = new SpanTermQuery(new Term(field, regex.toString()));
                }

                return new SpanOrQuery(spanQueries);
            } else {
                // All the Tokens in each sub-list are positioned at the the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);

                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length())));
            }

            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.mhs.qsol.QsolToQueryVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query./* w ww. j  av a  2s.  com*/
 * 
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }

        if (t == null) {
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        // null's will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {

        t = v.get(0);

        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);

        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                BooleanQuery q = new BooleanQuery(true);

                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);

                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);

                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }

                return q;
            } else {
                // All the Tokens in each sub-list are positioned at the the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);

                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }

            // Note: There's a bug here (not by me) that where term offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.stimulus.archiva.search.EmailFilter.java

License:Open Source License

/**
 * Pushes one sub-token per extracted part of an e-mail address token onto
 * {@code emailTokenStack}.
 *
 * <p>Angle brackets and double quotes are stripped from the token text before it
 * is handed to {@code extractEmailParts}. Every non-null part is trimmed and
 * pushed as a token with the original token's offsets and a position increment
 * of 0.
 *
 * @param token the token holding the e-mail address text
 * @throws IOException declared by the original signature; kept for callers
 */
private void putPart(Token token) throws IOException {
    // replace() matches its arguments literally, so no regex is compiled per call.
    String emailAddress = token.termText()
            .replace("<", "")
            .replace(">", "")
            .replace("\"", "");

    String[] parts = extractEmailParts(emailAddress);

    for (String part : parts) {
        if (part != null) {
            Token subToken = new Token(part.trim(), token.startOffset(), token.endOffset());
            subToken.setPositionIncrement(0);
            emailTokenStack.push(subToken);
        }
    }
}

From source file:edu.mit.ll.vizlinc.highlight.TokenStreamFromTermPositionVector.java

License:Apache License

/**
 * Builds the stream by turning every (term, position) entry of the vector into a
 * token, sorting the tokens with {@code tokenComparator}, and then rewriting each
 * token's absolute position into an increment relative to the token before it.
 *
 * @param termPositionVector TermPositionVector that contains the data for
 *        creating the TokenStream. The result of {@code getTermPositions} is
 *        dereferenced for every term; when {@code getOffsets} returns
 *        {@code null} for a term, its tokens are created without offsets.
 */
public TokenStreamFromTermPositionVector(final TermPositionVector termPositionVector) {
    termAttribute = addAttribute(CharTermAttribute.class);
    positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
    offsetAttribute = addAttribute(OffsetAttribute.class);

    final String[] terms = termPositionVector.getTerms();
    for (int termIndex = 0; termIndex < terms.length; termIndex++) {
        final String text = terms[termIndex];
        final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(termIndex);
        final int[] positions = termPositionVector.getTermPositions(termIndex);

        for (int occurrence = 0; occurrence < positions.length; occurrence++) {
            final Token token;
            if (offsets == null) {
                token = new Token();
                token.setEmpty().append(text);
            } else {
                final TermVectorOffsetInfo span = offsets[occurrence];
                token = new Token(text.toCharArray(), 0, text.length(), span.getStartOffset(),
                        span.getEndOffset());
            }
            // Deliberately store the absolute position in the increment field: it
            // is only a sort key here and is converted to a real increment below.
            token.setPositionIncrement(positions[occurrence]);
            this.positionedTokens.add(token);
        }
    }

    CollectionUtil.mergeSort(this.positionedTokens, tokenComparator);

    // Convert absolute positions into increments relative to the previous token.
    int previousPosition = -1;
    for (final Token token : this.positionedTokens) {
        final int absolutePosition = token.getPositionIncrement();
        token.setPositionIncrement(absolutePosition - previousPosition);
        previousPosition = absolutePosition;
    }

    this.tokensAtCurrentPosition = this.positionedTokens.iterator();
}

From source file:edu.uci.ics.sourcerer.search.analysis.CamelCaseSplitFilter.java

License:Open Source License

/**
 * Splits a token's term at camel-case boundaries and appends the resulting
 * sub-tokens to {@code tokens}.
 *
 * <p>Only pairs of adjacent letters are examined. A lower-to-upper transition ends
 * the current segment just before the upper-case letter. An upper-to-lower
 * transition ends the current segment before the last upper-case letter, so that
 * letter starts the next segment. If no split happened on a non-empty term,
 * nothing is added.
 *
 * <p>NOTE(review): {@code newTok(token, from, to)} is defined elsewhere; the
 * examples in the comments below suggest it builds a sub-token from
 * {@code buffer[from..to)} — confirm.
 *
 * <p>NOTE(review): for a zero-length term the loop never runs, leaving
 * {@code start == 0}, {@code end == 0} and {@code i == 0}; the final check
 * {@code end == i - 1} is then false and {@code newTok(token, 0, 1)} is still
 * added — confirm whether empty terms can reach this method.
 *
 * @param token the token whose term buffer is scanned
 */
protected void decompose(Token token) {

    // start: first index of the segment currently being built;
    // end: index of the last character already scanned.
    int start = 0, end = 0;

    char[] buffer = token.termBuffer();

    int i = 0;
    for (; i < token.termLength(); i++) {

        // only compare two consecutive letters
        if (i > 0 && Character.isLetter(buffer[i]) && Character.isLetter(buffer[i - 1])) {

            // lower -> upper
            if (Character.isLowerCase(buffer[i - 1]) && Character.isUpperCase(buffer[i])) {

                // create a new token up to buffer[i-1]; at this point end == i - 1
                tokens.add(newTok(token, start, end + 1));
                start = i;

            }
            // upper -> lower
            else if (Character.isUpperCase(buffer[i - 1]) && Character.isLowerCase(buffer[i])) {

                // Only split when the current segment has more than the one
                // upper-case letter at i-1; this also guarantees i >= 2 below.
                if (start < i - 1) {
                    // create a new token up to buffer[i-2]
                    tokens.add(newTok(token, start, i - 1));

                    // also go back and check for consecutive Upper letters
                    // and create a token if more than one found
                    // URIs will produce UR, URI, Is
                    // DBPool will produce DB, DBP, Pool
                    if (Character.isLetter(buffer[i - 2]) && Character.isUpperCase(buffer[i - 2])) {
                        // The longer variant shares the position of the token just added.
                        Token _tok = newTok(token, start, i);
                        _tok.setPositionIncrement(0);
                        tokens.add(_tok);
                    }

                    start = i - 1;

                }
            } // end upper -> lower
        } // end compare consecutive letters

        end = i;
    } // end for

    if (start == 0 && end == i - 1) {
        // no camel case found
    } else {
        // add the last token
        tokens.add(newTok(token, start, end + 1));
    }

}

From source file:edu.uci.ics.sourcerer.search.analysis.LetterDigitSplitFilter.java

License:Open Source License

/**
 * Splits a token's term wherever a letter is directly followed by a digit or a
 * digit by a letter, appending the resulting sub-tokens to {@code tokens}.
 *
 * <p>If at least one split happened, the trailing segment is added as well and,
 * when {@code preserveOriginal > 0}, a token built from the whole term is added
 * with a position increment of 0. A non-empty term without any letter/digit
 * boundary adds nothing.
 *
 * <p>NOTE(review): for a zero-length term the loop never runs, so the "unsplit"
 * check below is false and {@code newTok(token, 0, 1)} is still added — confirm
 * whether empty terms can reach this method.
 *
 * @param token the token whose term buffer is scanned
 */
protected void decompose(Token token) {
    final char[] buffer = token.termBuffer();

    int segmentStart = 0;
    int lastScanned = 0;
    int index = 0;

    while (index < token.termLength()) {
        if (index > 0) {
            final char previous = buffer[index - 1];
            final char current = buffer[index];
            final boolean letterThenDigit = Character.isLetter(previous) && Character.isDigit(current);
            final boolean digitThenLetter = Character.isDigit(previous) && Character.isLetter(current);

            if (letterThenDigit || digitThenLetter) {
                // Close the segment that ends at buffer[index - 1].
                tokens.add(newTok(token, segmentStart, lastScanned + 1));
                segmentStart = index;
            }
        }

        lastScanned = index;
        index++;
    }

    final boolean unsplit = segmentStart == 0 && lastScanned == index - 1;
    if (unsplit) {
        // either all numbers or letters
        return;
    }

    // add the last token
    tokens.add(newTok(token, segmentStart, lastScanned + 1));

    if (preserveOriginal > 0) {
        final Token whole = newTok(token, 0, token.termLength());
        whole.setPositionIncrement(0);
        tokens.add(whole);
    }
}

From source file:edu.usu.cosl.analysis.es.SpanishPorterFilterFactory.java

License:Apache License

/**
 * Returns the next input token, stemmed unless it is a protected word.
 *
 * @return {@code null} at end of input; the input token itself when its text is
 *         contained in {@code protWords} or when stemming leaves the text
 *         unchanged; otherwise a new token holding the stemmed text with the
 *         input token's offsets, type and position increment
 * @throws IOException if reading from the wrapped input fails
 */
@Override
public Token next() throws IOException {
    final Token original = input.next();
    if (original == null) {
        return null;
    }

    final String text = original.termText();

    // if protected, don't stem.  use this to avoid stemming collisions.
    final boolean isProtected = protWords != null && protWords.contains(text);
    if (isProtected) {
        return original;
    }

    stemmer.setCurrent(text);
    stemmer.stem();
    final String stemmed = stemmer.getCurrent();
    if (text.equals(stemmed)) {
        return original;
    }

    // TODO: it would be nice if I could just set termText directly like
    // lucene packages can.
    final Token replacement = new Token(stemmed, original.startOffset(), original.endOffset(), original.type());
    replacement.setPositionIncrement(original.getPositionIncrement());
    return replacement;
}

From source file:gpl.pierrick.brihaye.aramorph.lucene.ArabicGlosser.java

License:Open Source License

/** Returns the next gloss for the given stem.
 * @param firstOne Whether or not this gloss is the first one
 * @return The gloss. Its <CODE>termText</CODE> is the gloss of the <STRONG>stem</STRONG>. Its <CODE>type</CODE> is the grammatical category of the <STRONG>stem</STRONG>.
 * When several glosses are available, every emitted token's
 * <CODE>PositionIncrement</CODE> but the first one is set to <CODE>0</CODE>
 * @see org.apache.lucene.analysis.Token#setPositionIncrement(int)
 */
private Token nextGloss(boolean firstOne) {
    String glossText;
    String glossType;
    try {
        final String pendingGloss = (String) tokenGlosses.getFirst();
        glossText = (pendingGloss == null) ? "" : pendingGloss;

        // The token is typed so that it can be filtered later.
        final String pendingPos = (String) tokenPOS.getFirst();
        glossType = (pendingPos == null) ? "NO_STEM" : pendingPos;

        // This gloss is consumed: drop it from both queues.
        tokenGlosses.removeFirst();
        tokenPOS.removeFirst();

        // More glosses pending means the current token still needs processing.
        processingToken = !tokenGlosses.isEmpty();
    } catch (IndexOutOfBoundsException e) {
        // Should not normally happen.
        // NOTE(review): if tokenGlosses/tokenPOS are java.util.LinkedList,
        // getFirst()/removeFirst() report an empty list with
        // NoSuchElementException, which this handler does not catch — confirm
        // the collection type.
        System.err.println("Something went wrong in nextGloss");
        processingToken = false;
        // Re-emit the same token text (romanized): not the best solution.
        glossText = romanizedToken;
        glossType = "PLACE_HOLDER";
    }

    final Token gloss = new Token(glossText, receivedToken.startOffset(), receivedToken.endOffset(), glossType);
    if (!firstOne) {
        // Additional glosses share the position of the first one.
        gloss.setPositionIncrement(0);
    }

    if (debug) {
        System.out.println(
                gloss.termText() + "\t" + gloss.type() + "\t" + "[" + gloss.startOffset()
                        + "-" + gloss.endOffset() + "]" + "\t" + gloss.getPositionIncrement());
    }
    return gloss;
}