Example usage for org.apache.lucene.analysis Token buffer

List of usage examples for org.apache.lucene.analysis Token buffer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis Token buffer.

Prototype

@Override
    public final char[] buffer() 

Source Link

Usage

From source file:com.mhs.qsol.proximity.ProximityVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query./*from w  ww. j a  va 2s  .  c om*/
 * 
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {
    if (logger.isLoggable(Level.FINE)) {
        // logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    if (logger.isLoggable(Level.FINE)) {
        logger.fine("Query tokenToQuery(String token) : token:" + token);
    }

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }

        if (t == null) {
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        return null;
    } else if (v.size() == 1) {
        t = v.get(0);
        SpanTermQuery stq = new SpanTermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        stq.setBoost(this.boost);
        return stq;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                SpanQuery[] spanQueries = new SpanQuery[v.size()];

                StringBuilder regex = new StringBuilder();

                for (int i = 0; i < v.size(); i++) {
                    spanQueries[i] = new SpanTermQuery(new Term(field, regex.toString()));
                }

                return new SpanOrQuery(spanQueries);
            } else {
                // All the Tokens in each sub-list are positioned at the the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);

                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                clauses[i] = new SpanTermQuery(new Term(field, new String(t2.buffer(), 0, t2.length())));
            }

            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.mhs.qsol.QsolToQueryVisitor.java

License:Apache License

/**
 * Converts a token, as defined in the qsol.jtb JavaCC file, into an
 * appropriate query./*from  www  . j a va  2 s . c  om*/
 * 
 * @param token
 * @return
 */
protected Query tokenToQuery(String token) {

    token = removeEscapeChars(token);

    TokenStream source = analyzer.tokenStream(field, new StringReader(token));
    ArrayList<Token> v = new ArrayList<Token>();
    Token t;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    CharTermAttribute charTermAtrib = source.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtrib = source.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = source.addAttribute(PositionIncrementAttribute.class);

    while (true) {
        try {
            if (!source.incrementToken()) {
                break;
            }
            t = new Token(charTermAtrib.buffer(), 0, charTermAtrib.length(), offsetAtrib.startOffset(),
                    offsetAtrib.endOffset());
            t.setPositionIncrement(posIncAtt.getPositionIncrement());
        } catch (IOException e) {
            t = null;
        }

        if (t == null) {
            break;
        }

        v.add(t);

        if (t.getPositionIncrement() != 0) {
            positionCount += t.getPositionIncrement();
        } else {
            severalTokensAtSamePosition = true;
        }
    }

    try {
        source.close();
    } catch (IOException e) {
        // ignore
    }

    if (v.size() == 0) {
        // null's will get cleaned up in visitBooleanOp
        return null;
    } else if (v.size() == 1) {

        t = v.get(0);

        TermQuery termQuery = new TermQuery(new Term(field, new String(t.buffer(), 0, t.length())));
        termQuery.setBoost(this.boost);

        return termQuery;
    } else {
        if (severalTokensAtSamePosition) {
            if (positionCount == 1) {
                // no phrase query:
                BooleanQuery q = new BooleanQuery(true);

                for (int i = 0; i < v.size(); i++) {
                    t = v.get(i);

                    TermQuery currentQuery = new TermQuery(
                            new Term(field, new String(t.buffer(), 0, t.length())));
                    currentQuery.setBoost(this.boost);

                    q.add(currentQuery, BooleanClause.Occur.SHOULD);
                }

                return q;
            } else {
                // All the Tokens in each sub-list are positioned at the the same location.
                ArrayList<ArrayList<Token>> identicallyPositionedTokenLists = new ArrayList<ArrayList<Token>>();
                for (int i = 0; i < v.size(); i++) {
                    if ((i == 0) || (v.get(i).getPositionIncrement() > 0)) {
                        identicallyPositionedTokenLists.add(new ArrayList<Token>());
                    }
                    ArrayList<Token> curList = identicallyPositionedTokenLists
                            .get(identicallyPositionedTokenLists.size() - 1);
                    curList.add(v.get(i));
                }

                ArrayList<SpanQuery> spanNearSubclauses = new ArrayList<SpanQuery>();
                for (int listNum = 0; listNum < identicallyPositionedTokenLists.size(); listNum++) {
                    ArrayList<Token> curTokens = identicallyPositionedTokenLists.get(listNum);

                    ArrayList<SpanTermQuery> curTermQueries = new ArrayList<SpanTermQuery>();
                    for (int tokenNum = 0; tokenNum < curTokens.size(); tokenNum++) {
                        SpanTermQuery termQuery = new SpanTermQuery(
                                new Term(field, curTokens.get(tokenNum).term()));
                        termQuery.setBoost(this.boost);
                        curTermQueries.add(termQuery);
                    }

                    int size = curTermQueries.size();
                    if (size <= 0)
                        continue;
                    else if (size == 1)
                        spanNearSubclauses.add(curTermQueries.get(0));
                    else
                        spanNearSubclauses.add(new SpanOrQuery(curTermQueries.toArray(new SpanQuery[0])));
                }

                SpanNearQuery query = new SpanNearQuery(
                        (SpanQuery[]) spanNearSubclauses.toArray(new SpanQuery[0]), slop, true);

                return query;
            }
        } else {
            SpanTermQuery[] clauses = new SpanTermQuery[v.size()];

            for (int i = 0; i < v.size(); i++) {
                Token t2 = v.get(i);
                SpanTermQuery spanQuery = new SpanTermQuery(
                        new Term(field, new String(t2.buffer(), 0, t2.length())));
                spanQuery.setBoost(boost);
                clauses[i] = spanQuery;
            }

            // Note: There's a bug here (not by me) that where term offsets are not respected.
            SpanNearQuery query = new SpanNearQuery(clauses, slop, true);

            return query;
        }
    }
}

From source file:com.zb.mmseg.analysis.CutLetterDigitFilter.java

License:Open Source License

/**
 * Returns the next token, draining any previously split sub-tokens first.
 * Tokens typed as mixed letter/digit runs are split into homogeneous
 * letter-only and digit-only sub-tokens, which are queued and returned one
 * at a time.
 *
 * @param reusableToken token instance reused by TokenUtils.nextToken; must
 *                      not be null
 * @return the next token, or null at end of input
 */
private Token nextToken(Token reusableToken) throws IOException {
    assert reusableToken != null;

    // Drain sub-tokens produced by a previous split before reading input.
    Token nextToken = tokenQueue.poll();
    if (nextToken != null) {
        return nextToken;
    }

    nextToken = TokenUtils.nextToken(input, reusableToken);

    if (nextToken != null && (MMSegWord.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
            || MMSegWord.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type()))) {
        final char[] buffer = nextToken.buffer();
        final int length = nextToken.length();
        if (length > 0) { // guard buffer[0] against a zero-length term
            byte lastType = (byte) Character.getType(buffer[0]);
            if (lastType <= Character.MODIFIER_LETTER) {
                // BUGFIX: previously the first character's category was not
                // collapsed like the loop below collapses it, so a token
                // starting with an uppercase/titlecase letter emitted a
                // spurious zero-length leading sub-token.
                lastType = Character.LOWERCASE_LETTER;
            }
            int termBufferOffset = 0;
            int termBufferLength = 0;
            for (int i = 0; i < length; i++) {
                byte type = (byte) Character.getType(buffer[i]);
                if (type <= Character.MODIFIER_LETTER) {
                    // Collapse all letter categories to one value so that a
                    // case change (e.g. "Ab") does not trigger a split.
                    type = Character.LOWERCASE_LETTER;
                }
                if (type != lastType) {
                    // Category changed: flush the run accumulated so far.
                    addToken(nextToken, termBufferOffset, termBufferLength, lastType);

                    termBufferOffset += termBufferLength;
                    termBufferLength = 0;

                    lastType = type;
                }

                termBufferLength++;
            }
            if (termBufferLength > 0) { // flush the final run
                addToken(nextToken, termBufferOffset, termBufferLength, lastType);
            }
        }
        nextToken = tokenQueue.poll();
    }

    return nextToken;
}

From source file:com.zb.mmseg.analysis.CutLetterDigitFilter.java

License:Open Source License

/**
 * Slices [termBufferOffset, termBufferOffset + termBufferLength) out of the
 * original token's buffer and queues it as a new token whose start/end
 * offsets are shifted into the original token's character range.
 *
 * @param type Unicode category of the run; DECIMAL_DIGIT_NUMBER marks a
 *             digit run, anything else is treated as a letter run
 */
private void addToken(Token oriToken, int termBufferOffset, int termBufferLength, byte type) {
    final int start = oriToken.startOffset() + termBufferOffset;
    final int end = start + termBufferLength;
    Token slice = new Token(oriToken.buffer(), termBufferOffset, termBufferLength, start, end);

    // Digit runs keep the digit type; every other split segment is a letter run.
    slice.setType(type == Character.DECIMAL_DIGIT_NUMBER ? MMSegWord.TYPE_DIGIT : MMSegWord.TYPE_LETTER);

    tokenQueue.offer(slice);
}

From source file:com.zb.mmseg.analysis.CutLetterDigitFilter.java

License:Open Source License

/**
 * Copies the next buffered/split token into this stream's attributes.
 *
 * @return true if a token was produced, false at end of stream
 */
public final boolean incrementToken() throws IOException {
    clearAttributes();
    final Token next = nextToken(reusableToken);
    if (next == null) {
        end();
        return false;
    }
    termAtt.copyBuffer(next.buffer(), 0, next.length());
    offsetAtt.setOffset(next.startOffset(), next.endOffset());
    typeAtt.setType(next.type());
    return true;
}

From source file:org.apache.solr.analysis.BufferedTokenStream.java

License:Apache License

/**
 * Old-API emulation for back compat: transfers every attribute carried by
 * the legacy Token into this stream's attribute source.
 *
 * @return always true, signalling that a token was written
 */
private boolean writeToken(Token token) throws IOException {
    clearAttributes();
    // Each setter targets an independent attribute; order is not significant.
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    payloadAtt.setPayload(token.getPayload());
    typeAtt.setType(token.type());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    return true;
}

From source file:org.apache.solr.analysis.SlowSynonymFilter.java

License:Apache License

@Override
public boolean incrementToken() throws IOException {
    // Merges synonym replacement tokens into the stream, interleaving the
    // original (matched) tokens when includeOrig is set and fixing up
    // position increments so both sequences land at consistent positions.
    while (true) {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.hasNext()) {
            copy(this, replacement.next());
            return true;
        }

        // common case fast-path of first token not matching anything
        AttributeSource firstTok = nextTok();
        if (firstTok == null)
            return false;
        CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
        SlowSynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length())
                : null;
        if (result == null) {
            copy(this, firstTok);
            return true;
        }

        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
            firstTok = cloneAttributes();
        // OK, we matched a token, so find the longest match.

        matched = new LinkedList<AttributeSource>();

        // match() consumes further tokens, filling `matched` as a side effect.
        result = match(result);

        if (result == null) {
            // no match, simply return the first token read.
            copy(this, firstTok);
            return true;
        }

        // reuse, or create new one each time?
        ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(
                result.synonyms.length + matched.size() + 1);

        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
        boolean includeOrig = result.includeOrig();

        AttributeSource origTok = includeOrig ? firstTok : null;
        PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
        int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream

        for (int i = 0; i < result.synonyms.length; i++) {
            Token repTok = result.synonyms[i];
            // Clone so the replacement inherits flags/payload etc. from the
            // matched input token, then overwrite term text and offsets.
            AttributeSource newTok = firstTok.cloneAttributes();
            CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
            OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);

            OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);

            // Replacement spans from the first matched token's start to the
            // last matched token's end.
            newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
            newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
            repPos += repTok.getPositionIncrement();
            if (i == 0)
                repPos = origPos; // make position of first token equal to original

            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos) {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPosInc.setPositionIncrement(origPos - pos);
                generated.add(origTok);
                pos += origPosInc.getPositionIncrement();
                origTok = matched.isEmpty() ? null : matched.removeFirst();
                if (origTok != null) {
                    origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPos += origPosInc.getPositionIncrement();
                }
            }

            newPosIncAtt.setPositionIncrement(repPos - pos);
            generated.add(newTok);
            pos += newPosIncAtt.getPositionIncrement();
        }

        // finish up any leftover original tokens
        while (origTok != null) {
            PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
            origPosInc.setPositionIncrement(origPos - pos);
            generated.add(origTok);
            pos += origPosInc.getPositionIncrement();
            origTok = matched.isEmpty() ? null : matched.removeFirst();
            if (origTok != null) {
                origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPos += origPosInc.getPositionIncrement();
            }
        }

        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 =>  foo/0
        // should I re-create the gap on the next buffered token?

        replacement = generated.iterator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}

From source file:org.apache.solr.analysis.SlowSynonymMap.java

License:Apache License

/**
 * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
 * the tokens end up at the same position.
 *
 * Example:  [a b] merged with [c d] produces [a/b c/d]  ('/' denotes tokens in the same position)
 * Example:  [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2]  (a,n means a has posInc=n)
 *
 * @param lst1 first token list, may be null
 * @param lst2 second token list, may be null
 * @return merged list; if either input is null the other is returned as-is
 *         (copied), never null
 */
public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
    ArrayList<Token> result = new ArrayList<Token>();
    if (lst1 == null || lst2 == null) {
        // Degenerate case: nothing to interleave, just copy whichever exists.
        if (lst2 != null)
            result.addAll(lst2);
        if (lst1 != null)
            result.addAll(lst1);
        return result;
    }

    // pos   - absolute position of the last token emitted into result
    // pos1/2 - absolute position of the current token from each input list
    // (absolute = running sum of position increments)
    int pos = 0;
    Iterator<Token> iter1 = lst1.iterator();
    Iterator<Token> iter2 = lst2.iterator();
    Token tok1 = iter1.hasNext() ? iter1.next() : null;
    Token tok2 = iter2.hasNext() ? iter2.next() : null;
    int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
    int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
    while (tok1 != null || tok2 != null) {
        // Emit from lst1 while its next token is at or before lst2's.
        while (tok1 != null && (pos1 <= pos2 || tok2 == null)) {
            // Copy the token; its increment becomes relative to the last
            // emitted position so absolute positions are preserved.
            Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
            tok.copyBuffer(tok1.buffer(), 0, tok1.length());
            tok.setPositionIncrement(pos1 - pos);
            result.add(tok);
            pos = pos1;
            tok1 = iter1.hasNext() ? iter1.next() : null;
            pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
        }
        // Emit from lst2 symmetrically.
        while (tok2 != null && (pos2 <= pos1 || tok1 == null)) {
            Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
            tok.copyBuffer(tok2.buffer(), 0, tok2.length());
            tok.setPositionIncrement(pos2 - pos);
            result.add(tok);
            pos = pos2;
            tok2 = iter2.hasNext() ? iter2.next() : null;
            pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
        }
    }
    return result;
}

From source file:org.apache.solr.analysis.TestSynonymMap.java

License:Apache License

/** Asserts that the synonym list mapped from {@code src} contains the term {@code exp}. */
private void assertTokIncludes(SynonymMap map, String src, String exp) throws Exception {
    boolean found = false;
    for (Token token : map.submap.get(src).synonyms) {
        found |= exp.equals(new String(token.buffer(), 0, token.length()));
    }
    assertTrue(found);
}

From source file:org.apache.solr.handler.component.SpellCheckComponent.java

License:Apache License

/**
 * Converts a SpellingResult into the NamedList response format, one entry
 * per misspelled input token.
 *
 * @param shardRequest    when true, tokens are reported even with no
 *                        suggestions so the coordinating node can merge
 * @param extendedResults when true, each suggestion carries word/freq pairs
 *                        and the original term frequency is included
 * @param correctlySpelled echoed into the response when extendedResults is set
 * @return the response fragment to place under "spellcheck"
 */
protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery,
        boolean extendedResults, boolean collate, boolean correctlySpelled) {
    NamedList result = new NamedList();
    Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
    boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
    for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
        Token inputToken = entry.getKey();
        String tokenString = new String(inputToken.buffer(), 0, inputToken.length());
        Map<String, Integer> theSuggestions = new LinkedHashMap<String, Integer>(entry.getValue());
        // Drop "suggestions" that are identical to the input term.
        Iterator<String> sugIter = theSuggestions.keySet().iterator();
        while (sugIter.hasNext()) {
            String sug = sugIter.next();
            if (sug.equals(tokenString)) {
                sugIter.remove();
            }
        }
        // Shard requests report every token (even with no suggestions) so the
        // coordinator can merge results across shards.
        // Note: the original code also null-checked theSuggestions here, but
        // it is always non-null (constructed above and already dereferenced).
        if (theSuggestions.size() > 0 || shardRequest) {
            SimpleOrderedMap suggestionList = new SimpleOrderedMap();
            suggestionList.add("numFound", theSuggestions.size());
            suggestionList.add("startOffset", inputToken.startOffset());
            suggestionList.add("endOffset", inputToken.endOffset());

            // Logical structure of normal (non-extended) results:
            // "suggestion":["alt1","alt2"]
            //
            // Logical structure of the extended results:
            // "suggestion":[
            // {"word":"alt1","freq":7},
            // {"word":"alt2","freq":4}
            // ]
            if (extendedResults && hasFreqInfo) {
                suggestionList.add("origFreq", spellingResult.getTokenFrequency(inputToken));

                ArrayList<SimpleOrderedMap> sugs = new ArrayList<SimpleOrderedMap>();
                suggestionList.add("suggestion", sugs);
                for (Map.Entry<String, Integer> suggEntry : theSuggestions.entrySet()) {
                    SimpleOrderedMap sugEntry = new SimpleOrderedMap();
                    sugEntry.add("word", suggEntry.getKey());
                    sugEntry.add("freq", suggEntry.getValue());
                    sugs.add(sugEntry);
                }
            } else {
                suggestionList.add("suggestion", theSuggestions.keySet());
            }

            result.add(tokenString, suggestionList);
        }
    }

    if (extendedResults) {
        result.add("correctlySpelled", correctlySpelled);
    }
    return result;
}