Example usage for org.apache.lucene.analysis Token setPositionLength

List of usage examples for org.apache.lucene.analysis Token setPositionLength

Introduction

In this page you can find the example usage for org.apache.lucene.analysis Token setPositionLength.

Prototype

@Override
public void setPositionLength(int positionLength) 

Source Link

Usage

From source file: com.github.cstoku.neologd.unidic.lucene.analysis.ja.JapaneseTokenizer.java

License: Apache License

/**
 * Walks the best Viterbi path backwards from {@code endPosData} (arriving arc
 * {@code fromIDX}) down to {@code lastBackTracePos}, appending the resulting
 * tokens to {@code pending}.  Tokens are appended in reverse surface order
 * because they are later served up from {@code incrementToken} in reverse.
 *
 * <p>When {@code outputCompounds && searchMode}, an over-long token on the
 * best path may trigger a pruned second-best backtrace so that both the long
 * (compound) token and its decompounded pieces are emitted; the compound gets
 * its {@code positionLength} set to the number of pieces it spans.
 *
 * @param endPosData the lattice position at which the traced path ends
 * @param fromIDX    index of the arc arriving at {@code endPosData} to trace from
 * @throws IOException if a dictionary or connection-cost lookup fails
 */
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
    final int endPos = endPosData.pos;

    if (VERBOSE) {
        // NOTE(review): "pos" here resolves to the enclosing tokenizer's field;
        // the local variable of the same name is only declared further below.
        System.out.println("\n  backtrace: endPos=" + endPos + " pos=" + pos + "; " + (pos - lastBackTracePos)
                + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
    }

    // Surface characters covered by this backtrace window; token text offsets
    // below are expressed relative to lastBackTracePos.
    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null) {
        dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    int pos = endPos;
    int bestIDX = fromIDX;
    // Compound token held back until the alternate (2nd-best) path re-joins
    // the main path, at which point it is emitted (or discarded):
    Token altToken = null;

    // We trace backwards, so this will be the leftWordID of
    // the token after the one we are now on:
    int lastLeftWordID = -1;

    // Count of tokens emitted since the alternate path started; becomes the
    // compound token's positionLength:
    int backCount = 0;

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token.  So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    while (pos > lastBackTracePos) {
        //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
        final Position posData = positions.get(pos);
        assert bestIDX < posData.count;

        // The arc currently being followed spans [backPos, pos) in the text:
        int backPos = posData.backPos[bestIDX];
        assert backPos >= lastBackTracePos : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
        int length = pos - backPos;
        Type backType = posData.backType[bestIDX];
        int backID = posData.backID[bestIDX];
        int nextBestIDX = posData.backIndex[bestIDX];

        if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {

            // In searchMode, if best path had picked a too-long
            // token, we use the "penalty" to compute the allowed
            // max cost of an alternate back-trace.  If we find an
            // alternate back trace with cost below that
            // threshold, we pursue it instead (but also output
            // the long token).
            //System.out.println("    2nd best backPos=" + backPos + " pos=" + pos);

            final int penalty = computeSecondBestThreshold(backPos, pos - backPos);

            if (penalty > 0) {
                if (VERBOSE) {
                    System.out.println("  compound=" + new String(buffer.get(backPos, pos - backPos))
                            + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost="
                            + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
                }

                // Use the penalty to set maxCost on the 2nd best
                // segmentation:
                int maxCost = posData.costs[bestIDX] + penalty;
                if (lastLeftWordID != -1) {
                    // Include the connection cost to the token already emitted
                    // to the right of this one:
                    maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
                }

                // Now, prune all too-long tokens from the graph:
                pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]);

                // Finally, find 2nd best back-trace and resume
                // backtrace there:
                int leastCost = Integer.MAX_VALUE;
                int leastIDX = -1;
                for (int idx = 0; idx < posData.count; idx++) {
                    int cost = posData.costs[idx];
                    //System.out.println("    idx=" + idx + " prevCost=" + cost);

                    if (lastLeftWordID != -1) {
                        cost += costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                                lastLeftWordID);
                        //System.out.println("      += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                        //lastLeftWordID) + " -> " + cost);
                    }
                    //System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
                    //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
                    if (cost < leastCost) {
                        //System.out.println("      ** ");
                        leastCost = cost;
                        leastIDX = idx;
                    }
                }
                //System.out.println("  leastIDX=" + leastIDX);

                if (VERBOSE) {
                    System.out.println("  afterPrune: " + posData.count + " arcs arriving; leastCost="
                            + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
                }

                if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
                    // We should have pruned the altToken from the graph:
                    assert posData.backPos[leastIDX] != backPos;

                    // Save the current compound token, to output when
                    // this alternate path joins back:
                    altToken = new Token(backID, fragment, backPos - lastBackTracePos, length, backType,
                            backPos, getDict(backType));

                    // Redirect our backtrace to 2nd best:
                    bestIDX = leastIDX;
                    nextBestIDX = posData.backIndex[bestIDX];

                    backPos = posData.backPos[bestIDX];
                    length = pos - backPos;
                    backType = posData.backType[bestIDX];
                    backID = posData.backID[bestIDX];
                    backCount = 0;
                    //System.out.println("  do alt token!");

                } else {
                    // I think in theory it's possible there is no
                    // 2nd best path, which is fine; in this case we
                    // only output the compound token:
                    //System.out.println("  no alt token! bestIDX=" + bestIDX);
                }
            }
        }

        // Offset of this token's first character inside fragment:
        final int offset = backPos - lastBackTracePos;
        assert offset >= 0;

        if (altToken != null && altToken.getPosition() >= backPos) {

            // We've backtraced to the position where the
            // compound token starts; add it now:

            // The pruning we did when we created the altToken
            // ensures that the back trace will align back with
            // the start of the altToken:
            assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;

            // NOTE: not quite right: the compound token may
            // have had all punctuation back traced so far, but
            // then the decompounded token at this position is
            // not punctuation.  In this case backCount is 0,
            // but we should maybe add the altToken anyway...?

            if (backCount > 0) {
                backCount++;
                altToken.setPositionLength(backCount);
                if (VERBOSE) {
                    System.out.println("    add altToken=" + altToken);
                }
                pending.add(altToken);
            } else {
                // This means alt token was all punct tokens:
                if (VERBOSE) {
                    System.out.println("    discard all-punctuation altToken=" + altToken);
                }
                assert discardPunctuation;
            }
            altToken = null;
        }

        final com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.Dictionary dict = getDict(backType);

        if (backType == Type.USER) {

            // Expand the phraseID we recorded into the actual
            // segmentation:
            final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
            int wordID = wordIDAndLength[0];
            int current = 0;
            for (int j = 1; j < wordIDAndLength.length; j++) {
                final int len = wordIDAndLength[j];
                //System.out.println("    add user: len=" + len);
                pending.add(new Token(wordID + j - 1, fragment, current + offset, len, Type.USER,
                        current + backPos, dict));
                if (VERBOSE) {
                    System.out.println("    add USER token=" + pending.get(pending.size() - 1));
                }
                current += len;
            }

            // Reverse the tokens we just added, because when we
            // serve them up from incrementToken we serve in
            // reverse:
            Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size()));

            backCount += wordIDAndLength.length - 1;
        } else {

            if (extendedMode && backType == Type.UNKNOWN) {
                // In EXTENDED mode we convert unknown word into
                // unigrams:
                int unigramTokenCount = 0;
                for (int i = length - 1; i >= 0; i--) {
                    int charLen = 1;
                    if (i > 0 && Character.isLowSurrogate(fragment[offset + i])) {
                        // Keep a surrogate pair together as one 2-char unigram:
                        i--;
                        charLen = 2;
                    }
                    //System.out.println("    extended tok offset="
                    //+ (offset + i));
                    if (!discardPunctuation || !isPunctuation(fragment[offset + i])) {
                        pending.add(new Token(CharacterDefinition.NGRAM, fragment, offset + i, charLen,
                                Type.UNKNOWN, backPos + i, unkDictionary));
                        unigramTokenCount++;
                    }
                }
                backCount += unigramTokenCount;

            } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
                pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict));
                if (VERBOSE) {
                    System.out.println("    add token=" + pending.get(pending.size() - 1));
                }
                backCount++;
            } else {
                if (VERBOSE) {
                    System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
                }
            }
        }

        // Step left: the start of the token just processed becomes the new
        // trace position, and its left id feeds the next connection cost.
        lastLeftWordID = dict.getLeftId(backID);
        pos = backPos;
        bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
        System.out.println("  freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
}

From source file: org.elasticsearch.analysis.common.FlattenGraphTokenFilterFactoryTests.java

License: Apache License

/**
 * Test helper: builds a {@link Token} covering {@code [startOffset, endOffset]}
 * with the given position increment and position length applied.
 */
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
    Token result = new Token(term, startOffset, endOffset);
    result.setPositionIncrement(posInc);
    result.setPositionLength(posLength);
    return result;
}