Example usage for org.apache.lucene.analysis.synonym SynonymMap WORD_SEPARATOR

List of usage examples for org.apache.lucene.analysis.synonym SynonymMap WORD_SEPARATOR

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.synonym SynonymMap WORD_SEPARATOR.

Prototype

char WORD_SEPARATOR

To view the source code for org.apache.lucene.analysis.synonym SynonymMap WORD_SEPARATOR, click the Source Link below.

Click Source Link

Document

For multiword support, you must separate words with this separator.

Usage

From source file:com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymFilter.java

License:Apache License

/**
 * Scans forward from the current position looking for the longest synonym
 * rule that matches, by feeding each token's chars (joined by
 * SynonymMap.WORD_SEPARATOR) through the FST.
 *
 * Side effects: pulls new tokens from the input into the futureInputs
 * lookahead ring buffer; on a match, records the replacement via
 * addOutput() and sets inputSkipCount to the match length; with no match
 * but buffered lookahead, sets inputSkipCount to 1 so the current token is
 * skipped before the next attempt.
 *
 * @throws IOException if the underlying token stream does
 */
private void parse() throws IOException {

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;

        // End offset of the token consumed on this iteration; assigned on
        // every path that reaches the match bookkeeping below.
        int inputEndOffset;

        if (curNextRead == nextWrite) {

            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    if (nextRead != nextWrite) {
                        // We're mid-match: snapshot this token so it can be
                        // replayed later if the match fails.
                        capture();
                    } else {
                        input.consumed = false;
                    }

                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars();
            bufferLen = futureInputs[curNextRead].term.length();
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc,
                    scratchArc, fstReader) == null) {
                // No arc for this char: no rule can match through this token.
                break byToken;
            }

            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            // Remember the longest match so far; a later, longer match
            // will overwrite these.
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }

}

From source file:com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymFilter.java

License:Apache License

/**
 * Expands a matched synonym rule: decodes the rule's output words from
 * {@code bytes}, schedules each word's tokens into the futureOutputs ring
 * buffer starting at nextRead, then flags the {@code matchInputLength}
 * matched input slots (keepOrig / matched).
 */
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    // Header vint: low bit clear means keep the original tokens; the
    // remaining bits hold the number of output words that follow.
    final int header = bytesReader.readVInt();
    final boolean keepOrig = (header & 0x1) == 0;
    final int wordCount = header >>> 1;

    for (int wordIDX = 0; wordIDX < wordCount; wordIDX++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        scratchChars.copyUTF8Bytes(scratchBytes);

        final int end = scratchChars.length();
        int tokenStart = 0;
        int slot = nextRead;

        // Split the word on WORD_SEPARATOR; pos == end flushes the
        // final token.
        for (int pos = tokenStart; pos <= end; pos++) {
            final boolean atEnd = pos == end;
            if (!atEnd && scratchChars.charAt(pos) != SynonymMap.WORD_SEPARATOR) {
                continue;
            }
            final int tokenLen = pos - tokenStart;
            // Caller is not allowed to have empty string in
            // the output:
            assert tokenLen > 0 : "output contains empty string: " + scratchChars;

            // A rule with a single output token inherits the end offset of
            // the last matched input token; multi-token outputs cannot pick
            // one, so they inherit (-1) the offset of the overlapped input.
            final boolean singleToken = atEnd && tokenStart == 0;
            final int endOffset = singleToken ? matchEndOffset : -1;
            final int posLen = singleToken ? (keepOrig ? matchInputLength : 1) : 1;

            futureOutputs[slot].add(scratchChars.chars(), tokenStart, tokenLen, endOffset, posLen);
            tokenStart = 1 + pos;
            slot = rollIncr(slot);
            assert futureOutputs[slot].posIncr == 1 : "outputUpto=" + slot + " vs nextWrite="
                    + nextWrite;
        }
    }

    // Mark the matched span of input slots so they are kept or suppressed.
    int slot = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[slot].keepOrig |= keepOrig;
        futureInputs[slot].matched = true;
        slot = rollIncr(slot);
    }
}

From source file:com.github.le11.nls.lucene.TypeAwareSynonymFilter.java

License:Apache License

/**
 * Finds the longest synonym rule matching the tokens at the current
 * position by running each token's chars through the FST, joining tokens
 * with SynonymMap.WORD_SEPARATOR arcs.
 *
 * Side effects: consumes input tokens into the futureInputs lookahead
 * ring buffer; on a match, calls addOutput() and sets inputSkipCount to
 * the match length; with no match but buffered lookahead, sets
 * inputSkipCount to 1 so the current token is skipped before retrying.
 *
 * @throws IOException if the underlying token stream does
 */
private void parse() throws IOException {

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;

        if (curNextRead == nextWrite) {

            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:

            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    input.startOffset = offsetAtt.startOffset();
                    input.endOffset = offsetAtt.endOffset();
                    if (nextRead != nextWrite) {
                        // Mid-match: snapshot this token so it can be
                        // replayed if the match fails.
                        capture();
                    } else {
                        input.consumed = false;
                    }

                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars;
            bufferLen = futureInputs[curNextRead].term.length;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc,
                    scratchArc) == null) {
                // No arc for this char: no rule can match through this token.
                break byToken;
            }

            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            // Remember the longest match so far; a later, longer match
            // overwrites these.
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }

}

From source file:com.github.le11.nls.lucene.TypeAwareSynonymFilter.java

License:Apache License

/**
 * Expands a matched synonym rule: decodes the rule's output words from
 * {@code bytes}, schedules each word's tokens into the futureOutputs ring
 * buffer starting at nextRead, then flags the {@code matchInputLength}
 * matched input slots (keepOrig / matched).
 */
private void addOutput(BytesRef bytes, int matchInputLength) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    // Header vint: low bit clear means keep the original tokens; the
    // remaining bits hold the number of output words that follow.
    final int header = bytesReader.readVInt();
    final boolean keepOrig = (header & 0x1) == 0;
    final int wordCount = header >>> 1;

    for (int wordIDX = 0; wordIDX < wordCount; wordIDX++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);

        int tokenStart = scratchChars.offset;
        final int end = tokenStart + scratchChars.length;
        int slot = nextRead;

        // Split the word on WORD_SEPARATOR; pos == end flushes the
        // final token.
        for (int pos = tokenStart; pos <= end; pos++) {
            if (pos != end && scratchChars.chars[pos] != SynonymMap.WORD_SEPARATOR) {
                continue;
            }
            final int tokenLen = pos - tokenStart;
            // Caller is not allowed to have empty string in
            // the output:
            assert tokenLen > 0 : "output contains empty string: " + scratchChars;

            futureOutputs[slot].add(scratchChars.chars, tokenStart, tokenLen);
            tokenStart = 1 + pos;
            slot = rollIncr(slot);
            assert futureOutputs[slot].posIncr == 1 : "outputUpto=" + slot + " vs nextWrite="
                    + nextWrite;
        }
    }

    // Mark the matched span of input slots so they are kept or suppressed.
    int slot = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[slot].keepOrig |= keepOrig;
        futureInputs[slot].matched = true;
        slot = rollIncr(slot);
    }
}

From source file:org.codelibs.elasticsearch.synonym.analysis.SynonymFilter.java

License:Apache License

/**
 * Looks for the longest synonym rule matching the tokens starting at the
 * current position, feeding token chars (joined by
 * SynonymMap.WORD_SEPARATOR) through the FST.
 *
 * Side effects: pulls tokens from the input into the futureInputs
 * lookahead ring buffer; on a match, records the replacement via
 * addOutput() and sets inputSkipCount to the match length; with no match
 * but buffered lookahead, sets inputSkipCount to 1 so the current token
 * is skipped before retrying.
 *
 * @throws IOException if the underlying token stream does
 */
private void parse() throws IOException {

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;

        // End offset of the token consumed on this iteration.
        int inputEndOffset = 0;

        if (curNextRead == nextWrite) {

            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:

            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    if (nextRead != nextWrite) {
                        // Mid-match: snapshot this token so it can be
                        // replayed if the match fails.
                        capture();
                    } else {
                        input.consumed = false;
                    }

                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars();
            bufferLen = futureInputs[curNextRead].term.length();
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc,
                    scratchArc, fstReader) == null) {
                // No arc for this char: no rule can match through this token.
                break byToken;
            }

            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            // Remember the longest match so far; a later, longer match
            // overwrites these.
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }

}

From source file:org.codelibs.elasticsearch.synonym.analysis.SynonymFilter.java

License:Apache License

/**
 * Expands a matched synonym rule into pending outputs: decodes the rule's
 * output words from {@code bytes} into the futureOutputs ring buffer
 * (starting at nextRead), then marks the {@code matchInputLength} matched
 * input slots with keepOrig / matched.
 */
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    // Header vint: low bit clear => keep the original tokens; remaining
    // bits are the number of output words.
    final int header = bytesReader.readVInt();
    final boolean keepOrig = (header & 0x1) == 0;
    final int numWords = header >>> 1;

    for (int w = 0; w < numWords; w++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        scratchChars.copyUTF8Bytes(scratchBytes);

        final int limit = scratchChars.length();
        int start = 0;
        int slot = nextRead;

        // Walk the word, flushing a token at each WORD_SEPARATOR and once
        // more at the end (pos == limit).
        for (int pos = start; pos <= limit; pos++) {
            final boolean last = pos == limit;
            if (!last && scratchChars.charAt(pos) != SynonymMap.WORD_SEPARATOR) {
                continue;
            }
            final int len = pos - start;
            // Caller is not allowed to have empty string in
            // the output:
            assert len > 0 : "output contains empty string: " + scratchChars;

            // Single-output-token rules take the end offset of the last
            // matched input token; multi-token outputs can't choose one,
            // so they inherit (-1) the overlapped input token's offset.
            final boolean soleToken = last && start == 0;
            final int endOffset = soleToken ? matchEndOffset : -1;
            final int posLen = soleToken ? (keepOrig ? matchInputLength : 1) : 1;

            futureOutputs[slot].add(scratchChars.chars(), start, len, endOffset, posLen);
            start = 1 + pos;
            slot = rollIncr(slot);
            assert futureOutputs[slot].posIncr == 1 : "outputUpto=" + slot + " vs nextWrite="
                    + nextWrite;
        }
    }

    // Flag the matched span of inputs so they are kept or suppressed.
    int slot = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[slot].keepOrig |= keepOrig;
        futureInputs[slot].matched = true;
        slot = rollIncr(slot);
    }
}

From source file:org.elasticsearch.synonym.DynamicSynonymFilter.java

License:Apache License

/**
 * Matches the longest synonym rule starting at the current position by
 * running token chars (joined by SynonymMap.WORD_SEPARATOR) through the
 * FST.
 *
 * Side effects: consumes input tokens into the futureInputs lookahead
 * ring buffer; on a match, calls addOutput() and sets inputSkipCount to
 * the match length; with no match but buffered lookahead, sets
 * inputSkipCount to 1 so the current token is skipped before retrying.
 *
 * @throws IOException if the underlying token stream does
 */
private void parse() throws IOException {

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;

        // End offset of the token consumed on this iteration.
        int inputEndOffset = 0;

        if (curNextRead == nextWrite) {

            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:

            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    if (nextRead != nextWrite) {
                        // Mid-match: snapshot this token so it can be
                        // replayed if the match fails.
                        capture();
                    } else {
                        input.consumed = false;
                    }

                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars;
            bufferLen = futureInputs[curNextRead].term.length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc,
                    scratchArc, fstReader) == null) {
                // No arc for this char: no rule can match through this token.
                break byToken;
            }

            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            // Remember the longest match so far; a later, longer match
            // overwrites these.
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }

}

From source file:org.elasticsearch.synonym.DynamicSynonymFilter.java

License:Apache License

/**
 * Expands a matched synonym rule: decodes the rule's output words from
 * {@code bytes} into the futureOutputs ring buffer (starting at
 * nextRead), then marks the {@code matchInputLength} matched input slots
 * with keepOrig / matched.
 */
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    // Header vint: low bit clear => keep the original tokens; remaining
    // bits are the number of output words.
    final int header = bytesReader.readVInt();
    final boolean keepOrig = (header & 0x1) == 0;
    final int numWords = header >>> 1;

    for (int w = 0; w < numWords; w++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);

        int start = scratchChars.offset;
        final int limit = start + scratchChars.length;
        int slot = nextRead;

        // Walk the word, flushing a token at each WORD_SEPARATOR and once
        // more at the end (pos == limit).
        for (int pos = start; pos <= limit; pos++) {
            final boolean last = pos == limit;
            if (!last && scratchChars.chars[pos] != SynonymMap.WORD_SEPARATOR) {
                continue;
            }
            final int len = pos - start;
            // Caller is not allowed to have empty string in
            // the output:
            assert len > 0 : "output contains empty string: " + scratchChars;

            // Single-output-token rules take the end offset of the last
            // matched input token; multi-token outputs can't choose one,
            // so they inherit (-1) the overlapped input token's offset.
            final boolean soleToken = last && start == scratchChars.offset;
            final int endOffset = soleToken ? matchEndOffset : -1;
            final int posLen = soleToken ? (keepOrig ? matchInputLength : 1) : 1;

            futureOutputs[slot].add(scratchChars.chars, start, len, endOffset, posLen);
            start = 1 + pos;
            slot = rollIncr(slot);
            assert futureOutputs[slot].posIncr == 1 : "outputUpto=" + slot + " vs nextWrite="
                    + nextWrite;
        }
    }

    // Flag the matched span of inputs so they are kept or suppressed.
    int slot = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[slot].keepOrig |= keepOrig;
        futureInputs[slot].matched = true;
        slot = rollIncr(slot);
    }
}

From source file:pl.litwiniuk.rowicki.modsynonyms.ModificatedSynonymFilter.java

License:Apache License

/**
 * Scans forward from the current input position, running token chars through
 * the synonym FST to find the LONGEST matching synonym rule.
 *
 * On return, exactly one of the following holds:
 *  - a match was found: {@code inputSkipCount} is set to the number of matched
 *    input tokens and the rule's output is buffered via {@code addOutput};
 *  - no match but lookahead tokens exist: {@code inputSkipCount} is set to 1 so
 *    the current token is skipped before the next match attempt;
 *  - the input is exhausted ({@code finished}).
 *
 * Tokens pulled from the input during lookahead are captured into the
 * {@code futureInputs} ring buffer (indexed by nextRead/nextWrite with
 * wrap-around via rollIncr) so they can be replayed later.
 *
 * NOTE(review): unlike the stock Lucene SynonymFilter, this variant aborts
 * matching as soon as it pulls a token typed CollocationFilter.TOKEN_TYPE_SYNONYM
 * (presumably tokens already produced by an upstream collocation step —
 * confirm against CollocationFilter), setting {@code hasCollocation}.
 *
 * @throws IOException if reading from the upstream token stream fails
 */
private void parse() throws IOException {
    //System.out.println("\nS: parse");

    // Must not be called while previously-matched tokens are still being skipped.
    assert inputSkipCount == 0;

    // Local cursor into the lookahead ring; does not advance the real nextRead.
    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;     // number of input tokens the longest match spans
    int matchEndOffset = -1;      // end offset of the last input token of that match

    // Accumulated FST output along the path traversed so far.
    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    // The root arc must carry no output.
    assert scratchArc.output == fst.outputs.getNoOutput();

    // How many input tokens we have consumed in this matching attempt.
    int tokenCount = 0;

    // Outer loop: one iteration per input token fed into the FST.
    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;
        //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

        int inputEndOffset = 0;

        if (curNextRead == nextWrite) {

            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:

            if (finished) {
                break;
            } else {
                //System.out.println("  input.incrToken");
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                //assert !futureInputs[nextWrite].keepOrig;
                if (input.incrementToken()) {
                    // Abort matching entirely if the upstream token was already
                    // marked as a collocation/synonym token.
                    // NOTE(review): this token's chars are NOT captured into
                    // futureInputs before the break — verify no token is lost here.
                    if (typeAtt.type().equals(CollocationFilter.TOKEN_TYPE_SYNONYM)) {
                        hasCollocation = true;
                        break;
                    }
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    // Remember offsets both on the pending slot and in the
                    // last*Offset fields (used elsewhere for offset correction).
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                    if (nextRead != nextWrite) {
                        // We are in lookahead: snapshot the token state so it
                        // can be replayed after matching completes.
                        capture();
                    } else {
                        // Current token; mark it live (not yet emitted).
                        input.consumed = false;
                    }

                } else {
                    // No more input tokens
                    //System.out.println("      set end");
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead: replay a previously captured token.
            buffer = futureInputs[curNextRead].term.chars;
            bufferLen = futureInputs[curNextRead].term.length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
            //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            // Iterate by code point so surrogate pairs are handled correctly.
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc,
                    scratchArc, fstReader) == null) {
                // No arc for this char: no rule can match through this token.
                //System.out.println("    stop");
                break byToken;
            }

            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state (i.e. a complete rule ends here). If so, record it as the
        // longest match so far — we keep scanning for an even longer one.
        if (scratchArc.isFinal()) {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
            //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                // The current token becomes lookahead; capture it now.
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }
    // NOTE(review): this re-reads typeAtt AFTER the loop; when the loop exited
    // because the stream was exhausted, typeAtt may hold the type of the last
    // token seen rather than a collocation marker — confirm this early return
    // is intended in the finished case (hasCollocation would seem the safer check).
    if (typeAtt.type().equals(CollocationFilter.TOKEN_TYPE_SYNONYM))
        return;
    if (nextRead == nextWrite && !finished) {
        // The current token was consumed into the ring; advance the write head.
        //System.out.println("  skip write slot=" + nextWrite);
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        // Longest match wins: emit its outputs and arrange to skip the
        // matched input tokens.
        //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        // Only possible when the stream ended with nothing buffered.
        assert finished;
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}