List of usage examples for org.apache.lucene.analysis.synonym SynonymMap WORD_SEPARATOR
char WORD_SEPARATOR
To view the source code that uses org.apache.lucene.analysis.synonym.SynonymMap.WORD_SEPARATOR, click the Source Link for each example below.
From source file:com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymFilter.java
License:Apache License
/**
 * Scans forward from the current position, running input tokens through the
 * synonym FST to find the longest matching rule, buffering lookahead tokens
 * as it goes. On a match, records the rule output via
 * {@link #addOutput} and sets {@code inputSkipCount} so the matched input
 * tokens are consumed. Relies on the enclosing filter's rolling buffers
 * ({@code futureInputs}, {@code nextRead}/{@code nextWrite}, {@code capture()},
 * {@code rollIncr()}) which are defined elsewhere in the class.
 *
 * @throws IOException if pulling a token from the upstream {@code input} fails
 */
private void parse() throws IOException {
    // Must only be called when we are not mid-way through skipping matched input.
    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;
        // End offset of the token being matched; assigned on every path that
        // reaches the FST loop below (the unassigned paths all break out).
        int inputEndOffset;

        if (curNextRead == nextWrite) {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    if (nextRead != nextWrite) {
                        // We already have buffered lookahead; stash this token too.
                        capture();
                    } else {
                        input.consumed = false;
                    }
                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars();
            bufferLen = futureInputs[curNextRead].term.length();
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
                    scratchArc, scratchArc, fstReader) == null) {
                // No rule can match this token sequence; stop the whole scan.
                break byToken;
            }
            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        // The current token was consumed into the lookahead buffer; advance write.
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }
}
From source file:com.bellszhu.elasticsearch.plugin.synonym.analysis.DynamicSynonymFilter.java
License:Apache License
/**
 * Decodes a matched synonym rule's serialized output and schedules each
 * output token (split on {@link SynonymMap#WORD_SEPARATOR}) into the
 * {@code futureOutputs} ring buffer, then marks the matched input slots.
 *
 * @param bytes            serialized rule output: a vInt header (low bit
 *                         clear = keepOrig, remaining bits = output count)
 *                         followed by word ordinals
 * @param matchInputLength number of input tokens the rule consumed
 * @param matchEndOffset   end offset of the last matched input token
 */
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    final int count = code >>> 1;
    for (int outputIDX = 0; outputIDX < count; outputIDX++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        scratchChars.copyUTF8Bytes(scratchBytes);
        int lastStart = 0;
        final int chEnd = lastStart + scratchChars.length();
        int outputUpto = nextRead;
        // Walk one past the end so the final (or only) token is flushed too.
        for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) {
            if (chIDX == chEnd || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) {
                final int outputLen = chIDX - lastStart;
                // Caller is not allowed to have empty string in
                // the output:
                assert outputLen > 0 : "output contains empty string: " + scratchChars;
                final int endOffset;
                final int posLen;
                if (chIDX == chEnd && lastStart == 0) {
                    // This rule had a single output token, so, we set
                    // this output's endOffset to the current
                    // endOffset (ie, endOffset of the last input
                    // token it matched):
                    endOffset = matchEndOffset;
                    posLen = keepOrig ? matchInputLength : 1;
                } else {
                    // This rule has more than one output token; we
                    // can't pick any particular endOffset for this
                    // case, so, we inherit the endOffset for the
                    // input token which this output overlaps:
                    endOffset = -1;
                    posLen = 1;
                }
                futureOutputs[outputUpto].add(scratchChars.chars(), lastStart, outputLen, endOffset, posLen);
                lastStart = 1 + chIDX;
                outputUpto = rollIncr(outputUpto);
                assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto=" + outputUpto
                        + " vs nextWrite=" + nextWrite;
            }
        }
    }

    // Mark every matched input slot so the emit phase knows whether to keep
    // the original token alongside the synonyms.
    int upto = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[upto].keepOrig |= keepOrig;
        futureInputs[upto].matched = true;
        upto = rollIncr(upto);
    }
}
From source file:com.github.le11.nls.lucene.TypeAwareSynonymFilter.java
License:Apache License
/**
 * Longest-match synonym scan over the input token stream (older Lucene API:
 * 3-argument {@code findTargetArc}, no end-offset tracking). Buffers
 * lookahead tokens in {@code futureInputs} and, on a match, hands the rule
 * output to {@link #addOutput} and sets {@code inputSkipCount}.
 *
 * @throws IOException if pulling a token from the upstream {@code input} fails
 */
private void parse() throws IOException {
    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;

        if (curNextRead == nextWrite) {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    input.startOffset = offsetAtt.startOffset();
                    input.endOffset = offsetAtt.endOffset();
                    if (nextRead != nextWrite) {
                        // Already buffering lookahead; stash this token too.
                        capture();
                    } else {
                        input.consumed = false;
                    }
                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars;
            bufferLen = futureInputs[curNextRead].term.length;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
                    scratchArc, scratchArc) == null) {
                // No rule matches this token sequence; abandon the scan.
                break byToken;
            }
            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        // Current token was absorbed into the lookahead buffer; advance write.
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }
}
From source file:com.github.le11.nls.lucene.TypeAwareSynonymFilter.java
License:Apache License
/**
 * Decodes a matched rule's serialized output (older Lucene API:
 * {@code UnicodeUtil.UTF8toUTF16} into a {@code CharsRef}) and schedules
 * each output token, split on {@link SynonymMap#WORD_SEPARATOR}, into
 * {@code futureOutputs}; then marks the matched input slots.
 *
 * @param bytes            serialized rule output: vInt header (low bit clear
 *                         = keepOrig, remaining bits = output count) then
 *                         word ordinals
 * @param matchInputLength number of input tokens the rule consumed
 */
private void addOutput(BytesRef bytes, int matchInputLength) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    final int count = code >>> 1;
    for (int outputIDX = 0; outputIDX < count; outputIDX++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
        int lastStart = scratchChars.offset;
        final int chEnd = lastStart + scratchChars.length;
        int outputUpto = nextRead;
        // Walk one past the end so the final (or only) token is flushed too.
        for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) {
            if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
                final int outputLen = chIDX - lastStart;
                // Caller is not allowed to have empty string in
                // the output:
                assert outputLen > 0 : "output contains empty string: " + scratchChars;
                futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
                lastStart = 1 + chIDX;
                outputUpto = rollIncr(outputUpto);
                assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto=" + outputUpto
                        + " vs nextWrite=" + nextWrite;
            }
        }
    }

    // Flag matched input slots so the emit phase can honor keepOrig.
    int upto = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[upto].keepOrig |= keepOrig;
        futureInputs[upto].matched = true;
        upto = rollIncr(upto);
    }
}
From source file:org.codelibs.elasticsearch.synonym.analysis.SynonymFilter.java
License:Apache License
/**
 * Longest-match synonym scan: runs lookahead input tokens through the
 * synonym FST, buffering them in {@code futureInputs}, and on a match hands
 * the rule output plus the match's end offset to {@link #addOutput} and sets
 * {@code inputSkipCount} so the matched tokens are consumed.
 *
 * @throws IOException if pulling a token from the upstream {@code input} fails
 */
private void parse() throws IOException {
    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;
        int inputEndOffset = 0;

        if (curNextRead == nextWrite) {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    if (nextRead != nextWrite) {
                        // Already buffering lookahead; stash this token too.
                        capture();
                    } else {
                        input.consumed = false;
                    }
                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars();
            bufferLen = futureInputs[curNextRead].term.length();
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
                    scratchArc, scratchArc, fstReader) == null) {
                // No rule matches this token sequence; abandon the scan.
                break byToken;
            }
            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        // Current token was absorbed into the lookahead buffer; advance write.
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }
}
From source file:org.codelibs.elasticsearch.synonym.analysis.SynonymFilter.java
License:Apache License
/**
 * Decodes a matched rule's serialized output and schedules each output token
 * (split on {@link SynonymMap#WORD_SEPARATOR}) into {@code futureOutputs}
 * with the proper end offset and position length, then marks the matched
 * input slots.
 *
 * @param bytes            serialized rule output: vInt header (low bit clear
 *                         = keepOrig, remaining bits = output count) then
 *                         word ordinals
 * @param matchInputLength number of input tokens the rule consumed
 * @param matchEndOffset   end offset of the last matched input token
 */
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    final int count = code >>> 1;
    for (int outputIDX = 0; outputIDX < count; outputIDX++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        scratchChars.copyUTF8Bytes(scratchBytes);
        int lastStart = 0;
        final int chEnd = lastStart + scratchChars.length();
        int outputUpto = nextRead;
        // Walk one past the end so the final (or only) token is flushed too.
        for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) {
            if (chIDX == chEnd || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) {
                final int outputLen = chIDX - lastStart;
                // Caller is not allowed to have empty string in
                // the output:
                assert outputLen > 0 : "output contains empty string: " + scratchChars;
                final int endOffset;
                final int posLen;
                if (chIDX == chEnd && lastStart == 0) {
                    // This rule had a single output token, so, we set
                    // this output's endOffset to the current
                    // endOffset (ie, endOffset of the last input
                    // token it matched):
                    endOffset = matchEndOffset;
                    posLen = keepOrig ? matchInputLength : 1;
                } else {
                    // This rule has more than one output token; we
                    // can't pick any particular endOffset for this
                    // case, so, we inherit the endOffset for the
                    // input token which this output overlaps:
                    endOffset = -1;
                    posLen = 1;
                }
                futureOutputs[outputUpto].add(scratchChars.chars(), lastStart, outputLen, endOffset, posLen);
                lastStart = 1 + chIDX;
                outputUpto = rollIncr(outputUpto);
                assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto=" + outputUpto
                        + " vs nextWrite=" + nextWrite;
            }
        }
    }

    // Flag matched input slots so the emit phase can honor keepOrig.
    int upto = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[upto].keepOrig |= keepOrig;
        futureInputs[upto].matched = true;
        upto = rollIncr(upto);
    }
}
From source file:org.elasticsearch.synonym.DynamicSynonymFilter.java
License:Apache License
/**
 * Longest-match synonym scan (CharsRef-field variant: lookahead terms expose
 * {@code chars}/{@code length} as fields). Buffers lookahead tokens in
 * {@code futureInputs}; on a match, passes the rule output and the match's
 * end offset to {@link #addOutput} and sets {@code inputSkipCount}.
 *
 * @throws IOException if pulling a token from the upstream {@code input} fails
 */
private void parse() throws IOException {
    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;
        int inputEndOffset = 0;

        if (curNextRead == nextWrite) {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                if (input.incrementToken()) {
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    if (nextRead != nextWrite) {
                        // Already buffering lookahead; stash this token too.
                        capture();
                    } else {
                        input.consumed = false;
                    }
                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars;
            bufferLen = futureInputs[curNextRead].term.length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
                    scratchArc, scratchArc, fstReader) == null) {
                // No rule matches this token sequence; abandon the scan.
                break byToken;
            }
            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
        // Current token was absorbed into the lookahead buffer; advance write.
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }
}
From source file:org.elasticsearch.synonym.DynamicSynonymFilter.java
License:Apache License
/**
 * Decodes a matched rule's serialized output ({@code UnicodeUtil.UTF8toUTF16}
 * into a {@code CharsRef}) and schedules each output token, split on
 * {@link SynonymMap#WORD_SEPARATOR}, into {@code futureOutputs} with the
 * proper end offset and position length; then marks the matched input slots.
 *
 * @param bytes            serialized rule output: vInt header (low bit clear
 *                         = keepOrig, remaining bits = output count) then
 *                         word ordinals
 * @param matchInputLength number of input tokens the rule consumed
 * @param matchEndOffset   end offset of the last matched input token
 */
private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    final int count = code >>> 1;
    for (int outputIDX = 0; outputIDX < count; outputIDX++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
        int lastStart = scratchChars.offset;
        final int chEnd = lastStart + scratchChars.length;
        int outputUpto = nextRead;
        // Walk one past the end so the final (or only) token is flushed too.
        for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) {
            if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
                final int outputLen = chIDX - lastStart;
                // Caller is not allowed to have empty string in
                // the output:
                assert outputLen > 0 : "output contains empty string: " + scratchChars;
                final int endOffset;
                final int posLen;
                if (chIDX == chEnd && lastStart == scratchChars.offset) {
                    // This rule had a single output token, so, we set
                    // this output's endOffset to the current
                    // endOffset (ie, endOffset of the last input
                    // token it matched):
                    endOffset = matchEndOffset;
                    posLen = keepOrig ? matchInputLength : 1;
                } else {
                    // This rule has more than one output token; we
                    // can't pick any particular endOffset for this
                    // case, so, we inherit the endOffset for the
                    // input token which this output overlaps:
                    endOffset = -1;
                    posLen = 1;
                }
                futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset, posLen);
                lastStart = 1 + chIDX;
                outputUpto = rollIncr(outputUpto);
                assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto=" + outputUpto
                        + " vs nextWrite=" + nextWrite;
            }
        }
    }

    // Flag matched input slots so the emit phase can honor keepOrig.
    int upto = nextRead;
    for (int idx = 0; idx < matchInputLength; idx++) {
        futureInputs[upto].keepOrig |= keepOrig;
        futureInputs[upto].matched = true;
        upto = rollIncr(upto);
    }
}
From source file:pl.litwiniuk.rowicki.modsynonyms.ModificatedSynonymFilter.java
License:Apache License
/**
 * Longest-match synonym scan that additionally aborts (and returns without
 * touching the pending state) when the incoming token is a collocation
 * synonym produced upstream ({@code CollocationFilter.TOKEN_TYPE_SYNONYM});
 * in that case {@code hasCollocation} is set for the caller. Otherwise
 * behaves like the standard Lucene synonym parse: buffers lookahead tokens,
 * finds the longest FST match, and schedules its output via
 * {@link #addOutput}.
 *
 * @throws IOException if pulling a token from the upstream {@code input} fails
 */
private void parse() throws IOException {
    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken: while (true) {

        // Pull next token's chars:
        final char[] buffer;
        final int bufferLen;
        int inputEndOffset = 0;

        if (curNextRead == nextWrite) {
            // We used up our lookahead buffer of input tokens
            // -- pull next real input token:
            if (finished) {
                break;
            } else {
                assert futureInputs[nextWrite].consumed;
                // Not correct: a syn match whose output is longer
                // than its input can set future inputs keepOrig
                // to true:
                if (input.incrementToken()) {
                    // A collocation synonym from the upstream filter ends the
                    // scan immediately; the flag is checked again below.
                    if (typeAtt.type().equals(CollocationFilter.TOKEN_TYPE_SYNONYM)) {
                        hasCollocation = true;
                        break;
                    }
                    buffer = termAtt.buffer();
                    bufferLen = termAtt.length();
                    final PendingInput input = futureInputs[nextWrite];
                    lastStartOffset = input.startOffset = offsetAtt.startOffset();
                    lastEndOffset = input.endOffset = offsetAtt.endOffset();
                    inputEndOffset = input.endOffset;
                    if (nextRead != nextWrite) {
                        // Already buffering lookahead; stash this token too.
                        capture();
                    } else {
                        input.consumed = false;
                    }
                } else {
                    // No more input tokens
                    finished = true;
                    break;
                }
            }
        } else {
            // Still in our lookahead
            buffer = futureInputs[curNextRead].term.chars;
            bufferLen = futureInputs[curNextRead].term.length;
            inputEndOffset = futureInputs[curNextRead].endOffset;
        }

        tokenCount++;

        // Run each char in this token through the FST:
        int bufUpto = 0;
        while (bufUpto < bufferLen) {
            final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
            if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
                    scratchArc, scratchArc, fstReader) == null) {
                // No rule matches this token sequence; abandon the scan.
                break byToken;
            }
            // Accum the output
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            bufUpto += Character.charCount(codePoint);
        }

        // OK, entire token matched; now see if this is a final
        // state:
        if (scratchArc.isFinal()) {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
            matchInputLength = tokenCount;
            matchEndOffset = inputEndOffset;
        }

        // See if the FST wants to continue matching (ie, needs to
        // see the next input token):
        if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
            // No further rules can match here; we're done
            // searching for matching rules starting at the
            // current input position.
            break;
        } else {
            // More matching is possible -- accum the output (if
            // any) of the WORD_SEP arc:
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            if (nextRead == nextWrite) {
                capture();
            }
        }

        curNextRead = rollIncr(curNextRead);
    }

    // Collocation tokens bypass the normal bookkeeping entirely.
    if (typeAtt.type().equals(CollocationFilter.TOKEN_TYPE_SYNONYM))
        return;

    if (nextRead == nextWrite && !finished) {
        // Current token was absorbed into the lookahead buffer; advance write.
        nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
        inputSkipCount = matchInputLength;
        addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
        // Even though we had no match here, we set to 1
        // because we need to skip current input token before
        // trying to match again:
        inputSkipCount = 1;
    } else {
        assert finished;
    }
}