List of usage examples for the getFirstArc method of org.apache.lucene.util.fst.FST
public Arc<T> getFirstArc(Arc<T> arc)
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) { if (fst == null) { return null; }/*from w w w . j a v a 2 s . c o m*/ final FST.BytesReader bytesReader = fst.getBytesReader(); final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>()); // Accumulate output as we go final IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); IntsRef output = NO_OUTPUT; int l = offset + length; try { for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { cp = Character.codePointAt(word, i, l); if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } } catch (IOException bogus) { throw new RuntimeException(bogus); } }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException { final FST.BytesReader bytesReader = fst.getBytesReader(); final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>()); final CharsRef NO_OUTPUT = fst.outputs.getNoOutput(); // temporary stuff final FST.Arc<CharsRef> arc = new FST.Arc<>(); int longestMatch; CharsRef longestOutput;/*from ww w.j av a 2s .c o m*/ for (int i = 0; i < sb.length(); i++) { arc.copyFrom(firstArc); CharsRef output = NO_OUTPUT; longestMatch = -1; longestOutput = null; for (int j = i; j < sb.length(); j++) { char ch = sb.charAt(j); if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else { output = fst.outputs.add(output, arc.output); } if (arc.isFinal()) { longestOutput = fst.outputs.add(output, arc.nextFinalOutput); longestMatch = j; } } if (longestMatch >= 0) { sb.delete(i, longestMatch + 1); sb.insert(i, longestOutput); i += (longestOutput.length - 1); } } }
From source file:elhuyar.bilakit.Stemmer.java
License:Apache License
/** * Generates a list of stems for the provided word * * @param word Word to generate the stems for * @param previous previous affix that was removed (so we dont remove same one twice) * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step * @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word * @param recursionDepth current recursiondepth * @param doPrefix true if we should remove prefixes * @param doSuffix true if we should remove suffixes * @param previousWasPrefix true if the previous removal was a prefix: * if we are removing a suffix, and it has no continuation requirements, its ok. * but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. * @param circumfix true if the previous prefix removal was signed as a circumfix * this means inner most suffix must also contain circumfix flag. * @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. * @return List of stems, or empty list if no stems are found *///from ww w .ja v a2 s.c o m private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException { // TODO: allow this stuff to be reused by tokenfilter List<CharsRef> stems = new ArrayList<>(); if (doPrefix && dictionary.prefixes != null) { FST<IntsRef> fst = dictionary.prefixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = prefixReaders[recursionDepth]; FST.Arc<IntsRef> arc = prefixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; int limit = dictionary.fullStrip ? 
length : length - 1; for (int i = 0; i < limit; i++) { if (i > 0) { int ch = word[i - 1]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef prefixes = null; if (!arc.isFinal()) { continue; } else { prefixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < prefixes.length; j++) { int prefix = prefixes.ints[prefixes.offset + j]; if (prefix == previous) { continue; } affixReader.setPosition(8 * prefix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { if (dictionary.onlyincompound == -1) { compatible = true; } else { // check if affix is allowed in a non-compound word dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); } } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false); } else { compatible = false; } if (compatible) { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength); System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant); stems.addAll(stemList); } } } } if (doSuffix && dictionary.suffixes != null) { FST<IntsRef> fst = dictionary.suffixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = suffixReaders[recursionDepth]; FST.Arc<IntsRef> arc = suffixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; int limit = dictionary.fullStrip ? 
0 : 1; for (int i = length; i >= limit; i--) { if (i < length) { int ch = word[i]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef suffixes = null; if (!arc.isFinal()) { continue; } else { suffixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < suffixes.length; j++) { int suffix = suffixes.ints[suffixes.offset + j]; if (suffix == previous) { continue; } affixReader.setPosition(8 * suffix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { if (dictionary.onlyincompound == -1) { compatible = true; } else { // check if affix is allowed in a non-compound word dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); } } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix); } else { compatible = false; } if (compatible) { int appendLength = length - i; int deAffixedLength = length - appendLength; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(word, 0, strippedWord, 0, deAffixedLength); System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant); stems.addAll(stemList); } } } } return stems; }
From source file:stemmer.Dictionary.java
License:Apache License
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) { if (fst == null) { return null; }/* w w w . j a va2s . c o m*/ final FST.BytesReader bytesReader = fst.getBytesReader(); final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>()); // Accumulate output as we go final IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); IntsRef output = NO_OUTPUT; int l = offset + length; try { for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { cp = Character.codePointAt(word, i, l); if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } } catch (IOException bogus) { throw new RuntimeException(bogus); } }
From source file:stemmer.Dictionary.java
License:Apache License
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException { final FST.BytesReader bytesReader = fst.getBytesReader(); final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>()); final CharsRef NO_OUTPUT = fst.outputs.getNoOutput(); // temporary stuff final FST.Arc<CharsRef> arc = new FST.Arc<>(); int longestMatch; CharsRef longestOutput;/*from w ww.j ava 2 s . co m*/ for (int i = 0; i < sb.length(); i++) { arc.copyFrom(firstArc); CharsRef output = NO_OUTPUT; longestMatch = -1; longestOutput = null; for (int j = i; j < sb.length(); j++) { char ch = sb.charAt(j); if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else { output = fst.outputs.add(output, arc.output); } if (arc.isFinal()) { longestOutput = fst.outputs.add(output, arc.nextFinalOutput); longestMatch = j; } } if (longestMatch >= 0) { sb.delete(i, longestMatch + 1); sb.insert(i, longestOutput); i += (longestOutput.length - 1); } } }
From source file:stemmer.Stemmer.java
License:Apache License
/** * Generates a list of stems for the provided word * * @param word Word to generate the stems for * @param previous previous affix that was removed (so we dont remove same one twice) * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step * @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word * @param recursionDepth current recursiondepth * @param doPrefix true if we should remove prefixes * @param doSuffix true if we should remove suffixes * @param previousWasPrefix true if the previous removal was a prefix: * if we are removing a suffix, and it has no continuation requirements, its ok. * but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. * @param circumfix true if the previous prefix removal was signed as a circumfix * this means inner most suffix must also contain circumfix flag. * @return List of stems, or empty list if no stems are found *///from w w w . j av a 2 s . 
co m private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) throws IOException { // TODO: allow this stuff to be reused by tokenfilter List<CharsRef> stems = new ArrayList<>(); if (doPrefix && dictionary.prefixes != null) { FST<IntsRef> fst = dictionary.prefixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = prefixReaders[recursionDepth]; FST.Arc<IntsRef> arc = prefixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; for (int i = 0; i < length; i++) { if (i > 0) { int ch = word[i - 1]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef prefixes = null; if (!arc.isFinal()) { continue; } else { prefixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < prefixes.length; j++) { int prefix = prefixes.ints[prefixes.offset + j]; if (prefix == previous) { continue; } affixReader.setPosition(8 * prefix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { compatible = true; } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, false); } else { compatible = false; } if (compatible) { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength); System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix); stems.addAll(stemList); } } } } if (doSuffix && dictionary.suffixes != null) { FST<IntsRef> fst = dictionary.suffixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = suffixReaders[recursionDepth]; FST.Arc<IntsRef> arc = suffixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; for (int i = length; i >= 0; i--) { if (i < length) { int ch = word[i]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef suffixes = null; if (!arc.isFinal()) { continue; } else { suffixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < suffixes.length; j++) { int suffix = suffixes.ints[suffixes.offset + j]; if (suffix == previous) { continue; } affixReader.setPosition(8 * suffix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); 
boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { compatible = true; } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix); } else { compatible = false; } if (compatible) { int appendLength = length - i; int deAffixedLength = length - appendLength; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(word, 0, strippedWord, 0, deAffixedLength); System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix); stems.addAll(stemList); } } } } return stems; }