List of usage examples for org.apache.lucene.util.fst Outputs getNoOutput
public abstract T getNoOutput();
From source file:BuildFST.java
License:Apache License
/**
 * Builds an FST from the command-line arguments and writes it to standard output via
 * {@code Util.toDot}.
 *
 * <p>Each argument is either {@code input} or {@code input/output}; the split happens at the
 * <em>last</em> {@code '/'}. If no argument carries an output that fails {@code Long.parseLong},
 * the FST is built with {@code PositiveIntOutputs}; otherwise every output is stored as a
 * {@code BytesRef} using {@code ByteSequenceOutputs}. Arguments without a {@code '/'} receive the
 * "no output" value of the chosen {@code Outputs} implementation.
 *
 * @param args the entries to add to the FST
 * @throws IOException declared by the FST building / printing calls
 * @throws RuntimeException if numeric outputs are in use and one of them is negative
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {
    // Pass 1: find out whether the outputs can be treated as longs, and whether any is negative.
    boolean allNumeric = true;
    boolean sawNegative = false;
    for (String arg : args) {
        int slash = arg.lastIndexOf('/');
        if (slash == -1) {
            continue;
        }
        try {
            long value = Long.parseLong(arg.substring(slash + 1));
            sawNegative |= value < 0;
        } catch (NumberFormatException nfe) {
            // A single non-numeric output switches the whole FST to byte-sequence outputs.
            allNumeric = false;
            break;
        }
    }

    if (allNumeric && sawNegative) {
        throw new RuntimeException("can only handle numeric outputs >= 0");
    }

    Outputs outputs;
    if (!allNumeric) {
        outputs = ByteSequenceOutputs.getSingleton();
    } else {
        outputs = PositiveIntOutputs.getSingleton();
    }

    // Pass 2: turn every argument into an (input, output) pair.
    Pair<?>[] entries = new Pair[args.length];
    for (int k = 0; k < args.length; k++) {
        String arg = args[k];
        int slash = arg.lastIndexOf('/');
        String key;
        Object value;
        if (slash != -1) {
            key = arg.substring(0, slash);
            String rawValue = arg.substring(slash + 1);
            if (allNumeric) {
                value = Long.valueOf(rawValue);
            } else {
                value = new BytesRef(rawValue);
            }
        } else {
            key = arg;
            value = outputs.getNoOutput();
        }
        entries[k] = new Pair(new BytesRef(key), value);
    }

    // The pairs are sorted before being handed to the builder.
    Arrays.sort(entries);

    FST<?> fst;
    if (allNumeric) {
        Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair entry : entries) {
            builder.add(Util.toIntsRef(entry.input, new IntsRefBuilder()), (Long) entry.output);
        }
        fst = builder.finish();
    } else {
        Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair entry : entries) {
            builder.add(Util.toIntsRef(entry.input, new IntsRefBuilder()), (BytesRef) entry.output);
        }
        fst = builder.finish();
    }

    Util.toDot(fst, new PrintWriter(System.out), true, true);
}
From source file:elhuyar.bilakit.Stemmer.java
License:Apache License
/** * Generates a list of stems for the provided word * * @param word Word to generate the stems for * @param previous previous affix that was removed (so we dont remove same one twice) * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step * @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word * @param recursionDepth current recursiondepth * @param doPrefix true if we should remove prefixes * @param doSuffix true if we should remove suffixes * @param previousWasPrefix true if the previous removal was a prefix: * if we are removing a suffix, and it has no continuation requirements, its ok. * but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. * @param circumfix true if the previous prefix removal was signed as a circumfix * this means inner most suffix must also contain circumfix flag. * @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. * @return List of stems, or empty list if no stems are found */// w w w . jav a 2s. c om private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException { // TODO: allow this stuff to be reused by tokenfilter List<CharsRef> stems = new ArrayList<>(); if (doPrefix && dictionary.prefixes != null) { FST<IntsRef> fst = dictionary.prefixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = prefixReaders[recursionDepth]; FST.Arc<IntsRef> arc = prefixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; int limit = dictionary.fullStrip ? 
length : length - 1; for (int i = 0; i < limit; i++) { if (i > 0) { int ch = word[i - 1]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef prefixes = null; if (!arc.isFinal()) { continue; } else { prefixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < prefixes.length; j++) { int prefix = prefixes.ints[prefixes.offset + j]; if (prefix == previous) { continue; } affixReader.setPosition(8 * prefix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { if (dictionary.onlyincompound == -1) { compatible = true; } else { // check if affix is allowed in a non-compound word dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); } } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false); } else { compatible = false; } if (compatible) { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength); System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant); stems.addAll(stemList); } } } } if (doSuffix && dictionary.suffixes != null) { FST<IntsRef> fst = dictionary.suffixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = suffixReaders[recursionDepth]; FST.Arc<IntsRef> arc = suffixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; int limit = dictionary.fullStrip ? 
0 : 1; for (int i = length; i >= limit; i--) { if (i < length) { int ch = word[i]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef suffixes = null; if (!arc.isFinal()) { continue; } else { suffixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < suffixes.length; j++) { int suffix = suffixes.ints[suffixes.offset + j]; if (suffix == previous) { continue; } affixReader.setPosition(8 * suffix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { if (dictionary.onlyincompound == -1) { compatible = true; } else { // check if affix is allowed in a non-compound word dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); } } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; boolean allowed = dictionary.onlyincompound == -1 || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound); compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix); } else { compatible = false; } if (compatible) { int appendLength = length - i; int deAffixedLength = length - appendLength; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(word, 0, strippedWord, 0, deAffixedLength); System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant); stems.addAll(stemList); } } } } return stems; }
From source file:stemmer.Stemmer.java
License:Apache License
/** * Generates a list of stems for the provided word * * @param word Word to generate the stems for * @param previous previous affix that was removed (so we dont remove same one twice) * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step * @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word * @param recursionDepth current recursiondepth * @param doPrefix true if we should remove prefixes * @param doSuffix true if we should remove suffixes * @param previousWasPrefix true if the previous removal was a prefix: * if we are removing a suffix, and it has no continuation requirements, its ok. * but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. * @param circumfix true if the previous prefix removal was signed as a circumfix * this means inner most suffix must also contain circumfix flag. * @return List of stems, or empty list if no stems are found *//* w w w.j av a 2 s. 
c o m*/ private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) throws IOException { // TODO: allow this stuff to be reused by tokenfilter List<CharsRef> stems = new ArrayList<>(); if (doPrefix && dictionary.prefixes != null) { FST<IntsRef> fst = dictionary.prefixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = prefixReaders[recursionDepth]; FST.Arc<IntsRef> arc = prefixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; for (int i = 0; i < length; i++) { if (i > 0) { int ch = word[i - 1]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef prefixes = null; if (!arc.isFinal()) { continue; } else { prefixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < prefixes.length; j++) { int prefix = prefixes.ints[prefixes.offset + j]; if (prefix == previous) { continue; } affixReader.setPosition(8 * prefix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { compatible = true; } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. 
dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, false); } else { compatible = false; } if (compatible) { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength); System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix); stems.addAll(stemList); } } } } if (doSuffix && dictionary.suffixes != null) { FST<IntsRef> fst = dictionary.suffixes; Outputs<IntsRef> outputs = fst.outputs; FST.BytesReader bytesReader = suffixReaders[recursionDepth]; FST.Arc<IntsRef> arc = suffixArcs[recursionDepth]; fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; for (int i = length; i >= 0; i--) { if (i < length) { int ch = word[i]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { break; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } IntsRef suffixes = null; if (!arc.isFinal()) { continue; } else { suffixes = fst.outputs.add(output, arc.nextFinalOutput); } for (int j = 0; j < suffixes.length; j++) { int suffix = suffixes.ints[suffixes.offset + j]; if (suffix == previous) { continue; } affixReader.setPosition(8 * suffix); char flag = (char) (affixReader.readShort() & 0xffff); char stripOrd = (char) (affixReader.readShort() & 0xffff); int condition = (char) (affixReader.readShort() & 0xffff); 
boolean crossProduct = (condition & 1) == 1; condition >>>= 1; char append = (char) (affixReader.readShort() & 0xffff); final boolean compatible; if (recursionDepth == 0) { compatible = true; } else if (crossProduct) { // cross check incoming continuation class (flag of previous affix) against list. dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); assert prevFlag >= 0; compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix); } else { compatible = false; } if (compatible) { int appendLength = length - i; int deAffixedLength = length - appendLength; int stripStart = dictionary.stripOffsets[stripOrd]; int stripEnd = dictionary.stripOffsets[stripOrd + 1]; int stripLength = stripEnd - stripStart; if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) { continue; } char strippedWord[] = new char[stripLength + deAffixedLength]; System.arraycopy(word, 0, strippedWord, 0, deAffixedLength); System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength); List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix); stems.addAll(stemList); } } } } return stems; }