Example usage for org.apache.lucene.util.fst Outputs getNoOutput

List of usage examples for org.apache.lucene.util.fst Outputs getNoOutput

Introduction

On this page you can find example usages of org.apache.lucene.util.fst.Outputs.getNoOutput().

Prototype

public abstract T getNoOutput();

Source Link

Document

NOTE: this output is compared with ==, so you must ensure that all methods return this single object whenever the result really is "no output".

Usage

From source file:BuildFST.java

License:Apache License

/**
 * Builds an FST from the command-line arguments and writes it to standard output via
 * {@code Util.toDot}.
 *
 * <p>Each argument is either a plain input string, or {@code input/output} where the text after
 * the last {@code '/'} is the output. If every supplied output parses as a long, numeric outputs
 * are used (negative values are rejected); otherwise all outputs are treated as byte sequences.
 * Arguments without a {@code '/'} are given the "no output" value of the chosen outputs
 * implementation.
 *
 * @param args the inputs, each optionally followed by {@code /output}
 * @throws IOException declared by the FST building and printing calls used here
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {

    // Pass 1: find out whether every explicit output is a (non-negative) number.
    boolean allNumeric = true;
    boolean sawNegative = false;
    for (String arg : args) {
        int slash = arg.lastIndexOf('/');
        if (slash == -1) {
            continue;
        }
        try {
            if (Long.parseLong(arg.substring(slash + 1)) < 0) {
                sawNegative = true;
            }
        } catch (NumberFormatException nfe) {
            // A single non-numeric output switches the whole run to byte-sequence outputs.
            allNumeric = false;
            break;
        }
    }

    if (allNumeric && sawNegative) {
        throw new RuntimeException("can only handle numeric outputs >= 0");
    }

    Outputs outputs;
    if (allNumeric) {
        outputs = PositiveIntOutputs.getSingleton();
    } else {
        outputs = ByteSequenceOutputs.getSingleton();
    }

    // Pass 2: split every argument into its input and its output value.
    Pair<?>[] inputs = new Pair[args.length];
    for (int i = 0; i < args.length; i++) {
        String arg = args[i];
        int slash = arg.lastIndexOf('/');

        String input;
        Object output;
        if (slash != -1) {
            input = arg.substring(0, slash);
            String outputString = arg.substring(slash + 1);
            if (allNumeric) {
                output = Long.parseLong(outputString);
            } else {
                output = new BytesRef(outputString);
            }
        } else {
            input = arg;
            output = outputs.getNoOutput();
        }
        inputs[i] = new Pair(new BytesRef(input), output);
    }

    // Sort the pairs before feeding them to the builder.
    Arrays.sort(inputs);

    FST<?> fst;
    if (allNumeric) {
        Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair entry : inputs) {
            IntsRefBuilder ints = new IntsRefBuilder();
            Util.toIntsRef(entry.input, ints);
            builder.add(ints.get(), (Long) entry.output);
        }
        fst = builder.finish();
    } else {
        Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair entry : inputs) {
            IntsRefBuilder ints = new IntsRefBuilder();
            Util.toIntsRef(entry.input, ints);
            builder.add(ints.get(), (BytesRef) entry.output);
        }
        fst = builder.finish();
    }

    PrintWriter dotWriter = new PrintWriter(System.out);
    Util.toDot(fst, dotWriter, true, true);
}

From source file:elhuyar.bilakit.Stemmer.java

License:Apache License

/**
 * Generates a list of stems for the provided word by removing prefixes and/or suffixes
 * found in the dictionary's affix FSTs and delegating each candidate to {@code applyAffix}.
 *
 * @param word Word to generate the stems for
 * @param length number of characters of {@code word} to consider
 * @param previous previous affix that was removed (so we don't remove the same one twice)
 * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
 * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it is also checked against the word
 * @param recursionDepth current recursion depth; also selects which pre-allocated FST reader/arc is used
 * @param doPrefix true if we should remove prefixes
 * @param doSuffix true if we should remove suffixes
 * @param previousWasPrefix true if the previous removal was a prefix:
 *        if we are removing a suffix, and it has no continuation requirements, it's ok.
 *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
 * @param circumfix true if the previous prefix removal was signed as a circumfix
 *        this means the innermost suffix must also contain the circumfix flag.
 * @param caseVariant true if we are searching for a case variant. If the word has the KEEPCASE flag it cannot succeed.
 * @return List of stems, or empty list if no stems are found
 * @throws IOException presumably propagated from the FST / affix-data lookups performed here (confirm against callees)
 */
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag,
        int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix,
        boolean caseVariant) throws IOException {

    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new ArrayList<>();

    // ---- Prefix removal: walk the prefix FST over the leading characters of the word. ----
    if (doPrefix && dictionary.prefixes != null) {
        FST<IntsRef> fst = dictionary.prefixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Reader and arc are taken from per-recursion-depth arrays rather than allocated here.
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
        fst.getFirstArc(arc);
        // The "no output" value is compared by identity (!=) below.
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // Without fullStrip the loop ends one position earlier, so fewer leading chars can be consumed.
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++) {
            // i is the number of leading characters consumed so far; i == 0 tests the empty prefix.
            if (i > 0) {
                int ch = word[i - 1];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    // No prefix in the FST continues with this character.
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    // Accumulate the outputs along the path.
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef prefixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                // Final arc: the accumulated output plus the final output lists the candidate affix ids.
                prefixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < prefixes.length; j++) {
                int prefix = prefixes.ints[prefixes.offset + j];
                if (prefix == previous) {
                    continue;
                }
                // Affix records are addressed at 8 * id; four 16-bit values are read in sequence.
                affixReader.setPosition(8 * prefix);
                // NOTE: 'flag' is not used further in this method, but the read advances the reader.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                // Lowest bit of the third value is the cross-product flag; the rest is the condition.
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char appendFlags[] = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1
                            || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
                } else {
                    // Nested removal without the cross-product bit is never accepted.
                    compatible = false;
                }

                if (compatible) {
                    // The remainder of the word after the removed prefix.
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    // Locate the strip characters for this affix in the shared strip data.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word,
                            deAffixedStart, deAffixedLength)) {
                        continue;
                    }

                    // Candidate = strip characters followed by the de-affixed remainder.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1,
                            recursionDepth, true, circumfix, caseVariant);

                    stems.addAll(stemList);
                }
            }
        }
    }

    // ---- Suffix removal: walk the suffix FST over the word's characters from the end backwards. ----
    if (doSuffix && dictionary.suffixes != null) {
        FST<IntsRef> fst = dictionary.suffixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Reader and arc are taken from per-recursion-depth arrays rather than allocated here.
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
        fst.getFirstArc(arc);
        // The "no output" value is compared by identity (!=) below.
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // With fullStrip, i may reach 0 (the suffix covers the whole word); otherwise one char is always left.
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--) {
            // i is the index where the candidate suffix starts; i == length tests the empty suffix.
            if (i < length) {
                int ch = word[i];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    // No suffix in the FST continues with this character.
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    // Accumulate the outputs along the path.
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef suffixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                // Final arc: the accumulated output plus the final output lists the candidate affix ids.
                suffixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < suffixes.length; j++) {
                int suffix = suffixes.ints[suffixes.offset + j];
                if (suffix == previous) {
                    continue;
                }
                // Affix records are addressed at 8 * id; four 16-bit values are read in sequence.
                affixReader.setPosition(8 * suffix);
                // NOTE: 'flag' is not used further in this method, but the read advances the reader.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                // Lowest bit of the third value is the cross-product flag; the rest is the condition.
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char appendFlags[] = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1
                            || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    // Unlike the prefix branch, the previousWasPrefix argument is forwarded here.
                    compatible = allowed
                            && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
                } else {
                    // Nested removal without the cross-product bit is never accepted.
                    compatible = false;
                }

                if (compatible) {
                    // The part of the word before the removed suffix (deAffixedLength equals i).
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    // Locate the strip characters for this affix in the shared strip data.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart,
                            stripLength)) {
                        continue;
                    }

                    // Candidate = de-affixed head of the word followed by the strip characters.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength,
                            stripLength);

                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag,
                            recursionDepth, false, circumfix, caseVariant);

                    stems.addAll(stemList);
                }
            }
        }
    }

    return stems;
}

From source file:stemmer.Stemmer.java

License:Apache License

/**
 * Generates a list of stems for the provided word by removing prefixes and/or suffixes
 * found in the dictionary's affix FSTs and delegating each candidate to {@code applyAffix}.
 *
 * @param word Word to generate the stems for
 * @param length number of characters of {@code word} to consider
 * @param previous previous affix that was removed (so we don't remove the same one twice)
 * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
 * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it is also checked against the word
 * @param recursionDepth current recursion depth; also selects which pre-allocated FST reader/arc is used
 * @param doPrefix true if we should remove prefixes
 * @param doSuffix true if we should remove suffixes
 * @param previousWasPrefix true if the previous removal was a prefix:
 *        if we are removing a suffix, and it has no continuation requirements, it's ok.
 *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
 * @param circumfix true if the previous prefix removal was signed as a circumfix
 *        this means the innermost suffix must also contain the circumfix flag.
 * @return List of stems, or empty list if no stems are found
 * @throws IOException presumably propagated from the FST / affix-data lookups performed here (confirm against callees)
 */
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag,
        int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix)
        throws IOException {

    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new ArrayList<>();

    // ---- Prefix removal: walk the prefix FST over the leading characters of the word. ----
    if (doPrefix && dictionary.prefixes != null) {
        FST<IntsRef> fst = dictionary.prefixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Reader and arc are taken from per-recursion-depth arrays rather than allocated here.
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
        fst.getFirstArc(arc);
        // The "no output" value is compared by identity (!=) below.
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        for (int i = 0; i < length; i++) {
            // i is the number of leading characters consumed so far; i == 0 tests the empty prefix.
            if (i > 0) {
                int ch = word[i - 1];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    // No prefix in the FST continues with this character.
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    // Accumulate the outputs along the path.
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef prefixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                // Final arc: the accumulated output plus the final output lists the candidate affix ids.
                prefixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < prefixes.length; j++) {
                int prefix = prefixes.ints[prefixes.offset + j];
                if (prefix == previous) {
                    continue;
                }
                // Affix records are addressed at 8 * id; four 16-bit values are read in sequence.
                affixReader.setPosition(8 * prefix);
                // NOTE: 'flag' is not used further in this method, but the read advances the reader.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                // Lowest bit of the third value is the cross-product flag; the rest is the condition.
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    // At the top level every affix is accepted.
                    compatible = true;
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
                } else {
                    // Nested removal without the cross-product bit is never accepted.
                    compatible = false;
                }

                if (compatible) {
                    // The remainder of the word after the removed prefix.
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    // Locate the strip characters for this affix in the shared strip data.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word,
                            deAffixedStart, deAffixedLength)) {
                        continue;
                    }

                    // Candidate = strip characters followed by the de-affixed remainder.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1,
                            recursionDepth, true, circumfix);

                    stems.addAll(stemList);
                }
            }
        }
    }

    // ---- Suffix removal: walk the suffix FST over the word's characters from the end backwards. ----
    if (doSuffix && dictionary.suffixes != null) {
        FST<IntsRef> fst = dictionary.suffixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Reader and arc are taken from per-recursion-depth arrays rather than allocated here.
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
        fst.getFirstArc(arc);
        // The "no output" value is compared by identity (!=) below.
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        for (int i = length; i >= 0; i--) {
            // i is the index where the candidate suffix starts; i == length tests the empty suffix.
            if (i < length) {
                int ch = word[i];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    // No suffix in the FST continues with this character.
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    // Accumulate the outputs along the path.
                    output = fst.outputs.add(output, arc.output);
                }
            }
            IntsRef suffixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                // Final arc: the accumulated output plus the final output lists the candidate affix ids.
                suffixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < suffixes.length; j++) {
                int suffix = suffixes.ints[suffixes.offset + j];
                if (suffix == previous) {
                    continue;
                }
                // Affix records are addressed at 8 * id; four 16-bit values are read in sequence.
                affixReader.setPosition(8 * suffix);
                // NOTE: 'flag' is not used further in this method, but the read advances the reader.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                // Lowest bit of the third value is the cross-product flag; the rest is the condition.
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    // At the top level every affix is accepted.
                    compatible = true;
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    // Unlike the prefix branch, the previousWasPrefix argument is forwarded here.
                    compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
                } else {
                    // Nested removal without the cross-product bit is never accepted.
                    compatible = false;
                }

                if (compatible) {
                    // The part of the word before the removed suffix (deAffixedLength equals i).
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    // Locate the strip characters for this affix in the shared strip data.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart,
                            stripLength)) {
                        continue;
                    }

                    // Candidate = de-affixed head of the word followed by the strip characters.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength,
                            stripLength);

                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag,
                            recursionDepth, false, circumfix);

                    stems.addAll(stemList);
                }
            }
        }
    }

    return stems;
}