Example usage for org.apache.lucene.util.fst FST findTargetArc

List of usage examples for org.apache.lucene.util.fst FST findTargetArc

Introduction

On this page you can find example usages of org.apache.lucene.util.fst.FST.findTargetArc.

Prototype

public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException 

Source Link

Document

Finds an arc leaving the incoming arc, replacing the arc in place.

Usage

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
 * Walks {@code fst} over the code points of {@code word[offset .. offset + length)} and
 * returns the output accumulated along the matched path.
 *
 * @param fst    transducer to search; may be {@code null}
 * @param word   buffer holding the characters to look up
 * @param offset index of the first character in {@code word}
 * @param length number of characters to read starting at {@code offset}
 * @return the accumulated output, or {@code null} when {@code fst} is {@code null} or when
 *         no arc matches one of the code points or the terminating {@code FST.END_LABEL}
 */
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
    if (fst == null) {
        return null;
    }
    final FST.BytesReader reader = fst.getBytesReader();
    final FST.Arc<IntsRef> current = fst.getFirstArc(new FST.Arc<IntsRef>());
    // Sentinel instance; compared by identity below to skip no-op additions.
    final IntsRef noOutput = fst.outputs.getNoOutput();
    final int end = offset + length;

    IntsRef accumulated = noOutput;
    try {
        int pos = offset;
        while (pos < end) {
            final int codePoint = Character.codePointAt(word, pos, end);
            // findTargetArc overwrites 'current' with the arc it followed.
            if (fst.findTargetArc(codePoint, current, current, reader) == null) {
                return null;
            }
            if (current.output != noOutput) {
                accumulated = fst.outputs.add(accumulated, current.output);
            }
            pos += Character.charCount(codePoint);
        }

        // The word only counts as found if the end-of-input label can be followed too.
        if (fst.findTargetArc(FST.END_LABEL, current, current, reader) == null) {
            return null;
        }
        if (current.output == noOutput) {
            return accumulated;
        }
        return fst.outputs.add(accumulated, current.output);
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
 * Rewrites {@code sb} in place using the mappings stored in {@code fst}.
 *
 * <p>At every position the FST is walked as far as the characters allow; the longest run that
 * ends on a final arc is replaced by the output collected for it, and scanning resumes right
 * after the inserted text. Positions with no match are left untouched.
 *
 * @param fst transducer whose accepted inputs are replaced by their outputs
 * @param sb  buffer that is modified in place
 * @throws IOException if reading from the FST fails
 */
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
    final FST.BytesReader reader = fst.getBytesReader();
    final FST.Arc<CharsRef> rootArc = fst.getFirstArc(new FST.Arc<CharsRef>());
    final CharsRef emptyOutput = fst.outputs.getNoOutput();

    // Scratch arc, reset to the root for every start position.
    final FST.Arc<CharsRef> cursor = new FST.Arc<>();

    int start = 0;
    while (start < sb.length()) {
        cursor.copyFrom(rootArc);
        CharsRef pathOutput = emptyOutput;
        int matchEnd = -1;
        CharsRef replacement = null;

        int pos = start;
        while (pos < sb.length()
                && fst.findTargetArc(sb.charAt(pos), cursor, cursor, reader) != null) {
            pathOutput = fst.outputs.add(pathOutput, cursor.output);
            if (cursor.isFinal()) {
                // Remember the latest (and therefore longest) complete match so far.
                replacement = fst.outputs.add(pathOutput, cursor.nextFinalOutput);
                matchEnd = pos;
            }
            pos++;
        }

        if (matchEnd >= 0) {
            sb.delete(start, matchEnd + 1);
            sb.insert(start, replacement);
            // Jump to the last inserted character; the increment below steps past it.
            start += replacement.length - 1;
        }
        start++;
    }
}

From source file:elhuyar.bilakit.Stemmer.java

License:Apache License

/**
 * Generates a list of stems for the provided word by removing one prefix and/or one suffix
 * found in the dictionary's affix FSTs and handing each candidate to {@code applyAffix}.
 *
 * @param word Word to generate the stems for
 * @param length number of characters of {@code word} to consider
 * @param previous previous affix that was removed (so we don't remove the same one twice)
 * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
 * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it's also checked against the word
 * @param recursionDepth current recursion depth; also selects the per-depth FST arc and reader
 * @param doPrefix true if we should remove prefixes
 * @param doSuffix true if we should remove suffixes
 * @param previousWasPrefix true if the previous removal was a prefix:
 *        if we are removing a suffix, and it has no continuation requirements, it's ok.
 *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
 * @param circumfix true if the previous prefix removal was signed as a circumfix;
 *        this means the innermost suffix must also contain the circumfix flag.
 * @param caseVariant true if we are searching for a case variant. If the word has the KEEPCASE flag it cannot succeed.
 * @return List of stems, or empty list if no stems are found
 * @throws IOException if reading from the affix FSTs fails
 */
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag,
        int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix,
        boolean caseVariant) throws IOException {

    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new ArrayList<>();

    // ---- Prefix removal: walk the prefix FST left-to-right over the word. ----
    if (doPrefix && dictionary.prefixes != null) {
        FST<IntsRef> fst = dictionary.prefixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Arc and reader are taken from per-depth arrays, indexed by recursionDepth.
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // With fullStrip the scan may go one position further than without it.
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++) {
            // At iteration i the arc has consumed word[0 .. i); i == 0 tests the empty prefix.
            if (i > 0) {
                int ch = word[i - 1];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            // Only final arcs carry a list of prefix ids for the characters consumed so far.
            IntsRef prefixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                prefixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < prefixes.length; j++) {
                int prefix = prefixes.ints[prefixes.offset + j];
                if (prefix == previous) {
                    continue;
                }
                // Four consecutive shorts are read starting at offset 8 * prefix:
                // flag, strip ordinal, condition (lowest bit = cross-product), append.
                affixReader.setPosition(8 * prefix);
                // 'flag' is not used in this method; the read presumably still has to happen
                // to move the reader on to the following fields.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    // First removal: only the ONLYINCOMPOUND restriction (if configured) applies.
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char appendFlags[] = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1
                            || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    // The prefix branch always passes false as the third argument.
                    compatible = allowed && hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
                } else {
                    // Nested removal of an affix without the cross-product bit is rejected.
                    compatible = false;
                }

                if (compatible) {
                    // The remainder of the word after the prefix starts at i.
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    // Characters to put back are stored in stripData between two consecutive offsets.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word,
                            deAffixedStart, deAffixedLength)) {
                        continue;
                    }

                    // Candidate = strip characters followed by the remainder of the word.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    // NOTE(review): -1 and true presumably mean "no prefix flag" and "this is a
                    // prefix" — confirm against applyAffix's signature.
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1,
                            recursionDepth, true, circumfix, caseVariant);

                    stems.addAll(stemList);
                }
            }
        }
    }

    // ---- Suffix removal: walk the suffix FST right-to-left over the word. ----
    if (doSuffix && dictionary.suffixes != null) {
        FST<IntsRef> fst = dictionary.suffixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Arc and reader are taken from per-depth arrays, indexed by recursionDepth.
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        // Without fullStrip, i stops at 1 so at least one character is left after removal.
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--) {
            // i is the start index of the candidate suffix; i == length tests the empty suffix.
            if (i < length) {
                int ch = word[i];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            // Only final arcs carry a list of suffix ids for the characters consumed so far.
            IntsRef suffixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                suffixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < suffixes.length; j++) {
                int suffix = suffixes.ints[suffixes.offset + j];
                if (suffix == previous) {
                    continue;
                }
                // Same record layout as for prefixes: four shorts starting at offset 8 * suffix.
                affixReader.setPosition(8 * suffix);
                // 'flag' is not used in this method; the read presumably still has to happen
                // to move the reader on to the following fields.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    // First removal: only the ONLYINCOMPOUND restriction (if configured) applies.
                    if (dictionary.onlyincompound == -1) {
                        compatible = true;
                    } else {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.get(append, scratch);
                        char appendFlags[] = Dictionary.decodeFlags(scratch);
                        compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    }
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    boolean allowed = dictionary.onlyincompound == -1
                            || !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
                    // Unlike the prefix branch, previousWasPrefix is forwarded here.
                    compatible = allowed
                            && hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
                } else {
                    // Nested removal of an affix without the cross-product bit is rejected.
                    compatible = false;
                }

                if (compatible) {
                    // appendLength = characters belonging to the suffix; the rest (== i) is kept.
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    // Characters to put back are stored in stripData between two consecutive offsets.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart,
                            stripLength)) {
                        continue;
                    }

                    // Candidate = kept part of the word followed by the strip characters.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength,
                            stripLength);

                    // NOTE(review): false presumably means "this is a suffix" — confirm against
                    // applyAffix's signature.
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag,
                            recursionDepth, false, circumfix, caseVariant);

                    stems.addAll(stemList);
                }
            }
        }
    }

    return stems;
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Looks up {@code word[offset .. offset + length)} in {@code fst}, one code point at a time,
 * and returns the sum of the outputs found along the matched path.
 *
 * @param fst    transducer to search; may be {@code null}
 * @param word   buffer holding the characters to look up
 * @param offset index of the first character in {@code word}
 * @param length number of characters to read starting at {@code offset}
 * @return the accumulated output, or {@code null} when {@code fst} is {@code null} or when
 *         no arc matches one of the code points or the terminating {@code FST.END_LABEL}
 */
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
    if (fst == null) {
        return null;
    }
    final FST.BytesReader reader = fst.getBytesReader();
    final FST.Arc<IntsRef> current = fst.getFirstArc(new FST.Arc<IntsRef>());
    // Sentinel instance; compared by identity below to skip no-op additions.
    final IntsRef noOutput = fst.outputs.getNoOutput();
    final int end = offset + length;

    IntsRef accumulated = noOutput;
    try {
        int pos = offset;
        while (pos < end) {
            final int codePoint = Character.codePointAt(word, pos, end);
            // findTargetArc overwrites 'current' with the arc it followed.
            if (fst.findTargetArc(codePoint, current, current, reader) == null) {
                return null;
            }
            if (current.output != noOutput) {
                accumulated = fst.outputs.add(accumulated, current.output);
            }
            pos += Character.charCount(codePoint);
        }

        // The word only counts as found if the end-of-input label can be followed too.
        if (fst.findTargetArc(FST.END_LABEL, current, current, reader) == null) {
            return null;
        }
        if (current.output == noOutput) {
            return accumulated;
        }
        return fst.outputs.add(accumulated, current.output);
    } catch (IOException bogus) {
        throw new RuntimeException(bogus);
    }
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Applies the mappings stored in {@code fst} to {@code sb}, modifying it in place.
 *
 * <p>For each start position the FST is followed for as long as characters match; the longest
 * run that ends on a final arc is swapped for its collected output and scanning continues
 * right after the inserted text. Positions with no match are left untouched.
 *
 * @param fst transducer whose accepted inputs are replaced by their outputs
 * @param sb  buffer that is modified in place
 * @throws IOException if reading from the FST fails
 */
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
    final FST.BytesReader reader = fst.getBytesReader();
    final FST.Arc<CharsRef> rootArc = fst.getFirstArc(new FST.Arc<CharsRef>());
    final CharsRef emptyOutput = fst.outputs.getNoOutput();

    // Scratch arc, reset to the root for every start position.
    final FST.Arc<CharsRef> cursor = new FST.Arc<>();

    int start = 0;
    while (start < sb.length()) {
        cursor.copyFrom(rootArc);
        CharsRef pathOutput = emptyOutput;
        int matchEnd = -1;
        CharsRef replacement = null;

        int pos = start;
        while (pos < sb.length()
                && fst.findTargetArc(sb.charAt(pos), cursor, cursor, reader) != null) {
            pathOutput = fst.outputs.add(pathOutput, cursor.output);
            if (cursor.isFinal()) {
                // Remember the latest (and therefore longest) complete match so far.
                replacement = fst.outputs.add(pathOutput, cursor.nextFinalOutput);
                matchEnd = pos;
            }
            pos++;
        }

        if (matchEnd >= 0) {
            sb.delete(start, matchEnd + 1);
            sb.insert(start, replacement);
            // Jump to the last inserted character; the increment below steps past it.
            start += replacement.length - 1;
        }
        start++;
    }
}

From source file:stemmer.Stemmer.java

License:Apache License

/**
 * Generates a list of stems for the provided word by removing one prefix and/or one suffix
 * found in the dictionary's affix FSTs and handing each candidate to {@code applyAffix}.
 *
 * @param word Word to generate the stems for
 * @param length number of characters of {@code word} to consider
 * @param previous previous affix that was removed (so we don't remove the same one twice)
 * @param prevFlag Flag from a previous stemming step that needs to be cross-checked with any affixes in this recursive step
 * @param prefixFlag flag of the innermost removed prefix, so that when removing a suffix, it's also checked against the word
 * @param recursionDepth current recursion depth; also selects the per-depth FST arc and reader
 * @param doPrefix true if we should remove prefixes
 * @param doSuffix true if we should remove suffixes
 * @param previousWasPrefix true if the previous removal was a prefix:
 *        if we are removing a suffix, and it has no continuation requirements, it's ok.
 *        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
 * @param circumfix true if the previous prefix removal was signed as a circumfix;
 *        this means the innermost suffix must also contain the circumfix flag.
 * @return List of stems, or empty list if no stems are found
 * @throws IOException if reading from the affix FSTs fails
 */
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag,
        int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix)
        throws IOException {

    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new ArrayList<>();

    // ---- Prefix removal: walk the prefix FST left-to-right over the word. ----
    if (doPrefix && dictionary.prefixes != null) {
        FST<IntsRef> fst = dictionary.prefixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Arc and reader are taken from per-depth arrays, indexed by recursionDepth.
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        for (int i = 0; i < length; i++) {
            // At iteration i the arc has consumed word[0 .. i); i == 0 tests the empty prefix.
            if (i > 0) {
                int ch = word[i - 1];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            // Only final arcs carry a list of prefix ids for the characters consumed so far.
            IntsRef prefixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                prefixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < prefixes.length; j++) {
                int prefix = prefixes.ints[prefixes.offset + j];
                if (prefix == previous) {
                    continue;
                }
                // Four consecutive shorts are read starting at offset 8 * prefix:
                // flag, strip ordinal, condition (lowest bit = cross-product), append.
                affixReader.setPosition(8 * prefix);
                // 'flag' is not used in this method; the read presumably still has to happen
                // to move the reader on to the following fields.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    // The first removal is always allowed.
                    compatible = true;
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    // The prefix branch always passes false as the third argument.
                    compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, false);
                } else {
                    // Nested removal of an affix without the cross-product bit is rejected.
                    compatible = false;
                }

                if (compatible) {
                    // The remainder of the word after the prefix starts at i.
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    // Characters to put back are stored in stripData between two consecutive offsets.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word,
                            deAffixedStart, deAffixedLength)) {
                        continue;
                    }

                    // Candidate = strip characters followed by the remainder of the word.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    // NOTE(review): -1 and true presumably mean "no prefix flag" and "this is a
                    // prefix" — confirm against applyAffix's signature.
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1,
                            recursionDepth, true, circumfix);

                    stems.addAll(stemList);
                }
            }
        }
    }

    // ---- Suffix removal: walk the suffix FST right-to-left over the word. ----
    if (doSuffix && dictionary.suffixes != null) {
        FST<IntsRef> fst = dictionary.suffixes;
        Outputs<IntsRef> outputs = fst.outputs;
        // Arc and reader are taken from per-depth arrays, indexed by recursionDepth.
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
        fst.getFirstArc(arc);
        IntsRef NO_OUTPUT = outputs.getNoOutput();
        IntsRef output = NO_OUTPUT;
        for (int i = length; i >= 0; i--) {
            // i is the start index of the candidate suffix; i == length tests the empty suffix
            // and i == 0 treats the whole word as the suffix.
            if (i < length) {
                int ch = word[i];
                if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                    break;
                } else if (arc.output != NO_OUTPUT) {
                    output = fst.outputs.add(output, arc.output);
                }
            }
            // Only final arcs carry a list of suffix ids for the characters consumed so far.
            IntsRef suffixes = null;
            if (!arc.isFinal()) {
                continue;
            } else {
                suffixes = fst.outputs.add(output, arc.nextFinalOutput);
            }

            for (int j = 0; j < suffixes.length; j++) {
                int suffix = suffixes.ints[suffixes.offset + j];
                if (suffix == previous) {
                    continue;
                }
                // Same record layout as for prefixes: four shorts starting at offset 8 * suffix.
                affixReader.setPosition(8 * suffix);
                // 'flag' is not used in this method; the read presumably still has to happen
                // to move the reader on to the following fields.
                char flag = (char) (affixReader.readShort() & 0xffff);
                char stripOrd = (char) (affixReader.readShort() & 0xffff);
                int condition = (char) (affixReader.readShort() & 0xffff);
                boolean crossProduct = (condition & 1) == 1;
                condition >>>= 1;
                char append = (char) (affixReader.readShort() & 0xffff);

                final boolean compatible;
                if (recursionDepth == 0) {
                    // The first removal is always allowed.
                    compatible = true;
                } else if (crossProduct) {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.get(append, scratch);
                    char appendFlags[] = Dictionary.decodeFlags(scratch);
                    assert prevFlag >= 0;
                    // Unlike the prefix branch, previousWasPrefix is forwarded here.
                    compatible = hasCrossCheckedFlag((char) prevFlag, appendFlags, previousWasPrefix);
                } else {
                    // Nested removal of an affix without the cross-product bit is rejected.
                    compatible = false;
                }

                if (compatible) {
                    // appendLength = characters belonging to the suffix; the rest (== i) is kept.
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    // Characters to put back are stored in stripData between two consecutive offsets.
                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart,
                            stripLength)) {
                        continue;
                    }

                    // Candidate = kept part of the word followed by the strip characters.
                    char strippedWord[] = new char[stripLength + deAffixedLength];
                    System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
                    System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength,
                            stripLength);

                    // NOTE(review): false presumably means "this is a suffix" — confirm against
                    // applyAffix's signature.
                    List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag,
                            recursionDepth, false, circumfix);

                    stems.addAll(stemList);
                }
            }
        }
    }

    return stems;
}