Example usage for the org.apache.lucene.util.automaton.RegExp constructor RegExp(String, int)

List of usage examples for the org.apache.lucene.util.automaton.RegExp constructor RegExp(String, int)

Introduction

On this page you can find example usages of the org.apache.lucene.util.automaton.RegExp constructor RegExp(String, int).

Prototype

public RegExp(String s, int syntax_flags) throws IllegalArgumentException 

Source Link

Document

Constructs new RegExp from a string.

Usage

From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java

License:Apache License

/**
 * Checks that two {@link Terms} instances agree with each other.
 * <p>
 * If either side is {@code null}, both are required to be {@code null}. Otherwise the
 * statistics, the full term enumeration and the seeking behaviour are compared, and,
 * when {@code deep} is set, so are the intersections with a number of automata built
 * from random regular expressions.
 *
 * @param leftTerms  first terms instance, may be {@code null}
 * @param rightTerms second terms instance, may be {@code null}
 * @param deep       whether to additionally compare automaton intersections
 * @throws Exception propagated from the comparison helpers
 */
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
    if (leftTerms == null || rightTerms == null) {
        // a missing side is only acceptable when the other side is missing too
        assertNull(leftTerms);
        assertNull(rightTerms);
        return;
    }

    assertTermsStatistics(leftTerms, rightTerms);

    // NOTE: hasOffsets/hasPositions/hasPayloads are not asserted because they are allowed to differ
    final boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions();

    assertTermsEnum(leftTerms.iterator(), rightTerms.iterator(), true, bothHavePositions);
    assertTermsSeeking(leftTerms, rightTerms);

    if (!deep) {
        return;
    }

    final int rounds = atLeast(3);
    for (int round = 0; round < rounds; round++) {
        final String pattern = AutomatonTestUtil.randomRegexp(random());
        final RegExp regExp = new RegExp(pattern, RegExp.NONE);
        final CompiledAutomaton compiled = new CompiledAutomaton(regExp.toAutomaton());
        if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
            // only automata of type NORMAL are intersected here
            continue;
        }
        // TODO: test start term too
        final TermsEnum leftMatches = leftTerms.intersect(compiled, null);
        final TermsEnum rightMatches = rightTerms.intersect(compiled, null);
        assertTermsEnum(leftMatches, rightMatches, rarely(), bothHavePositions);
    }
}

From source file:com.sindicetech.siren.search.node.NodeRegexpQuery.java

License:Open Source License

/**
 * Constructs a query for terms matching <code>term</code>.
 * <p>
 * The text of <code>term</code> is parsed as a {@link RegExp} with the given syntax
 * flags, and the resulting automaton is handed to the superclass constructor.
 *
 * @param term regular expression.
 * @param flags optional RegExp features from {@link RegExp}
 * @param provider custom AutomatonProvider for named automata
 */
public NodeRegexpQuery(final Term term, final int flags, final AutomatonProvider provider) {
    super(term, new RegExp(term.text(), flags).toAutomaton(provider));
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Parses a specific affix rule putting the result into the provided affix map
   * /*w ww  . java  2  s .c  o  m*/
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      int numLines = Integer.parseInt(args[3]);
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but its a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Parses a specific affix rule putting the result into the provided affix map.
   * <p>
   * Every rule line is written to {@code affixData} as four shorts (flag, strip ordinal,
   * pattern ordinal carrying the cross-product bit in its lowest position, append-flags
   * ordinal), and the ordinal of the rule is recorded in {@code affixes} under its affix
   * string (reversed for suffix rules).
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader LineNumberReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
   * @param seenStrips map from strip string -&gt; ordinal of the strip, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four elements, or the input ends before all
   *                        rules announced by the header have been read
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String[] args = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      // identity comparison: this is true only when the caller passes the
      // SUFFIX_CONDITION_REGEX_PATTERN constant itself
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      int numLines = Integer.parseInt(args[3]);
      // reserve room for numLines more entries; offsets are ordinal << 3
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          if (line == null) {
              // the header announced more rule lines than the input actually contains
              throw new ParseException("The affix file ended before all " + numLines
                      + " rules announced by the header were read: " + header, reader.getLineNumber());
          }
          String[] ruleArgs = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char[] appendFlags = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              if (aliasCount > 0) {
                  // with aliases in use, the flag part is a number that is resolved via getAliasValue
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strips
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but it's a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          if (isSuffix) {
              // suffixes are stored reversed in the affix map
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Parses a specific affix rule putting the result into the provided affix map.
 * <p>
 * Every rule line is written to {@code affixData} as four shorts (flag, strip ordinal,
 * pattern ordinal carrying the cross-product bit in its lowest position, append-flags
 * ordinal), and the ordinal of the rule is recorded in {@code affixes} under its affix
 * string (reversed for suffix rules).
 *
 * @param affixes Map where the result of the parsing will be put
 * @param header Header line of the affix rule
 * @param reader LineNumberReader to read the content of the rule from
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
 *                         pattern
 * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
 * @param seenStrips map from strip string -&gt; ordinal of the strip, for deduplication.
 * @throws IOException Can be thrown while reading the rule
 * @throws ParseException if a rule line has fewer than four elements, or the input ends before all
 *                        rules announced by the header have been read
 */
private void parseAffix(TreeMap<String, List<Character>> affixes, String header, LineNumberReader reader,
        String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
        throws IOException, ParseException {

    BytesRef scratch = new BytesRef();
    StringBuilder sb = new StringBuilder();
    String[] args = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
    // identity comparison: this is true only when the caller passes the
    // SUFFIX_CONDITION_REGEX_PATTERN constant itself
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

    int numLines = Integer.parseInt(args[3]);
    // reserve room for numLines more entries; offsets are ordinal << 3
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        if (line == null) {
            // the header announced more rule lines than the input actually contains
            throw new ParseException("The affix file ended before all " + numLines
                    + " rules announced by the header were read: " + header, reader.getLineNumber());
        }
        String[] ruleArgs = line.split("\\s+");

        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                    reader.getLineNumber());
        }

        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char[] appendFlags = null;

        // parse continuation classes out of the affix
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);

            if (aliasCount > 0) {
                // with aliases in use, the flag part is a number that is resolved via getAliasValue
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }

            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }

        // TODO: add test and fix zero-affix handling!

        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }

        final String regex;
        if (".".equals(condition)) {
            regex = ".*"; // Zero condition is indicated by dot
        } else if (condition.equals(strip)) {
            regex = ".*"; // TODO: optimize this better:
                          // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                          // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }

        // deduplicate patterns
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                    new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }

        // deduplicate strips
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }

        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }

        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch);
        if (appendFlagsOrd < 0) {
            // already exists in our hash
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException(
                    "Too many unique append flags, please report this to dev@lucene.apache.org");
        }

        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);

        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }

        if (isSuffix) {
            // suffixes are stored reversed in the affix map
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }

        List<Character> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }

        // NOTE(review): the cast truncates silently if currentAffix ever exceeds
        // Character.MAX_VALUE - confirm that an upper bound is enforced elsewhere
        list.add((char) currentAffix);
        currentAffix++;
    }
}