Example usage for org.apache.lucene.util.automaton CharacterRunAutomaton CharacterRunAutomaton

List of usage examples for org.apache.lucene.util.automaton CharacterRunAutomaton CharacterRunAutomaton

Introduction

On this page you can find example usage for the org.apache.lucene.util.automaton CharacterRunAutomaton constructor.

Prototype

public CharacterRunAutomaton(Automaton a) 

Source Link

Document

Construct with a default number of maxDeterminizedStates.

Usage

From source file: elhuyar.bilakit.Dictionary.java

License: Apache License

/**
   * Parses a single affix rule block (e.g. {@code PFX}/{@code SFX}) from the affix file,
   * putting the result into the provided affix map and serializing each rule as four
   * shorts (8 bytes) into {@code affixData}.
   *
   * @param affixes map where the result of the parsing will be put
   *                (affix string -> list of affix ids)
   * @param header header line of the affix rule, e.g. {@code SFX X Y 3}
   * @param reader reader positioned just after the header; also supplies line numbers
   *               for error reporting
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used
   *                         to generate the condition regex pattern
   * @param seenPatterns map from condition regex -> index of patterns, for deduplication
   * @param seenStrips map from strip string -> strip ordinal, for deduplication
   * @throws IOException can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four elements
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      // header layout: KEYWORD flag cross_product line_count
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      // NOTE(review): identity (==) comparison — assumes callers pass the
      // SUFFIX_CONDITION_REGEX_PATTERN constant itself, not an equal copy; confirm callers
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      int numLines = Integer.parseInt(args[3]);
      // each affix occupies 8 bytes (4 shorts), hence the << 3
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          // "0" in the strip position means nothing is stripped
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              // flags may be given as an alias number instead of literal flags
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              // sorted so the flags can be searched efficiently later
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          // condition defaults to "." (match anything) when omitted
          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              // the pattern ordinal is later written as a short (with one bit for crossProduct)
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strips; the ordinal is bounded because it is written as a short below
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but its a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          // serialize the affix as 4 shorts: flag, strip ordinal, pattern ordinal
          // (crossProduct in the low bit), append-flags ordinal
          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          // suffixes are indexed by their reversed form
          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file: hunspell_stemmer.Dictionary.java

License: Apache License

/**
   * Reads one affix rule block from the affix file and records every rule it contains
   * in the provided affix map. Each rule's data is also encoded into {@code affixData},
   * at 8 bytes (four shorts) per affix.
   *
   * @param affixes map receiving the parsed result (affix string -> list of affix ids)
   * @param header header line of the affix rule block
   * @param reader supplies the rule lines and the current line number for errors
   * @param conditionPattern {@link String#format(String, Object...)} pattern used to
   *                         generate the condition regex pattern
   * @param seenPatterns map from condition regex -&gt; index of patterns, for deduplication
   * @param seenStrips map from strip string -&gt; strip ordinal, for deduplication
   * @throws IOException can be thrown while reading the rule
   * @throws ParseException if a rule line carries fewer than four elements
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      // header format: KEYWORD flag cross_product line_count
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      // NOTE(review): reference equality (==) is intentional-looking but relies on the
      // caller passing the SUFFIX_CONDITION_REGEX_PATTERN constant itself — verify callers
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      int numLines = Integer.parseInt(args[3]);
      // << 3 because every affix is stored as 4 shorts = 8 bytes
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          // a strip of "0" denotes the empty strip
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              // continuation flags can be expressed via a numeric alias
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              // keep flags sorted for later lookups
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          // missing condition means "." (always matches)
          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              // ordinal must fit the short written below (one bit reserved for crossProduct)
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strips; bounded because the ordinal is serialized as a short
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but it's a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          // affix record layout: flag, strip ordinal, pattern ordinal with crossProduct
          // in the low bit, append-flags ordinal — four shorts in total
          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          // suffix keys are stored reversed
          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file: org.apache.solr.analysis.MockTokenFilterFactory.java

License: Apache License

/** Creates a new MockTokenFilterFactory, selecting the stop filter from 'stopset' or 'stopregex'. */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    final String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    final String stopregex = get(args, "stopregex");
    if (stopset != null) {
        // the two ways of configuring the stop set are mutually exclusive
        if (stopregex != null) {
            throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
        }
        // get(...) restricts the value to "english" or "empty"
        filter = "english".equalsIgnoreCase(stopset) ? MockTokenFilter.ENGLISH_STOPSET
                : MockTokenFilter.EMPTY_STOPSET;
    } else if (stopregex != null) {
        // build the stop set automaton from the supplied regular expression
        filter = new CharacterRunAutomaton(new RegExp(stopregex).toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
    // any parameter left over was not recognized
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

From source file: org.apache.solr.core.MockTokenFilterFactory.java

License: Apache License

/** Creates a new MockTokenFilterFactory from the analysis arguments. */
public MockTokenFilterFactory(Map<String, String> args) {
    super(args);
    final String stopset = get(args, "stopset", Arrays.asList("english", "empty"), null, false);
    final String stopregex = get(args, "stopregex");
    // 'stopset' and 'stopregex' are mutually exclusive configuration options
    if (stopset != null && stopregex != null) {
        throw new IllegalArgumentException("Parameters stopset and stopregex cannot both be specified.");
    }
    if (stopset != null) {
        // get(...) has already constrained the value to "english" or "empty"
        if ("english".equalsIgnoreCase(stopset)) {
            filter = MockTokenFilter.ENGLISH_STOPSET;
        } else {
            filter = MockTokenFilter.EMPTY_STOPSET;
        }
    } else if (stopregex != null) {
        RegExp stopPattern = new RegExp(stopregex);
        filter = new CharacterRunAutomaton(stopPattern.toAutomaton());
    } else {
        throw new IllegalArgumentException(
                "Configuration Error: either the 'stopset' or the 'stopregex' parameter must be specified.");
    }
    // anything still present in args was not consumed and is therefore unknown
    if (!args.isEmpty()) {
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }
}

From source file: org.codelibs.elasticsearch.common.xcontent.support.XContentMapValues.java

License: Apache License

/**
 * Returns a function that filters a document map based on the given include and exclude rules.
 * @see #filter(Map, String[], String[]) for details
 */
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
    final CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());

    final CharacterRunAutomaton include;
    if (includes != null && includes.length > 0) {
        // compile the include globs, letting dots in field names match as well
        include = new CharacterRunAutomaton(makeMatchDotsInFieldNames(Regex.simpleMatchToAutomaton(includes)));
    } else {
        // no include rules: everything is included
        include = matchAllAutomaton;
    }

    // no exclude rules yields the empty automaton (excludes nothing)
    final Automaton excludeA = (excludes == null || excludes.length == 0) ? Automata.makeEmpty()
            : makeMatchDotsInFieldNames(Regex.simpleMatchToAutomaton(excludes));
    final CharacterRunAutomaton exclude = new CharacterRunAutomaton(excludeA);

    // NOTE: Operations.minus is not usable here because of the special case that
    // we want all sub properties to match as soon as an object matches
    return (map) -> filter(map, include, 0, exclude, 0, matchAllAutomaton);
}

From source file: org.easynet.resource.queryparser.QueryParserTestBase.java

License: Apache License

public void testBoost() throws Exception {
    // analyzer whose only stop word is "on"
    CharacterRunAutomaton onStopSet = new CharacterRunAutomaton(Automata.makeString("on"));
    Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, onStopSet);
    QueryParser parser = getParserConfig(oneStopAnalyzer);

    Query query = getQuery("on^1.0", parser);
    assertNotNull(query);
    query = getQuery("\"hello\"^2.0", parser);
    assertNotNull(query);
    assertEquals(query.getBoost(), (float) 2.0, (float) 0.5);
    query = getQuery("hello^2.0", parser);
    assertNotNull(query);
    assertEquals(query.getBoost(), (float) 2.0, (float) 0.5);
    query = getQuery("\"on\"^1.0", parser);
    assertNotNull(query);

    // with the full English stop set, a boosted stop word parses to an empty query
    Analyzer englishStops = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    QueryParser stopParser = getParserConfig(englishStops);
    query = getQuery("the^3", stopParser);
    // "the" is a stop word so the result is an empty query:
    assertNotNull(query);
    assertEquals("", query.toString());
    assertEquals(1.0f, query.getBoost(), 0.01f);
}

From source file: org.easynet.resource.queryparser.QueryParserTestBase.java

License: Apache License

public void testStopwords() throws Exception {
    // stop set matching exactly the words "the" and "foo"
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    QueryParser parser = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));

    // both terms are stop words, so the boolean query ends up with no clauses
    Query query = getQuery("field:the OR field:foo", parser);
    assertNotNull("result is null and it shouldn't be", query);
    assertTrue("result is not a BooleanQuery", query instanceof BooleanQuery);
    int numClauses = ((BooleanQuery) query).clauses().size();
    assertTrue(numClauses + " does not equal: " + 0, numClauses == 0);

    // only the stop word is dropped, leaving a single term query
    query = getQuery("field:woo OR field:the", parser);
    assertNotNull("result is null and it shouldn't be", query);
    assertTrue("result is not a TermQuery", query instanceof TermQuery);

    query = getQuery("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", parser);
    assertNotNull("result is null and it shouldn't be", query);
    assertTrue("result is not a BooleanQuery", query instanceof BooleanQuery);
    if (VERBOSE)
        System.out.println("Result: " + query);
    numClauses = ((BooleanQuery) query).clauses().size();
    assertTrue(numClauses + " does not equal: " + 2, numClauses == 2);
}

From source file: org.easynet.resource.queryparser.QueryParserTestBase.java

License: Apache License

public void testPhraseQueryPositionIncrements() throws Exception {
    // stop filter that removes the token "stop" in any letter case
    CharacterRunAutomaton stopStopList = new CharacterRunAutomaton(
            new RegExp("[sS][tT][oO][pP]").toAutomaton());

    // fix: the original built a parser here and immediately overwrote it with an
    // identical one (dead store); a single construction suffices
    QueryParser qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
    qp.setEnablePositionIncrements(true);

    // with position increments enabled, the removed stop word leaves a position gap,
    // so "2" sits at offset 2 relative to "1"
    PhraseQuery phraseQuery = new PhraseQuery();
    phraseQuery.add(new Term("field", "1"));
    phraseQuery.add(new Term("field", "2"), 2);
    assertEquals(phraseQuery, getQuery("\"1 stop 2\"", qp));
}

From source file: org.elasticsearch.index.reindex.TransportReindexAction.java

License: Apache License

/**
 * Build the {@link CharacterRunAutomaton} that represents the reindex-from-remote whitelist and make sure that it doesn't whitelist
 * the world./*from   w ww  .  j  ava2 s  .  c  o  m*/
 */
static CharacterRunAutomaton buildRemoteWhitelist(List<String> whitelist) {
    if (whitelist.isEmpty()) {
        return new CharacterRunAutomaton(Automata.makeEmpty());
    }
    Automaton automaton = Regex.simpleMatchToAutomaton(whitelist.toArray(Strings.EMPTY_ARRAY));
    automaton = MinimizationOperations.minimize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    if (Operations.isTotal(automaton)) {
        throw new IllegalArgumentException("Refusing to start because whitelist " + whitelist
                + " accepts all addresses. "
                + "This would allow users to reindex-from-remote any URL they like effectively having Elasticsearch make HTTP GETs "
                + "for them.");
    }
    return new CharacterRunAutomaton(automaton);
}

From source file: org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java

License: Open Source License

/**
 * Tests filtering of two indexed string fields: only the whitelisted field stays visible.
 */
public void testIndexed() throws Exception {
    Directory directory = newDirectory();
    IndexWriterConfig config = new IndexWriterConfig(null);
    IndexWriter writer = new IndexWriter(directory, config);

    // index a single document carrying both fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    writer.addDocument(doc);

    // wrap the reader so that only "fieldA" passes the subset filter
    DirectoryReader reader = FieldSubsetReader.wrap(DirectoryReader.open(writer),
            new CharacterRunAutomaton(Automata.makeString("fieldA")));

    // collect the field names the wrapped segment exposes
    LeafReader segmentReader = reader.leaves().get(0).reader();
    Set<String> visibleFields = new HashSet<>();
    for (FieldInfo info : segmentReader.getFieldInfos()) {
        visibleFields.add(info.name);
    }
    assertEquals(Collections.singleton("fieldA"), visibleFields);
    assertNotNull(segmentReader.terms("fieldA"));
    assertNull(segmentReader.terms("fieldB"));

    TestUtil.checkReader(reader);
    IOUtils.close(reader, writer, directory);
}