Example usage for the org.apache.lucene.store.ByteArrayDataOutput constructor

List of usage examples for the org.apache.lucene.store.ByteArrayDataOutput constructor

Introduction

On this page you can find example usages of the org.apache.lucene.store.ByteArrayDataOutput constructor ByteArrayDataOutput(byte[], int, int).

Prototype

public ByteArrayDataOutput(byte[] bytes, int offset, int len) 

Source Link

Usage

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Parses a specific affix rule putting the result into the provided affix map
   * /*w w w.j  a  v  a2  s  .c  o  m*/
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      int numLines = Integer.parseInt(args[3]);
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but its a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Parses one affix rule block and records each of its rules in the provided affix map.
   * <p>
   * The header supplies the cross-product marker (third token, {@code "Y"} when enabled) and the
   * number of rule lines that follow (fourth token). For every rule line, four shorts (8 bytes) are
   * appended to {@code affixData}: the flag, the strip ordinal, the pattern index shifted left by one
   * with the cross-product bit in the lowest position, and the append-flags ordinal.
   *
   * @param affixes Map where the result of the parsing will be put; keyed by the affix text
   *                (reversed for suffix rules), each value lists the ids of the rules sharing that text
   * @param header Header line of the affix rule
   * @param reader reader to read the content of the rule from; its line number is reported in parse errors
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition regex -&gt; index of patterns, for deduplication.
   * @param seenStrips map from strip string -&gt; strip ordinal, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if the header or a rule line has fewer than four elements, or if the input
   *                        ends before all rule lines announced by the header were read
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String[] args = header.split("\\s+");
      // report a malformed header as a parse error instead of an ArrayIndexOutOfBoundsException
      if (args.length < 4) {
          throw new ParseException("The affix file contains a header with less than four elements: " + header,
                  reader.getLineNumber());
      }

      boolean crossProduct = args[2].equals("Y");
      // compare by value: an equal pattern that is not the very same String instance must still
      // be recognised as the suffix pattern
      boolean isSuffix = SUFFIX_CONDITION_REGEX_PATTERN.equals(conditionPattern);

      int numLines = Integer.parseInt(args[3]);
      // every rule occupies 8 bytes (four shorts), hence the shifts by 3
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          // readLine() returns null at end of stream: the header announced more rules than exist
          if (line == null) {
              throw new ParseException("The affix file ended after " + i + " of " + numLines
                      + " rules announced by header: " + header, reader.getLineNumber());
          }
          String[] ruleArgs = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char[] appendFlags = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              // with aliases in use the flag part is a number that is resolved through getAliasValue
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strips; validate the new ordinal before registering it so that a failure
          // does not leave an unusable entry behind in the caller's map
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
              seenStrips.put(strip, stripOrd);
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but it's a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          // fixed 8-byte record: flag, strip ordinal, pattern ordinal, append-flags ordinal
          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          // suffixes are stored reversed
          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Parses one affix rule block and records each of its rules in the provided affix map.
 * <p>
 * The header supplies the cross-product marker (third token, {@code "Y"} when enabled) and the
 * number of rule lines that follow (fourth token). For every rule line, four shorts (8 bytes) are
 * appended to {@code affixData}: the flag, the strip ordinal, the pattern index shifted left by one
 * with the cross-product bit in the lowest position, and the append-flags ordinal.
 *
 * @param affixes Map where the result of the parsing will be put; keyed by the affix text
 *                (reversed for suffix rules), each value lists the ids of the rules sharing that
 *                text, stored as {@code char}
 * @param header Header line of the affix rule
 * @param reader reader to read the content of the rule from; its line number is reported in parse errors
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
 *                         pattern
 * @param seenPatterns map from condition regex -&gt; index of patterns, for deduplication.
 * @param seenStrips map from strip string -&gt; strip ordinal, for deduplication.
 * @throws IOException Can be thrown while reading the rule
 * @throws ParseException if the header or a rule line has fewer than four elements, or if the input
 *                        ends before all rule lines announced by the header were read
 */
private void parseAffix(TreeMap<String, List<Character>> affixes, String header, LineNumberReader reader,
        String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
        throws IOException, ParseException {

    BytesRef scratch = new BytesRef();
    StringBuilder sb = new StringBuilder();
    String[] args = header.split("\\s+");
    // report a malformed header as a parse error instead of an ArrayIndexOutOfBoundsException
    if (args.length < 4) {
        throw new ParseException("The affix file contains a header with less than four elements: " + header,
                reader.getLineNumber());
    }

    boolean crossProduct = args[2].equals("Y");
    // compare by value: an equal pattern that is not the very same String instance must still
    // be recognised as the suffix pattern
    boolean isSuffix = SUFFIX_CONDITION_REGEX_PATTERN.equals(conditionPattern);

    int numLines = Integer.parseInt(args[3]);
    // every rule occupies 8 bytes (four shorts), hence the shifts by 3
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        // affix ids are stored in the map as char; fail loudly instead of silently truncating the id
        if (currentAffix > Character.MAX_VALUE) {
            throw new UnsupportedOperationException(
                    "Too many affixes, please report this to dev@lucene.apache.org");
        }
        String line = reader.readLine();
        // readLine() returns null at end of stream: the header announced more rules than exist
        if (line == null) {
            throw new ParseException("The affix file ended after " + i + " of " + numLines
                    + " rules announced by header: " + header, reader.getLineNumber());
        }
        String[] ruleArgs = line.split("\\s+");

        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                    reader.getLineNumber());
        }

        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char[] appendFlags = null;

        // first: parse continuation classes out of affix
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);

            // with aliases in use the flag part is a number that is resolved through getAliasValue
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }

            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }

        // TODO: add test and fix zero-affix handling!
        // (an affix of "0" is currently kept as the literal string "0" rather than mapped to "")

        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }

        final String regex;
        if (".".equals(condition)) {
            regex = ".*"; // Zero condition is indicated by dot
        } else if (condition.equals(strip)) {
            regex = ".*"; // TODO: optimize this better:
                          // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                          // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }

        // deduplicate patterns
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                    new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }

        // deduplicate strips; validate the new ordinal before registering it so that a failure
        // does not leave an unusable entry behind in the caller's map
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many unique strips, please report this to dev@lucene.apache.org");
            }
            seenStrips.put(strip, stripOrd);
        }

        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }

        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch);
        if (appendFlagsOrd < 0) {
            // already exists in our hash
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException(
                    "Too many unique append flags, please report this to dev@lucene.apache.org");
        }

        // fixed 8-byte record: flag, strip ordinal, pattern ordinal, append-flags ordinal
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);

        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }

        // suffixes are stored reversed
        if (isSuffix) {
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }

        List<Character> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }

        list.add((char) currentAffix);
        currentAffix++;
    }
}