Example usage for org.apache.lucene.util BytesRefBuilder get

List of usage examples for org.apache.lucene.util BytesRefBuilder get

Introduction

In this page you can find the example usage for org.apache.lucene.util BytesRefBuilder get.

Prototype

public BytesRef get() 

Source Link

Document

Return a BytesRef that points to the internal content of this builder.

Usage

From source file:com.github.flaxsearch.util.BytesRefUtils.java

License:Apache License

/**
 * Returns a decoder that converts the string form of a term into its indexed
 * {@link BytesRef} representation.
 *
 * @param type the encoding type (case-insensitive): "base64", "utf8", "int",
 *             "long", "float" or "double"
 * @return a function mapping the string form to its {@link BytesRef} encoding
 * @throws IllegalArgumentException if {@code type} is not one of the supported names
 */
private static Function<String, BytesRef> getDecoder(String type) {
    switch (type.toLowerCase(Locale.ROOT)) {
    case "base64":
        // Decode the string directly: Base64 text is pure ASCII, so routing it
        // through the platform default charset was redundant and made the result
        // depend on the JVM's default encoding.
        return s -> new BytesRef(Base64.getUrlDecoder().decode(s));
    case "utf8":
        return BytesRef::new;
    case "int":
        return s -> {
            BytesRefBuilder builder = new BytesRefBuilder();
            LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(s), 0, builder);
            return builder.get();
        };
    case "long":
        return s -> {
            BytesRefBuilder builder = new BytesRefBuilder();
            LegacyNumericUtils.longToPrefixCoded(Long.parseLong(s), 0, builder);
            return builder.get();
        };
    case "float":
        return s -> {
            BytesRefBuilder builder = new BytesRefBuilder();
            // Floats are indexed as their sortable-int encoding.
            LegacyNumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0,
                    builder);
            return builder.get();
        };
    case "double":
        return s -> {
            BytesRefBuilder builder = new BytesRefBuilder();
            // Doubles are indexed as their sortable-long encoding.
            LegacyNumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0,
                    builder);
            return builder.get();
        };
    default:
        throw new IllegalArgumentException("Unknown decoder type: " + type);
    }
}

From source file:com.google.gerrit.lucene.QueryBuilder.java

License:Apache License

/**
 * Builds an index {@link Term} whose text is the prefix-coded byte form of an
 * int value, as produced by {@link NumericUtils#intToPrefixCodedBytes}.
 */
private static Term intTerm(String name, int value) {
    BytesRefBuilder bytes = new BytesRefBuilder();
    NumericUtils.intToPrefixCodedBytes(value, 0, bytes);
    return new Term(name, bytes.get());
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * Rebuilds the classifier's FST from the given term-to-weight map. Each term's
 * UTF-8 bytes map to its weight truncated to a long.
 *
 * @param weights term weights; must be a sorted map because FST construction
 *                requires its inputs in order
 * @throws IOException if FST construction fails
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    BytesRefBuilder termBytes = new BytesRefBuilder();
    IntsRefBuilder termInts = new IntsRefBuilder();
    for (Map.Entry<String, Double> weight : weights.entrySet()) {
        // Reuse the same scratch builders for every entry to avoid per-term allocation.
        termBytes.copyChars(weight.getKey());
        fstBuilder.add(Util.toIntsRef(termBytes.get(), termInts), weight.getValue().longValue());
    }
    fst = fstBuilder.finish();
}

From source file:com.querydsl.lucene5.LuceneSerializer.java

License:Apache License

/**
 * Encodes a numeric value into Lucene's legacy prefix-coded {@link BytesRef}
 * form. Integral types up to int width use the int encoding; float uses the
 * sortable-int encoding; long/BigInteger use the long encoding; and
 * double/BigDecimal use the sortable-long encoding.
 *
 * @param number the value to encode
 * @return the prefix-coded bytes
 * @throws IllegalArgumentException for unsupported {@link Number} subclasses
 */
private BytesRef convertNumber(Number number) {
    BytesRefBuilder bytes = new BytesRefBuilder();
    if (number instanceof Integer || number instanceof Byte || number instanceof Short) {
        NumericUtils.intToPrefixCoded(number.intValue(), 0, bytes);
    } else if (number instanceof Double || number instanceof BigDecimal) {
        NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(number.doubleValue()), 0, bytes);
    } else if (number instanceof Long || number instanceof BigInteger) {
        NumericUtils.longToPrefixCoded(number.longValue(), 0, bytes);
    } else if (number instanceof Float) {
        NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(number.floatValue()), 0, bytes);
    } else {
        throw new IllegalArgumentException("Unsupported numeric type " + number.getClass().getName());
    }
    return bytes.get();
}

From source file:com.stratio.cassandra.lucene.key.TokenMapper.java

License:Apache License

/**
 * Returns the {@link BytesRef} indexing value of the specified Murmur3 partitioning {@link Token}.
 *
 * @param token a Murmur3 token/* w ww  .ja  v  a 2s .co m*/
 * @return the {@code token}'s indexing value
 */
private static BytesRef bytesRef(Token token) {
    Long value = value(token);
    BytesRefBuilder bytesRef = new BytesRefBuilder();
    NumericUtils.longToPrefixCoded(value, 0, bytesRef);
    return bytesRef.get();
}

From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java

License:Apache License

/**
 * Analyzes {@code val} with the query analyzer and returns the bytes of the
 * first emitted token whose type matches the requested normalization level
 * (indexed form when {@code strict}, normalized form otherwise), optionally
 * followed by the delimiter bytes. Returns an empty {@link BytesRef} when no
 * token of the matching type is produced.
 *
 * @param val              the raw query text to normalize
 * @param strict           match INDEXED_TOKEN_TYPE instead of NORMALIZED_TOKEN_TYPE
 * @param fieldName        field whose analyzer chain is used
 * @param appendExtraDelim also append the delimiter in strict mode
 * @throws IOException if the analyzer fails
 */
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    // try-with-resources guarantees the stream is closed on every path.
    try (TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val)) {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        try {
            while (ts.incrementToken()) {
                if (matchType.equals(typeAtt.type())) {
                    BytesRefBuilder ret = new BytesRefBuilder();
                    ret.copyChars(termAtt.toString());
                    if (!strict || appendExtraDelim) {
                        ret.append(delimBytes, 0, delimBytes.length);
                    }
                    return ret.get();
                }
            }
        } finally {
            // TokenStream workflow contract: end() must be called after the
            // last incrementToken() and before close(); the original skipped it.
            ts.end();
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    }
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Parses a specific affix rule putting the result into the provided affix map
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @param seenStrips map from strip string -> ordinal, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four elements
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      // NOTE(review): identity (==) comparison appears intentional — it assumes
      // callers pass the shared SUFFIX_CONDITION_REGEX_PATTERN constant instance
      // to mark suffix parsing; confirm against the call sites.
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      // Each affix is encoded as four shorts (8 bytes), hence the << 3 scaling.
      int numLines = Integer.parseInt(args[3]);
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          // "0" is the Hunspell convention for an empty strip string.
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              // When flag aliases (AF lines) are declared, the flag part is a
              // numeric alias index rather than literal flags.
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              // Pattern ordinals are serialized as shorts below, so they must fit.
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strip strings the same way
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but its a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          // Serialize the affix as four shorts: flag, strip ordinal,
          // pattern ordinal (with crossProduct in the low bit), append-flags ordinal.
          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          // Suffixes are stored reversed so lookup can share a single trie direction.
          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @param words FST builder receiving each entry (as UTF-32 ints) mapped to its flag/stem-exception ordinals
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      // Phase 1: copy all entries to a temp file (cleaning input if required),
      // so they can be offline-sorted before FST construction.
      File unsorted = File.createTempFile("unsorted", "dat", tempDir);
      ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
      boolean success = false;
      try {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          // Clean only the entry text; the flag/morph suffix is copied verbatim.
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
          success = true;
      } finally {
          if (success) {
              IOUtils.close(writer);
          } else {
              IOUtils.closeWhileHandlingException(writer);
          }
      }
      File sorted = File.createTempFile("sorted", "dat", tempDir);

      // Phase 2: sort primarily by the entry text (ignoring anything after the
      // flag/morph separator) so identical stems end up adjacent; tie-break on
      // the whole row for a stable total order.
      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              // Truncate at the last flag/morph separator so only the stem compares.
              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      sorter.sort(unsorted, sorted);
      unsorted.delete();

      // Phase 3: stream the sorted entries, grouping consecutive identical stems
      // into one FST input with all their flag (and stem-exception) ordinals.
      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      BytesRefBuilder scratchLine = new BytesRefBuilder();

      // TODO: the flags themselves can be double-chars (long) or also numeric
      // either way the trick is to encode them as char... but they must be parsed differently

      String currentEntry = null;
      IntsRefBuilder currentOrds = new IntsRefBuilder();

      String line;
      while (reader.read(scratchLine)) {
          line = scratchLine.get().utf8ToString();
          String entry;
          char wordForm[];
          int end;

          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
              wordForm = NOFLAGS;
              end = line.indexOf(MORPH_SEPARATOR);
              entry = line.substring(0, end);
          } else {
              end = line.indexOf(MORPH_SEPARATOR);
              String flagPart = line.substring(flagSep + 1, end);
              // With flag aliases (AF lines), the flag part is a numeric alias index.
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              wordForm = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(wordForm);
              entry = line.substring(0, flagSep);
          }
          // we possibly have morphological data
          int stemExceptionID = 0;
          if (hasStemExceptions && end + 1 < line.length()) {
              String stemException = parseStemException(line.substring(end + 1));
              if (stemException != null) {
                  if (stemExceptionCount == stemExceptions.length) {
                      int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                              RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                      stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                  }
                  stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                  stemExceptions[stemExceptionCount++] = stemException;
              }
          }

          int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
          if (cmp < 0) {
              throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
          } else {
              encodeFlags(flagsScratch, wordForm);
              int ord = flagLookup.add(flagsScratch.get());
              if (ord < 0) {
                  // already exists in our hash
                  ord = (-ord) - 1;
              }
              // finalize current entry, and switch "current" if necessary
              if (cmp > 0 && currentEntry != null) {
                  Util.toUTF32(currentEntry, scratchInts);
                  words.add(scratchInts.get(), currentOrds.get());
              }
              // swap current
              if (cmp > 0 || currentEntry == null) {
                  currentEntry = entry;
                  currentOrds = new IntsRefBuilder(); // must be this way
              }
              if (hasStemExceptions) {
                  currentOrds.append(ord);
                  currentOrds.append(stemExceptionID);
              } else {
                  currentOrds.append(ord);
              }
          }
      }

      // finalize last entry
      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), currentOrds.get());

      reader.close();
      sorted.delete();
  }

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Parses a specific affix rule putting the result into the provided affix map
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
   * @param seenStrips map from strip string -&gt; ordinal, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four elements
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      // NOTE(review): identity (==) comparison appears intentional — it assumes
      // callers pass the shared SUFFIX_CONDITION_REGEX_PATTERN constant instance
      // to mark suffix parsing; confirm against the call sites.
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      // Each affix is encoded as four shorts (8 bytes), hence the << 3 scaling.
      int numLines = Integer.parseInt(args[3]);
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          // "0" is the Hunspell convention for an empty strip string.
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              // When flag aliases (AF lines) are declared, the flag part is a
              // numeric alias index rather than literal flags.
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              // Pattern ordinals are serialized as shorts below, so they must fit.
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strip strings the same way
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but it's a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          // Serialize the affix as four shorts: flag, strip ordinal,
          // pattern ordinal (with crossProduct in the low bit), append-flags ordinal.
          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          // Suffixes are stored reversed so lookup can share a single trie direction.
          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @param words FST builder receiving each entry (as UTF-32 ints) mapped to its flag/stem-exception ordinals
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      // Phase 1: copy all entries to a temp file (cleaning input if required),
      // so they can be offline-sorted before FST construction.
      Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat");
      try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          // Clean only the entry text; the flag/morph suffix is copied verbatim.
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
      }
      Path sorted = Files.createTempFile(tempDir, "sorted", "dat");

      // Phase 2: sort primarily by the entry text (ignoring anything after the
      // flag/morph separator) so identical stems end up adjacent; tie-break on
      // the whole row for a stable total order.
      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              // Truncate at the last flag/morph separator so only the stem compares.
              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      boolean success = false;
      try {
          sorter.sort(unsorted, sorted);
          success = true;
      } finally {
          // Always remove the unsorted temp file; swallow deletion errors only
          // when we are already propagating a sort failure.
          if (success) {
              Files.delete(unsorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(unsorted);
          }
      }

      // Phase 3: stream the sorted entries, grouping consecutive identical stems
      // into one FST input with all their flag (and stem-exception) ordinals.
      boolean success2 = false;
      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      try {
          BytesRefBuilder scratchLine = new BytesRefBuilder();

          // TODO: the flags themselves can be double-chars (long) or also numeric
          // either way the trick is to encode them as char... but they must be parsed differently

          String currentEntry = null;
          IntsRefBuilder currentOrds = new IntsRefBuilder();

          String line;
          while (reader.read(scratchLine)) {
              line = scratchLine.get().utf8ToString();
              String entry;
              char wordForm[];
              int end;

              int flagSep = line.indexOf(FLAG_SEPARATOR);
              if (flagSep == -1) {
                  wordForm = NOFLAGS;
                  end = line.indexOf(MORPH_SEPARATOR);
                  entry = line.substring(0, end);
              } else {
                  end = line.indexOf(MORPH_SEPARATOR);
                  String flagPart = line.substring(flagSep + 1, end);
                  // With flag aliases (AF lines), the flag part is a numeric alias index.
                  if (aliasCount > 0) {
                      flagPart = getAliasValue(Integer.parseInt(flagPart));
                  }

                  wordForm = flagParsingStrategy.parseFlags(flagPart);
                  Arrays.sort(wordForm);
                  entry = line.substring(0, flagSep);
              }
              // we possibly have morphological data
              int stemExceptionID = 0;
              if (hasStemExceptions && end + 1 < line.length()) {
                  String stemException = parseStemException(line.substring(end + 1));
                  if (stemException != null) {
                      if (stemExceptionCount == stemExceptions.length) {
                          int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                                  RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                          stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                      }
                      stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                      stemExceptions[stemExceptionCount++] = stemException;
                  }
              }

              int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
              if (cmp < 0) {
                  throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
              } else {
                  encodeFlags(flagsScratch, wordForm);
                  int ord = flagLookup.add(flagsScratch.get());
                  if (ord < 0) {
                      // already exists in our hash
                      ord = (-ord) - 1;
                  }
                  // finalize current entry, and switch "current" if necessary
                  if (cmp > 0 && currentEntry != null) {
                      Util.toUTF32(currentEntry, scratchInts);
                      words.add(scratchInts.get(), currentOrds.get());
                  }
                  // swap current
                  if (cmp > 0 || currentEntry == null) {
                      currentEntry = entry;
                      currentOrds = new IntsRefBuilder(); // must be this way
                  }
                  if (hasStemExceptions) {
                      currentOrds.append(ord);
                      currentOrds.append(stemExceptionID);
                  } else {
                      currentOrds.append(ord);
                  }
              }
          }

          // finalize last entry
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
          success2 = true;
      } finally {
          IOUtils.closeWhileHandlingException(reader);
          if (success2) {
              Files.delete(sorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(sorted);
          }
      }
  }