org.apache.lucene.analysis.hunspell.Dictionary.java Source code

Introduction

Here is the full source code for org.apache.lucene.analysis.hunspell.Dictionary.java, Lucene's in-memory representation of a Hunspell affix (.aff) and dictionary (.dic) file pair.

Source

  /*
   * Licensed to the Apache Software Foundation (ASF) under one or more
   * contributor license agreements.  See the NOTICE file distributed with
   * this work for additional information regarding copyright ownership.
   * The ASF licenses this file to You under the Apache License, Version 2.0
   * (the "License"); you may not use this file except in compliance with
   * the License.  You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  package org.apache.lucene.analysis.hunspell;

  import java.io.BufferedInputStream;
  import java.io.BufferedOutputStream;
  import java.io.BufferedReader;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.io.LineNumberReader;
  import java.io.OutputStream;
  import java.nio.charset.Charset;
  import java.nio.charset.CharsetDecoder;
  import java.nio.charset.CodingErrorAction;
  import java.nio.charset.StandardCharsets;
  import java.nio.file.Files;
  import java.nio.file.Path;
  import java.nio.file.Paths;
  import java.text.ParseException;
  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.Collections;
  import java.util.Comparator;
  import java.util.HashMap;
  import java.util.LinkedHashMap;
  import java.util.List;
  import java.util.Locale;
  import java.util.Map;
  import java.util.TreeMap;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;

  import org.apache.lucene.codecs.CodecUtil;
  import org.apache.lucene.store.ByteArrayDataOutput;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.IOContext;
  import org.apache.lucene.store.IndexOutput;
  import org.apache.lucene.util.ArrayUtil;
  import org.apache.lucene.util.BytesRef;
  import org.apache.lucene.util.BytesRefBuilder;
  import org.apache.lucene.util.BytesRefHash;
  import org.apache.lucene.util.CharsRef;
  import org.apache.lucene.util.IOUtils;
  import org.apache.lucene.util.IntsRef;
  import org.apache.lucene.util.IntsRefBuilder;
  import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
  import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
  import org.apache.lucene.util.OfflineSorter;
  import org.apache.lucene.util.automaton.CharacterRunAutomaton;
  import org.apache.lucene.util.automaton.RegExp;
  import org.apache.lucene.util.fst.Builder;
  import org.apache.lucene.util.fst.CharSequenceOutputs;
  import org.apache.lucene.util.fst.FST;
  import org.apache.lucene.util.fst.IntSequenceOutputs;
  import org.apache.lucene.util.fst.Outputs;
  import org.apache.lucene.util.fst.Util;

  /**
   * In-memory structure for the dictionary (.dic) and affix (.aff)
   * data of a hunspell dictionary.
   */
  public class Dictionary {

      static final char[] NOFLAGS = new char[0];

      private static final String ALIAS_KEY = "AF";
      private static final String MORPH_ALIAS_KEY = "AM";
      private static final String PREFIX_KEY = "PFX";
      private static final String SUFFIX_KEY = "SFX";
      private static final String FLAG_KEY = "FLAG";
      private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
      private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
      private static final String IGNORE_KEY = "IGNORE";
      private static final String ICONV_KEY = "ICONV";
      private static final String OCONV_KEY = "OCONV";
      private static final String FULLSTRIP_KEY = "FULLSTRIP";
      private static final String LANG_KEY = "LANG";
      private static final String KEEPCASE_KEY = "KEEPCASE";
      private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
      private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
      private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";

      private static final String NUM_FLAG_TYPE = "num";
      private static final String UTF8_FLAG_TYPE = "UTF-8";
      private static final String LONG_FLAG_TYPE = "long";

      // TODO: really for suffixes we should reverse the automaton and run them backwards
      private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
      private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

      FST<IntsRef> prefixes;
      FST<IntsRef> suffixes;

      // all condition checks used by prefixes and suffixes. these are typically re-used across
      // many affix stripping rules. so these are deduplicated, to save RAM.
      ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();

      // the entries in the .dic file, mapping to their set of flags.
      // the fst output is the ordinal list for flagLookup
      FST<IntsRef> words;
      // the list of unique flagsets (wordforms). theoretically huge, but practically
      // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
      BytesRefHash flagLookup = new BytesRefHash();

      // the list of unique strip affixes.
      char[] stripData;
      int[] stripOffsets;

      // 8 bytes per affix
      byte[] affixData = new byte[64];
      private int currentAffix = 0;

      private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy

      // AF entries
      private String[] aliases;
      private int aliasCount = 0;

      // AM entries
      private String[] morphAliases;
      private int morphAliasCount = 0;

      // st: morphological entries (either directly, or aliased from AM)
      private String[] stemExceptions = new String[8];
      private int stemExceptionCount = 0;
      // we set this during sorting, so we know to add an extra FST output.
      // when set, some words have exceptional stems, and the last entry is a pointer to stemExceptions
      boolean hasStemExceptions;

      private final Path tempPath = getDefaultTempDir(); // TODO: make this configurable?

      boolean ignoreCase;
      boolean complexPrefixes;
      boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping

      int circumfix = -1; // circumfix flag, or -1 if one is not defined
      int keepcase = -1; // keepcase flag, or -1 if one is not defined
      int needaffix = -1; // needaffix flag, or -1 if one is not defined
      int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined

      // ignored characters (dictionary, affix, inputs)
      private char[] ignore;

      // FSTs used for ICONV/OCONV, output ord pointing to replacement text
      FST<CharsRef> iconv;
      FST<CharsRef> oconv;

      boolean needsInputCleaning;
      boolean needsOutputCleaning;

      // true if we can strip suffixes "down to nothing"
      boolean fullStrip;

      // language declaration of the dictionary
      String language;
      // true if case algorithms should use alternate (Turkish/Azeri) mapping
      boolean alternateCasing;

      /**
       * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
       * and dictionary files.
       * You have to close the provided InputStreams yourself.
       *
       * @param tempDir Directory to use for offline sorting
       * @param tempFileNamePrefix prefix to use to generate temp file names
       * @param affix InputStream for reading the hunspell affix file (won't be closed).
       * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
       * @throws IOException Can be thrown while reading from the InputStreams
       * @throws ParseException Can be thrown if the content of the files does not meet expected formats
       */
      public Dictionary(Directory tempDir, String tempFileNamePrefix, InputStream affix, InputStream dictionary)
              throws IOException, ParseException {
          this(tempDir, tempFileNamePrefix, affix, Collections.singletonList(dictionary), false);
      }

      /**
       * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
       * and dictionary files.
       * You have to close the provided InputStreams yourself.
       *
       * @param tempDir Directory to use for offline sorting
       * @param tempFileNamePrefix prefix to use to generate temp file names
       * @param affix InputStream for reading the hunspell affix file (won't be closed).
       * @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
       * @param ignoreCase true if words should be matched case-insensitively (input is case-folded while reading)
       * @throws IOException Can be thrown while reading from the InputStreams
       * @throws ParseException Can be thrown if the content of the files does not meet expected formats
       */
      public Dictionary(Directory tempDir, String tempFileNamePrefix, InputStream affix,
              List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
          this.ignoreCase = ignoreCase;
          this.needsInputCleaning = ignoreCase;
          this.needsOutputCleaning = false; // set if we have an OCONV
          flagLookup.add(new BytesRef()); // no flags -> ord 0

          Path aff = Files.createTempFile(tempPath, "affix", "aff");
          OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
          InputStream aff1 = null;
          InputStream aff2 = null;
          boolean success = false;
          try {
              // copy contents of affix stream to temp file
              final byte[] buffer = new byte[1024 * 8];
              int len;
              while ((len = affix.read(buffer)) > 0) {
                  out.write(buffer, 0, len);
              }
              out.close();

              // pass 1: get encoding
              aff1 = new BufferedInputStream(Files.newInputStream(aff));
              String encoding = getDictionaryEncoding(aff1);

              // pass 2: parse affixes
              CharsetDecoder decoder = getJavaEncoding(encoding);
              aff2 = new BufferedInputStream(Files.newInputStream(aff));
              readAffixFile(aff2, decoder);

              // read dictionary entries
              IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
              Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
              readDictionaryFiles(tempDir, tempFileNamePrefix, dictionaries, decoder, b);
              words = b.finish();
              aliases = null; // no longer needed
              morphAliases = null; // no longer needed
              success = true;
          } finally {
              IOUtils.closeWhileHandlingException(out, aff1, aff2);
              if (success) {
                  Files.delete(aff);
              } else {
                  IOUtils.deleteFilesIgnoringExceptions(aff);
              }
          }
      }

      /**
       * Looks up Hunspell word forms from the dictionary
       */
      IntsRef lookupWord(char word[], int offset, int length) {
          return lookup(words, word, offset, length);
      }

      // only for testing
      IntsRef lookupPrefix(char word[], int offset, int length) {
          return lookup(prefixes, word, offset, length);
      }

      // only for testing
      IntsRef lookupSuffix(char word[], int offset, int length) {
          return lookup(suffixes, word, offset, length);
      }

      IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
          if (fst == null) {
              return null;
          }
          final FST.BytesReader bytesReader = fst.getBytesReader();
          final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
          // Accumulate output as we go
          final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
          IntsRef output = NO_OUTPUT;

          int l = offset + length;
          try {
              for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
                  cp = Character.codePointAt(word, i, l);
                  if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
                      return null;
                  } else if (arc.output != NO_OUTPUT) {
                      output = fst.outputs.add(output, arc.output);
                  }
              }
              if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
                  return null;
              } else if (arc.output != NO_OUTPUT) {
                  return fst.outputs.add(output, arc.output);
              } else {
                  return output;
              }
          } catch (IOException bogus) {
              throw new RuntimeException(bogus);
          }
      }

      /**
       * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
       *
       * @param affixStream InputStream to read the content of the affix file from
       * @param decoder CharsetDecoder to decode the content of the file
       * @throws IOException Can be thrown while reading from the InputStream
       */
      private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
          TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
          TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
          Map<String, Integer> seenPatterns = new HashMap<>();

          // zero condition -> 0 ord
          seenPatterns.put(".*", 0);
          patterns.add(null);

          // zero strip -> 0 ord
          Map<String, Integer> seenStrips = new LinkedHashMap<>();
          seenStrips.put("", 0);

          LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
          String line = null;
          while ((line = reader.readLine()) != null) {
              // ignore any BOM marker on first line
              if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
                  line = line.substring(1);
              }
              if (line.startsWith(ALIAS_KEY)) {
                  parseAlias(line);
              } else if (line.startsWith(MORPH_ALIAS_KEY)) {
                  parseMorphAlias(line);
              } else if (line.startsWith(PREFIX_KEY)) {
                  parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
              } else if (line.startsWith(SUFFIX_KEY)) {
                  parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
              } else if (line.startsWith(FLAG_KEY)) {
                  // Assume that the FLAG line comes before any prefix or suffixes
                  // Store the strategy so it can be used when parsing the dic file
                  flagParsingStrategy = getFlagParsingStrategy(line);
              } else if (line.equals(COMPLEXPREFIXES_KEY)) {
                  complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
              } else if (line.startsWith(CIRCUMFIX_KEY)) {
                  String parts[] = line.split("\\s+");
                  if (parts.length != 2) {
                      throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
                  }
                  circumfix = flagParsingStrategy.parseFlag(parts[1]);
              } else if (line.startsWith(KEEPCASE_KEY)) {
                  String parts[] = line.split("\\s+");
                  if (parts.length != 2) {
                      throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
                  }
                  keepcase = flagParsingStrategy.parseFlag(parts[1]);
              } else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
                  String parts[] = line.split("\\s+");
                  if (parts.length != 2) {
                      throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
                  }
                  needaffix = flagParsingStrategy.parseFlag(parts[1]);
              } else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
                  String parts[] = line.split("\\s+");
                  if (parts.length != 2) {
                      throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
                  }
                  onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
              } else if (line.startsWith(IGNORE_KEY)) {
                  String parts[] = line.split("\\s+");
                  if (parts.length != 2) {
                      throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
                  }
                  ignore = parts[1].toCharArray();
                  Arrays.sort(ignore);
                  needsInputCleaning = true;
              } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
                  String parts[] = line.split("\\s+");
                  String type = parts[0];
                  if (parts.length != 2) {
                      throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
                  }
                  int num = Integer.parseInt(parts[1]);
                  FST<CharsRef> res = parseConversions(reader, num);
                  if (type.equals("ICONV")) {
                      iconv = res;
                      needsInputCleaning |= iconv != null;
                  } else {
                      oconv = res;
                      needsOutputCleaning |= oconv != null;
                  }
              } else if (line.startsWith(FULLSTRIP_KEY)) {
                  fullStrip = true;
              } else if (line.startsWith(LANG_KEY)) {
                  language = line.substring(LANG_KEY.length()).trim();
                  alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
              }
          }

          this.prefixes = affixFST(prefixes);
          this.suffixes = affixFST(suffixes);

          int totalChars = 0;
          for (String strip : seenStrips.keySet()) {
              totalChars += strip.length();
          }
          stripData = new char[totalChars];
          stripOffsets = new int[seenStrips.size() + 1];
          int currentOffset = 0;
          int currentIndex = 0;
          for (String strip : seenStrips.keySet()) {
              stripOffsets[currentIndex++] = currentOffset;
              strip.getChars(0, strip.length(), stripData, currentOffset);
              currentOffset += strip.length();
          }
          assert currentIndex == seenStrips.size();
          stripOffsets[currentIndex] = currentOffset;
      }

      private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
          IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
          Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
          IntsRefBuilder scratch = new IntsRefBuilder();
          for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
              Util.toUTF32(entry.getKey(), scratch);
              List<Integer> entries = entry.getValue();
              IntsRef output = new IntsRef(entries.size());
              for (Integer c : entries) {
                  output.ints[output.length++] = c;
              }
              builder.add(scratch.get(), output);
          }
          return builder.finish();
      }

      static String escapeDash(String re) {
          // we have to be careful, even though dash doesn't have a special meaning,
          // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
          StringBuilder escaped = new StringBuilder();
          for (int i = 0; i < re.length(); i++) {
              char c = re.charAt(i);
              if (c == '-') {
                  escaped.append("\\-");
              } else {
                  escaped.append(c);
                  if (c == '\\' && i + 1 < re.length()) {
                      escaped.append(re.charAt(i + 1));
                      i++;
                  }
              }
          }
          return escaped.toString();
      }

      /**
       * Parses a specific affix rule putting the result into the provided affix map
       * 
       * @param affixes Map where the result of the parsing will be put
       * @param header Header line of the affix rule
       * @param reader BufferedReader to read the content of the rule from
       * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
       *                         pattern
       * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
       * @throws IOException Can be thrown while reading the rule
       */
      private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
              String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
              throws IOException, ParseException {

          BytesRefBuilder scratch = new BytesRefBuilder();
          StringBuilder sb = new StringBuilder();
          String args[] = header.split("\\s+");

          boolean crossProduct = args[2].equals("Y");
          boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

          int numLines = Integer.parseInt(args[3]);
          affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
          ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

          for (int i = 0; i < numLines; i++) {
              assert affixWriter.getPosition() == currentAffix << 3;
              String line = reader.readLine();
              String ruleArgs[] = line.split("\\s+");

              // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
              // condition is optional
              if (ruleArgs.length < 4) {
                  throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                          reader.getLineNumber());
              }

              char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
              String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
              String affixArg = ruleArgs[3];
              char appendFlags[] = null;

              // first: parse continuation classes out of affix
              int flagSep = affixArg.lastIndexOf('/');
              if (flagSep != -1) {
                  String flagPart = affixArg.substring(flagSep + 1);
                  affixArg = affixArg.substring(0, flagSep);

                  if (aliasCount > 0) {
                      flagPart = getAliasValue(Integer.parseInt(flagPart));
                  }

                  appendFlags = flagParsingStrategy.parseFlags(flagPart);
                  Arrays.sort(appendFlags);
                  twoStageAffix = true;
              }
              // zero affix -> empty string
              if ("0".equals(affixArg)) {
                  affixArg = "";
              }

              String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
              // at least the gascon affix file has this issue
              if (condition.startsWith("[") && condition.indexOf(']') == -1) {
                  condition = condition + "]";
              }
              // "dash hasn't got special meaning" (we must escape it)
              if (condition.indexOf('-') >= 0) {
                  condition = escapeDash(condition);
              }

              final String regex;
              if (".".equals(condition)) {
                  regex = ".*"; // Zero condition is indicated by dot
              } else if (condition.equals(strip)) {
                  regex = ".*"; // TODO: optimize this better:
                                // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                                // but this is complicated...
              } else {
                  regex = String.format(Locale.ROOT, conditionPattern, condition);
              }

              // deduplicate patterns
              Integer patternIndex = seenPatterns.get(regex);
              if (patternIndex == null) {
                  patternIndex = patterns.size();
                  if (patternIndex > Short.MAX_VALUE) {
                      throw new UnsupportedOperationException(
                              "Too many patterns, please report this to dev@lucene.apache.org");
                  }
                  seenPatterns.put(regex, patternIndex);
                  CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                          new RegExp(regex, RegExp.NONE).toAutomaton());
                  patterns.add(pattern);
              }

              Integer stripOrd = seenStrips.get(strip);
              if (stripOrd == null) {
                  stripOrd = seenStrips.size();
                  seenStrips.put(strip, stripOrd);
                  if (stripOrd > Character.MAX_VALUE) {
                      throw new UnsupportedOperationException(
                              "Too many unique strips, please report this to dev@lucene.apache.org");
                  }
              }

              if (appendFlags == null) {
                  appendFlags = NOFLAGS;
              }

              encodeFlags(scratch, appendFlags);
              int appendFlagsOrd = flagLookup.add(scratch.get());
              if (appendFlagsOrd < 0) {
                  // already exists in our hash
                  appendFlagsOrd = (-appendFlagsOrd) - 1;
              } else if (appendFlagsOrd > Short.MAX_VALUE) {
                  // this limit is probably flexible, but it's a good sanity check too
                  throw new UnsupportedOperationException(
                          "Too many unique append flags, please report this to dev@lucene.apache.org");
              }

              affixWriter.writeShort((short) flag);
              affixWriter.writeShort((short) stripOrd.intValue());
              // encode crossProduct into patternIndex
              int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
              affixWriter.writeShort((short) patternOrd);
              affixWriter.writeShort((short) appendFlagsOrd);

              if (needsInputCleaning) {
                  CharSequence cleaned = cleanInput(affixArg, sb);
                  affixArg = cleaned.toString();
              }

              if (isSuffix) {
                  affixArg = new StringBuilder(affixArg).reverse().toString();
              }

              List<Integer> list = affixes.get(affixArg);
              if (list == null) {
                  list = new ArrayList<>();
                  affixes.put(affixArg, list);
              }
              list.add(currentAffix);
              currentAffix++;
          }
      }

      private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
          Map<String, String> mappings = new TreeMap<>();

          for (int i = 0; i < num; i++) {
              String line = reader.readLine();
              String parts[] = line.split("\\s+");
              if (parts.length != 3) {
                  throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
              }
              if (mappings.put(parts[1], parts[2]) != null) {
                  throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
              }
          }

          Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
          Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
          IntsRefBuilder scratchInts = new IntsRefBuilder();
          for (Map.Entry<String, String> entry : mappings.entrySet()) {
              Util.toUTF16(entry.getKey(), scratchInts);
              builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
          }

          return builder.finish();
      }

      /** pattern accepts optional BOM + SET + any whitespace */
      final static Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");

      /**
       * Parses the encoding specified in the affix file readable through the provided InputStream
       *
       * @param affix InputStream for reading the affix file
       * @return Encoding specified in the affix file
       * @throws IOException Can be thrown while reading from the InputStream
       * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
       */
      static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
          final StringBuilder encoding = new StringBuilder();
          for (;;) {
              encoding.setLength(0);
              int ch;
              while ((ch = affix.read()) >= 0) {
                  if (ch == '\n') {
                      break;
                  }
                  if (ch != '\r') {
                      encoding.append((char) ch);
                  }
              }
              if (encoding.length() == 0 || encoding.charAt(0) == '#' ||
              // this trim()-based test comes last since it is the least efficient, but it also catches lines containing only spaces:
                      encoding.toString().trim().length() == 0) {
                  if (ch < 0) {
                      throw new ParseException("Unexpected end of affix file.", 0);
                  }
                  continue;
              }
              Matcher matcher = ENCODING_PATTERN.matcher(encoding);
              if (matcher.find()) {
                  int last = matcher.end();
                  return encoding.substring(last).trim();
              }
          }
      }

      static final Map<String, String> CHARSET_ALIASES;
      static {
          Map<String, String> m = new HashMap<>();
          m.put("microsoft-cp1251", "windows-1251");
          m.put("TIS620-2533", "TIS-620");
          CHARSET_ALIASES = Collections.unmodifiableMap(m);
      }

      /**
       * Retrieves the CharsetDecoder for the given encoding. Note: this isn't perfect, as
       * encodings such as ISCII-DEVANAGARI and MICROSOFT-CP1251 are also allowed.
       *
       * @param encoding Encoding to retrieve the CharsetDecoder for
       * @return CharsetDecoder for the given encoding
       */
      private CharsetDecoder getJavaEncoding(String encoding) {
          if ("ISO8859-14".equals(encoding)) {
              return new ISO8859_14Decoder();
          }
          String canon = CHARSET_ALIASES.get(encoding);
          if (canon != null) {
              encoding = canon;
          }
          Charset charset = Charset.forName(encoding);
          return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
      }

      /**
       * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
       *
       * @param flagLine Line containing the flag information
       * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
       */
      static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
          String parts[] = flagLine.split("\\s+");
          if (parts.length != 2) {
              throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
          }
          String flagType = parts[1];

          if (NUM_FLAG_TYPE.equals(flagType)) {
              return new NumFlagParsingStrategy();
          } else if (UTF8_FLAG_TYPE.equals(flagType)) {
              return new SimpleFlagParsingStrategy();
          } else if (LONG_FLAG_TYPE.equals(flagType)) {
              return new DoubleASCIIFlagParsingStrategy();
          }

          throw new IllegalArgumentException("Unknown flag type: " + flagType);
      }

      final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
      final char MORPH_SEPARATOR = 0x1e; // separator for boundary of entry (may be followed by morph data)

      String unescapeEntry(String entry) {
          StringBuilder sb = new StringBuilder();
          int end = morphBoundary(entry);
          for (int i = 0; i < end; i++) {
              char ch = entry.charAt(i);
              if (ch == '\\' && i + 1 < entry.length()) {
                  sb.append(entry.charAt(i + 1));
                  i++;
              } else if (ch == '/') {
                  sb.append(FLAG_SEPARATOR);
              } else if (ch == MORPH_SEPARATOR || ch == FLAG_SEPARATOR) {
                  // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
              } else {
                  sb.append(ch);
              }
          }
          sb.append(MORPH_SEPARATOR);
          if (end < entry.length()) {
              for (int i = end; i < entry.length(); i++) {
                  char c = entry.charAt(i);
                  if (c == FLAG_SEPARATOR || c == MORPH_SEPARATOR) {
                      // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
                  } else {
                      sb.append(c);
                  }
              }
          }
          return sb.toString();
      }

      static int morphBoundary(String line) {
          int end = indexOfSpaceOrTab(line, 0);
          if (end == -1) {
              return line.length();
          }
          while (end >= 0 && end < line.length()) {
              if (line.charAt(end) == '\t' || end + 3 < line.length() && Character.isLetter(line.charAt(end + 1))
                      && Character.isLetter(line.charAt(end + 2)) && line.charAt(end + 3) == ':') {
                  break;
              }
              end = indexOfSpaceOrTab(line, end + 1);
          }
          if (end == -1) {
              return line.length();
          }
          return end;
      }

      static int indexOfSpaceOrTab(String text, int start) {
          int pos1 = text.indexOf('\t', start);
          int pos2 = text.indexOf(' ', start);
          if (pos1 >= 0 && pos2 >= 0) {
              return Math.min(pos1, pos2);
          } else {
              return Math.max(pos1, pos2);
          }
      }

      /**
       * Reads the dictionary file through the provided InputStreams, building up the words map
       *
       * @param dictionaries InputStreams to read the dictionary file through
       * @param decoder CharsetDecoder used to decode the contents of the file
       * @throws IOException Can be thrown while reading from the file
       */
      private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries,
              CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
          BytesRefBuilder flagsScratch = new BytesRefBuilder();
          IntsRefBuilder scratchInts = new IntsRefBuilder();

          StringBuilder sb = new StringBuilder();

          IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
          try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
              for (InputStream dictionary : dictionaries) {
                  BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
                  String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

                  while ((line = lines.readLine()) != null) {
                      // wild and unpredictable code comment rules
                      if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                              || line.charAt(0) == '\t') {
                          continue;
                      }
                      line = unescapeEntry(line);
                      // if we havent seen any stem exceptions, try to parse one
                      if (hasStemExceptions == false) {
                          int morphStart = line.indexOf(MORPH_SEPARATOR);
                          if (morphStart >= 0 && morphStart < line.length()) {
                              hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                          }
                      }
                      if (needsInputCleaning) {
                          int flagSep = line.indexOf(FLAG_SEPARATOR);
                          if (flagSep == -1) {
                              flagSep = line.indexOf(MORPH_SEPARATOR);
                          }
                          if (flagSep == -1) {
                              CharSequence cleansed = cleanInput(line, sb);
                              writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                          } else {
                              String text = line.substring(0, flagSep);
                              CharSequence cleansed = cleanInput(text, sb);
                              if (cleansed != sb) {
                                  sb.setLength(0);
                                  sb.append(cleansed);
                              }
                              sb.append(line.substring(flagSep));
                              writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                          }
                      } else {
                          writer.write(line.getBytes(StandardCharsets.UTF_8));
                      }
                  }
              }
              CodecUtil.writeFooter(unsorted);
          }

          OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {
              BytesRef scratch1 = new BytesRef();
              BytesRef scratch2 = new BytesRef();

              @Override
              public int compare(BytesRef o1, BytesRef o2) {
                  scratch1.bytes = o1.bytes;
                  scratch1.offset = o1.offset;
                  scratch1.length = o1.length;

                  for (int i = scratch1.length - 1; i >= 0; i--) {
                      if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                              || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                          scratch1.length = i;
                          break;
                      }
                  }

                  scratch2.bytes = o2.bytes;
                  scratch2.offset = o2.offset;
                  scratch2.length = o2.length;

                  for (int i = scratch2.length - 1; i >= 0; i--) {
                      if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                              || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                          scratch2.length = i;
                          break;
                      }
                  }

                  int cmp = scratch1.compareTo(scratch2);
                  if (cmp == 0) {
                      // tie break on whole row
                      return o1.compareTo(o2);
                  } else {
                      return cmp;
                  }
              }
          });

          String sorted;
          boolean success = false;
          try {
              sorted = sorter.sort(unsorted.getName());
              success = true;
          } finally {
              if (success) {
                  tempDir.deleteFile(unsorted.getName());
              } else {
                  IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
              }
          }

          boolean success2 = false;

          try (ByteSequencesReader reader = new ByteSequencesReader(
                  tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {

              // TODO: the flags themselves can be double-chars (long) or also numeric
              // either way the trick is to encode them as char... but they must be parsed differently

              String currentEntry = null;
              IntsRefBuilder currentOrds = new IntsRefBuilder();

              while (true) {
                  BytesRef scratch = reader.next();
                  if (scratch == null) {
                      break;
                  }

                  String line = scratch.utf8ToString();
                  String entry;
                  char wordForm[];
                  int end;

                  int flagSep = line.indexOf(FLAG_SEPARATOR);
                  if (flagSep == -1) {
                      wordForm = NOFLAGS;
                      end = line.indexOf(MORPH_SEPARATOR);
                      entry = line.substring(0, end);
                  } else {
                      end = line.indexOf(MORPH_SEPARATOR);
                      String flagPart = line.substring(flagSep + 1, end);
                      if (aliasCount > 0) {
                          flagPart = getAliasValue(Integer.parseInt(flagPart));
                      }

                      wordForm = flagParsingStrategy.parseFlags(flagPart);
                      Arrays.sort(wordForm);
                      entry = line.substring(0, flagSep);
                  }
                  // we possibly have morphological data
                  int stemExceptionID = 0;
                  if (hasStemExceptions && end + 1 < line.length()) {
                      String stemException = parseStemException(line.substring(end + 1));
                      if (stemException != null) {
                          stemExceptions = ArrayUtil.grow(stemExceptions, stemExceptionCount + 1);
                          stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                          stemExceptions[stemExceptionCount++] = stemException;
                      }
                  }

                  int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
                  if (cmp < 0) {
                      throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
                  } else {
                      encodeFlags(flagsScratch, wordForm);
                      int ord = flagLookup.add(flagsScratch.get());
                      if (ord < 0) {
                          // already exists in our hash
                          ord = (-ord) - 1;
                      }
                      // finalize current entry, and switch "current" if necessary
                      if (cmp > 0 && currentEntry != null) {
                          Util.toUTF32(currentEntry, scratchInts);
                          words.add(scratchInts.get(), currentOrds.get());
                      }
                      // swap current
                      if (cmp > 0 || currentEntry == null) {
                          currentEntry = entry;
                          currentOrds = new IntsRefBuilder(); // must be this way
                      }
                      if (hasStemExceptions) {
                          currentOrds.append(ord);
                          currentOrds.append(stemExceptionID);
                      } else {
                          currentOrds.append(ord);
                      }
                  }
              }

              // finalize last entry
              Util.toUTF32(currentEntry, scratchInts);
              words.add(scratchInts.get(), currentOrds.get());
              success2 = true;
          } finally {
              if (success2) {
                  tempDir.deleteFile(sorted);
              } else {
                  IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
              }
          }
      }

      static char[] decodeFlags(BytesRef b) {
          if (b.length == 0) {
              return CharsRef.EMPTY_CHARS;
          }
          int len = b.length >>> 1;
          char flags[] = new char[len];
          int upto = 0;
          int end = b.offset + b.length;
          for (int i = b.offset; i < end; i += 2) {
              flags[upto++] = (char) ((b.bytes[i] << 8) | (b.bytes[i + 1] & 0xff));
          }
          return flags;
      }

      static void encodeFlags(BytesRefBuilder b, char flags[]) {
          int len = flags.length << 1;
          b.grow(len);
          b.clear();
          for (int i = 0; i < flags.length; i++) {
              int flag = flags[i];
              b.append((byte) ((flag >> 8) & 0xff));
              b.append((byte) (flag & 0xff));
          }
      }

      private void parseAlias(String line) {
          String ruleArgs[] = line.split("\\s+");
          if (aliases == null) {
              //first line should be the aliases count
              final int count = Integer.parseInt(ruleArgs[1]);
              aliases = new String[count];
          } else {
              // an alias can map to no flags
              String aliasValue = ruleArgs.length == 1 ? "" : ruleArgs[1];
              aliases[aliasCount++] = aliasValue;
          }
      }

      private String getAliasValue(int id) {
          try {
              return aliases[id - 1];
          } catch (IndexOutOfBoundsException ex) {
              throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
          }
      }

      String getStemException(int id) {
          return stemExceptions[id - 1];
      }

      private void parseMorphAlias(String line) {
          if (morphAliases == null) {
              //first line should be the aliases count
              final int count = Integer.parseInt(line.substring(3));
              morphAliases = new String[count];
          } else {
              String arg = line.substring(2); // leave the space
              morphAliases[morphAliasCount++] = arg;
          }
      }

      private String parseStemException(String morphData) {
          // first see if it's an alias
          if (morphAliasCount > 0) {
              try {
                  int alias = Integer.parseInt(morphData.trim());
                  morphData = morphAliases[alias - 1];
              } catch (NumberFormatException e) {
                  // fine
              }
          }
          // try to parse morph entry
          int index = morphData.indexOf(" st:");
          if (index < 0) {
              index = morphData.indexOf("\tst:");
          }
          if (index >= 0) {
              int endIndex = indexOfSpaceOrTab(morphData, index + 1);
              if (endIndex < 0) {
                  endIndex = morphData.length();
              }
              return morphData.substring(index + 4, endIndex);
          }
          return null;
      }

      /**
       * Abstraction of the process of parsing flags taken from the affix and dic files
       */
      static abstract class FlagParsingStrategy {

          /**
           * Parses the given String into a single flag
           *
           * @param rawFlag String to parse into a flag
           * @return Parsed flag
           */
          char parseFlag(String rawFlag) {
              char flags[] = parseFlags(rawFlag);
              if (flags.length != 1) {
                  throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
              }
              return flags[0];
          }

          /**
           * Parses the given String into multiple flags
           *
           * @param rawFlags String to parse into flags
           * @return Parsed flags
           */
          abstract char[] parseFlags(String rawFlags);
      }

      /**
       * Simple implementation of {@link FlagParsingStrategy} that treats each char in the String as an individual flag.
       * Can be used with both the ASCII and UTF-8 flag types.
       */
      private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
          @Override
          public char[] parseFlags(String rawFlags) {
              return rawFlags.toCharArray();
          }
      }

      /**
       * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form.  In the case
       * of multiple flags, each number is separated by a comma.
       */
      private static class NumFlagParsingStrategy extends FlagParsingStrategy {
          @Override
          public char[] parseFlags(String rawFlags) {
              String[] rawFlagParts = rawFlags.trim().split(",");
              char[] flags = new char[rawFlagParts.length];
              int upto = 0;

              for (int i = 0; i < rawFlagParts.length; i++) {
                  // note, removing the trailing X/leading I for nepali... what is the rule here?! 
                  String replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
                  // note, ignoring empty flags (this happens in danish, for example)
                  if (replacement.isEmpty()) {
                      continue;
                  }
                  flags[upto++] = (char) Integer.parseInt(replacement);
              }

              if (upto < flags.length) {
                  flags = ArrayUtil.copyOfSubArray(flags, 0, upto);
              }
              return flags;
          }
      }

      /**
       * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
       * must be combined into a single character.
       */
      private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

          @Override
          public char[] parseFlags(String rawFlags) {
              if (rawFlags.length() == 0) {
                  return new char[0];
              }

              StringBuilder builder = new StringBuilder();
              if (rawFlags.length() % 2 == 1) {
                  throw new IllegalArgumentException(
                          "Invalid flags (should be even number of characters): " + rawFlags);
              }
              for (int i = 0; i < rawFlags.length(); i += 2) {
                  char f1 = rawFlags.charAt(i);
                  char f2 = rawFlags.charAt(i + 1);
                  if (f1 >= 256 || f2 >= 256) {
                      throw new IllegalArgumentException(
                              "Invalid flags (LONG flags must be double ASCII): " + rawFlags);
                  }
                  char combined = (char) (f1 << 8 | f2);
                  builder.append(combined);
              }

              char flags[] = new char[builder.length()];
              builder.getChars(0, builder.length(), flags, 0);
              return flags;
          }
      }

      static boolean hasFlag(char flags[], char flag) {
          return Arrays.binarySearch(flags, flag) >= 0;
      }

      CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
          reuse.setLength(0);

          for (int i = 0; i < input.length(); i++) {
              char ch = input.charAt(i);

              if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
                  continue;
              }

              if (ignoreCase && iconv == null) {
                  // if we have no input conversion mappings, do this on-the-fly
                  ch = caseFold(ch);
              }

              reuse.append(ch);
          }

          if (iconv != null) {
              try {
                  applyMappings(iconv, reuse);
              } catch (IOException bogus) {
                  throw new RuntimeException(bogus);
              }
              if (ignoreCase) {
                  for (int i = 0; i < reuse.length(); i++) {
                      reuse.setCharAt(i, caseFold(reuse.charAt(i)));
                  }
              }
          }

          return reuse;
      }

      /** folds single character (according to LANG if present) */
      char caseFold(char c) {
          if (alternateCasing) {
              if (c == 'I') {
                  // Turkish/Azeri: capital I lowercases to dotless i (U+0131)
                  return '\u0131';
              } else if (c == '\u0130') {
                  // Turkish/Azeri: capital I with dot above (U+0130) lowercases to plain i
                  return 'i';
              } else {
                  return Character.toLowerCase(c);
              }
          } else {
              return Character.toLowerCase(c);
          }
      }

      // TODO: this could be more efficient!
      static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
          final FST.BytesReader bytesReader = fst.getBytesReader();
          final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
          final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();

          // temporary stuff
          final FST.Arc<CharsRef> arc = new FST.Arc<>();
          int longestMatch;
          CharsRef longestOutput;

          for (int i = 0; i < sb.length(); i++) {
              arc.copyFrom(firstArc);
              CharsRef output = NO_OUTPUT;
              longestMatch = -1;
              longestOutput = null;

              for (int j = i; j < sb.length(); j++) {
                  char ch = sb.charAt(j);
                  if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
                      break;
                  } else {
                      output = fst.outputs.add(output, arc.output);
                  }
                  if (arc.isFinal()) {
                      longestOutput = fst.outputs.add(output, arc.nextFinalOutput);
                      longestMatch = j;
                  }
              }

              if (longestMatch >= 0) {
                  sb.delete(i, longestMatch + 1);
                  sb.insert(i, longestOutput);
                  i += (longestOutput.length - 1);
              }
          }
      }

      /** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
      public boolean getIgnoreCase() {
          return ignoreCase;
      }

      private static Path DEFAULT_TEMP_DIR;

      /** Used by test framework */
      public static void setDefaultTempDir(Path tempDir) {
          DEFAULT_TEMP_DIR = tempDir;
      }

      /**
       * Returns the default temporary directory. By default, java.io.tmpdir. If not accessible
       * or not available, an IOException is thrown
       */
      synchronized static Path getDefaultTempDir() throws IOException {
          if (DEFAULT_TEMP_DIR == null) {
              // Lazy init
              String tempDirPath = System.getProperty("java.io.tmpdir");
              if (tempDirPath == null) {
                  throw new IOException("Java has no temporary folder property (java.io.tmpdir)?");
              }
              Path tempDirectory = Paths.get(tempDirPath);
              if (Files.isWritable(tempDirectory) == false) {
                  throw new IOException(
                          "Java's temporary folder not present or writeable?: " + tempDirectory.toAbsolutePath());
              }
              DEFAULT_TEMP_DIR = tempDirectory;
          }

          return DEFAULT_TEMP_DIR;
      }
  }
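
Example usage

A minimal sketch (not part of the listing above) showing how the public constructor is typically driven. The en_US.aff/en_US.dic file names, the temporary sort folder, and the "hunspell" temp-file prefix are illustrative assumptions; as the constructor javadoc notes, the caller keeps ownership of the streams.

  import java.io.InputStream;
  import java.nio.file.Files;
  import java.nio.file.Path;
  import java.nio.file.Paths;

  import org.apache.lucene.analysis.hunspell.Dictionary;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.FSDirectory;

  public class DictionaryUsageSketch {
      public static void main(String[] args) throws Exception {
          // Scratch space for the offline sorter that Dictionary uses while reading .dic entries.
          Path sortDir = Files.createTempDirectory("hunspell-sort");
          try (Directory tempDir = FSDirectory.open(sortDir);
                  InputStream affix = Files.newInputStream(Paths.get("en_US.aff"));
                  InputStream dic = Files.newInputStream(Paths.get("en_US.dic"))) {
              // Dictionary does not close the streams; the try-with-resources block above does.
              Dictionary dictionary = new Dictionary(tempDir, "hunspell", affix, dic);
              System.out.println("ignoreCase = " + dictionary.getIgnoreCase());
              // lookupWord/lookupPrefix/lookupSuffix are package-private, so applications normally
              // reach the dictionary indirectly through HunspellStemFilter rather than calling them.
          }
      }
  }

The five-argument constructor takes a List of dictionary InputStreams plus an ignoreCase flag; that is the overload to use when several .dic files should be merged into one Dictionary.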