Example usage for org.apache.lucene.util.fst Builder add

List of usage examples for org.apache.lucene.util.fst Builder add

Introduction

In this page you can find the example usage for org.apache.lucene.util.fst Builder add.

Prototype

public void add(IntsRef input, T output) throws IOException 

Source Link

Document

Add the next input/output pair.

Usage

From source file:BuildFST.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {

    boolean numeric = true;
    boolean negative = false;
    for (int i = 0; i < args.length; i++) {
        int j = args[i].lastIndexOf('/');
        if (j != -1) {
            try {
                negative |= Long.parseLong(args[i].substring(j + 1)) < 0;
            } catch (NumberFormatException nfe) {
                numeric = false;// www  . ja va 2s  .c o m
                break;
            }
        }
    }

    Outputs outputs;
    if (numeric) {
        if (negative) {
            throw new RuntimeException("can only handle numeric outputs >= 0");
        }
        outputs = PositiveIntOutputs.getSingleton();
    } else {
        outputs = ByteSequenceOutputs.getSingleton();
    }

    Pair<?>[] inputs = new Pair[args.length];
    for (int i = 0; i < args.length; i++) {
        int j = args[i].lastIndexOf('/');
        String input;
        Object output;
        if (j == -1) {
            output = outputs.getNoOutput();
            input = args[i];
        } else {
            input = args[i].substring(0, j);
            String outputString = args[i].substring(j + 1);
            if (numeric) {
                output = Long.parseLong(outputString);
            } else {
                output = new BytesRef(outputString);
            }
        }
        inputs[i] = new Pair(new BytesRef(input), output);
    }
    Arrays.sort(inputs);

    FST<?> fst;
    if (numeric) {
        Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            IntsRefBuilder intsBuilder = new IntsRefBuilder();
            Util.toIntsRef(pair.input, intsBuilder);
            b.add(intsBuilder.get(), (Long) pair.output);
        }
        fst = b.finish();
    } else {
        Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            IntsRefBuilder intsBuilder = new IntsRefBuilder();
            Util.toIntsRef(pair.input, intsBuilder);
            b.add(intsBuilder.get(), (BytesRef) pair.output);
        }
        fst = b.finish();
    }
    Util.toDot(fst, new PrintWriter(System.out), true, true);
}

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.UserDictionary.java

License:Apache License

private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(featureEntries, new Comparator<String[]>() {
        @Override/*from   w  ww  . j  av a  2s. c o m*/
        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;

    for (String[] values : featureEntries) {
        String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
        String[] readings = values[2].replaceAll("  *", " ").split(" ");
        String pos = values[3];

        if (segmentation.length != readings.length) {
            throw new RuntimeException("Illegal user dictionary entry " + values[0]
                    + " - the number of segmentations (" + segmentation.length + ")"
                    + " does not the match number of readings (" + readings.length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.length; i++) {
            wordIdAndLength[i + 1] = segmentation[i].length();
            data.add(readings[i] + INTERNAL_SEPARATOR + pos);
            wordId++;
        }
        // add mapping to FST
        String token = values[0];
        scratch.grow(token.length());
        scratch.setLength(token.length());
        for (int i = 0; i < token.length(); i++) {
            scratch.setIntAt(i, (int) token.charAt(i));
        }
        fstBuilder.add(scratch.get(), ord);
        segmentations.add(wordIdAndLength);
        ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

private void updateFST(SortedMap<String, Double> weights) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    BytesRefBuilder scratchBytes = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry<String, Double> entry : weights.entrySet()) {
        scratchBytes.copyChars(entry.getKey());
        fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry.getValue().longValue());
    }/*www.ja  v a2  s. c o m*/
    fst = fstBuilder.finish();
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
      IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
      Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
      IntsRefBuilder scratch = new IntsRefBuilder();
      for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
          Util.toUTF32(entry.getKey(), scratch);
          List<Integer> entries = entry.getValue();
          IntsRef output = new IntsRef(entries.size());
          for (Integer c : entries) {
              output.ints[output.length++] = c;
          }/*from   www. j av a2s  . c o  m*/
          builder.add(scratch.get(), output);
      }
      return builder.finish();
  }

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
      Map<String, String> mappings = new TreeMap<>();

      for (int i = 0; i < num; i++) {
          String line = reader.readLine();
          String parts[] = line.split("\\s+");
          if (parts.length != 3) {
              throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
          }/* w ww.  ja  v a  2s. c o m*/
          if (mappings.put(parts[1], parts[2]) != null) {
              throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
          }
      }

      Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
      Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
      IntsRefBuilder scratchInts = new IntsRefBuilder();
      for (Map.Entry<String, String> entry : mappings.entrySet()) {
          Util.toUTF16(entry.getKey(), scratchInts);
          builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
      }

      return builder.finish();
  }

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *//  www.ja  v  a2s  .c  om
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      File unsorted = File.createTempFile("unsorted", "dat", tempDir);
      ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
      boolean success = false;
      try {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
          success = true;
      } finally {
          if (success) {
              IOUtils.close(writer);
          } else {
              IOUtils.closeWhileHandlingException(writer);
          }
      }
      File sorted = File.createTempFile("sorted", "dat", tempDir);

      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      sorter.sort(unsorted, sorted);
      unsorted.delete();

      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      BytesRefBuilder scratchLine = new BytesRefBuilder();

      // TODO: the flags themselves can be double-chars (long) or also numeric
      // either way the trick is to encode them as char... but they must be parsed differently

      String currentEntry = null;
      IntsRefBuilder currentOrds = new IntsRefBuilder();

      String line;
      while (reader.read(scratchLine)) {
          line = scratchLine.get().utf8ToString();
          String entry;
          char wordForm[];
          int end;

          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
              wordForm = NOFLAGS;
              end = line.indexOf(MORPH_SEPARATOR);
              entry = line.substring(0, end);
          } else {
              end = line.indexOf(MORPH_SEPARATOR);
              String flagPart = line.substring(flagSep + 1, end);
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              wordForm = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(wordForm);
              entry = line.substring(0, flagSep);
          }
          // we possibly have morphological data
          int stemExceptionID = 0;
          if (hasStemExceptions && end + 1 < line.length()) {
              String stemException = parseStemException(line.substring(end + 1));
              if (stemException != null) {
                  if (stemExceptionCount == stemExceptions.length) {
                      int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                              RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                      stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                  }
                  stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                  stemExceptions[stemExceptionCount++] = stemException;
              }
          }

          int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
          if (cmp < 0) {
              throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
          } else {
              encodeFlags(flagsScratch, wordForm);
              int ord = flagLookup.add(flagsScratch.get());
              if (ord < 0) {
                  // already exists in our hash
                  ord = (-ord) - 1;
              }
              // finalize current entry, and switch "current" if necessary
              if (cmp > 0 && currentEntry != null) {
                  Util.toUTF32(currentEntry, scratchInts);
                  words.add(scratchInts.get(), currentOrds.get());
              }
              // swap current
              if (cmp > 0 || currentEntry == null) {
                  currentEntry = entry;
                  currentOrds = new IntsRefBuilder(); // must be this way
              }
              if (hasStemExceptions) {
                  currentOrds.append(ord);
                  currentOrds.append(stemExceptionID);
              } else {
                  currentOrds.append(ord);
              }
          }
      }

      // finalize last entry
      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), currentOrds.get());

      reader.close();
      sorted.delete();
  }

From source file:examples.fst.FstTest.java

public static void main(String[] args) throws IOException {
    // Input values (keys). These must be provided to Builder in Unicode sorted order!
    String inputValues[] = { "cat", "dog", "dogs" };
    long outputValues[] = { 5, 7, 12 };

    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs);
    BytesRefBuilder scratchBytes = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (int i = 0; i < inputValues.length; i++) {
        scratchBytes.copyChars(inputValues[i]);
        builder.add(Util.toIntsRef(scratchBytes.toBytesRef(), scratchInts), outputValues[i]);
    }//from w w  w  .  ja v a2 s.  co m
    FST<Long> fst = builder.finish();

    Long value = Util.get(fst, new BytesRef("dog"));
    System.out.println(value); // 7

    // Only works because outputs are also in sorted order
    IntsRef key = Util.getByOutput(fst, 12);
    System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs

}

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   */*from w ww . j  a va 2  s.  c  o  m*/
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat");
      try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
      }
      Path sorted = Files.createTempFile(tempDir, "sorted", "dat");

      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      boolean success = false;
      try {
          sorter.sort(unsorted, sorted);
          success = true;
      } finally {
          if (success) {
              Files.delete(unsorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(unsorted);
          }
      }

      boolean success2 = false;
      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      try {
          BytesRefBuilder scratchLine = new BytesRefBuilder();

          // TODO: the flags themselves can be double-chars (long) or also numeric
          // either way the trick is to encode them as char... but they must be parsed differently

          String currentEntry = null;
          IntsRefBuilder currentOrds = new IntsRefBuilder();

          String line;
          while (reader.read(scratchLine)) {
              line = scratchLine.get().utf8ToString();
              String entry;
              char wordForm[];
              int end;

              int flagSep = line.indexOf(FLAG_SEPARATOR);
              if (flagSep == -1) {
                  wordForm = NOFLAGS;
                  end = line.indexOf(MORPH_SEPARATOR);
                  entry = line.substring(0, end);
              } else {
                  end = line.indexOf(MORPH_SEPARATOR);
                  String flagPart = line.substring(flagSep + 1, end);
                  if (aliasCount > 0) {
                      flagPart = getAliasValue(Integer.parseInt(flagPart));
                  }

                  wordForm = flagParsingStrategy.parseFlags(flagPart);
                  Arrays.sort(wordForm);
                  entry = line.substring(0, flagSep);
              }
              // we possibly have morphological data
              int stemExceptionID = 0;
              if (hasStemExceptions && end + 1 < line.length()) {
                  String stemException = parseStemException(line.substring(end + 1));
                  if (stemException != null) {
                      if (stemExceptionCount == stemExceptions.length) {
                          int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                                  RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                          stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                      }
                      stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                      stemExceptions[stemExceptionCount++] = stemException;
                  }
              }

              int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
              if (cmp < 0) {
                  throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
              } else {
                  encodeFlags(flagsScratch, wordForm);
                  int ord = flagLookup.add(flagsScratch.get());
                  if (ord < 0) {
                      // already exists in our hash
                      ord = (-ord) - 1;
                  }
                  // finalize current entry, and switch "current" if necessary
                  if (cmp > 0 && currentEntry != null) {
                      Util.toUTF32(currentEntry, scratchInts);
                      words.add(scratchInts.get(), currentOrds.get());
                  }
                  // swap current
                  if (cmp > 0 || currentEntry == null) {
                      currentEntry = entry;
                      currentOrds = new IntsRefBuilder(); // must be this way
                  }
                  if (hasStemExceptions) {
                      currentOrds.append(ord);
                      currentOrds.append(stemExceptionID);
                  } else {
                      currentOrds.append(ord);
                  }
              }
          }

          // finalize last entry
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
          success2 = true;
      } finally {
          IOUtils.closeWhileHandlingException(reader);
          if (success2) {
              Files.delete(sorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(sorted);
          }
      }
  }

From source file:org.elasticsearch.index.fielddata.plain.FSTBytesIndexFieldData.java

License:Apache License

@Override
public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
    AtomicReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    FSTBytesAtomicFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker());
    if (terms == null) {
        data = FSTBytesAtomicFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.getMemorySizeInBytes());
        return data;
    }/*  w ww. j a v a  2 s  .  co  m*/
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    org.apache.lucene.util.fst.Builder<Long> fstBuilder = new org.apache.lucene.util.fst.Builder<Long>(
            INPUT_TYPE.BYTE1, outputs);
    final IntsRef scratch = new IntsRef();

    final long numTerms;
    if (regex == null && frequency == null) {
        numTerms = terms.size();
    } else {
        numTerms = -1;
    }
    final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat(
            "acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
    OrdinalsBuilder builder = new OrdinalsBuilder(numTerms, reader.maxDoc(), acceptableTransientOverheadRatio);
    boolean success = false;
    try {

        // we don't store an ord 0 in the FST since we could have an empty string in there and FST don't support
        // empty strings twice. ie. them merge fails for long output.
        TermsEnum termsEnum = filter(terms, reader);
        DocsEnum docsEnum = null;
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            final long termOrd = builder.nextOrdinal();
            assert termOrd > 0;
            fstBuilder.add(Util.toIntsRef(term, scratch), (long) termOrd);
            docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
            for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                builder.addDoc(docId);
            }
        }

        FST<Long> fst = fstBuilder.finish();

        final Ordinals ordinals = builder.build(fieldDataType.getSettings());

        data = new FSTBytesAtomicFieldData(fst, ordinals);
        success = true;
        return data;
    } finally {
        if (success) {
            estimator.afterLoad(null, data.getMemorySizeInBytes());
        }
        builder.close();
    }
}

From source file:stemmer.Dictionary.java

License:Apache License

private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException {
    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);

    IntsRef scratch = new IntsRef();
    for (Map.Entry<String, List<Character>> entry : affixes.entrySet()) {
        Util.toUTF32(entry.getKey(), scratch);
        List<Character> entries = entry.getValue();
        IntsRef output = new IntsRef(entries.size());
        for (Character c : entries) {
            output.ints[output.length++] = c;
        }/*from   w  ww .jav  a  2s  .  co m*/
        builder.add(scratch, output);
    }
    return builder.finish();
}