Example usage for org.apache.lucene.util.fst Builder finish

List of usage examples for org.apache.lucene.util.fst Builder finish

Introduction

On this page you can find example usages of org.apache.lucene.util.fst Builder finish.

Prototype

public FST<T> finish() throws IOException 

Source Link

Document

Returns final FST.

Usage

From source file:BuildFST.java

License:Apache License

/**
 * Builds an FST from command-line arguments of the form "input" or "input/output"
 * and prints it in GraphViz dot format to stdout.
 *
 * Outputs are numeric (PositiveIntOutputs) when every "/output" suffix parses as
 * a non-negative long; otherwise byte-sequence outputs are used.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {

    // Pass 1: decide the output type. Numeric only if every "/suffix" parses as a long.
    boolean numeric = true;
    boolean negative = false;
    for (int i = 0; i < args.length; i++) {
        int j = args[i].lastIndexOf('/');
        if (j != -1) {
            try {
                negative |= Long.parseLong(args[i].substring(j + 1)) < 0;
            } catch (NumberFormatException nfe) {
                numeric = false;
                break;
            }
        }
    }

    Outputs outputs;
    if (numeric) {
        if (negative) {
            // PositiveIntOutputs only supports outputs >= 0.
            throw new RuntimeException("can only handle numeric outputs >= 0");
        }
        outputs = PositiveIntOutputs.getSingleton();
    } else {
        outputs = ByteSequenceOutputs.getSingleton();
    }

    // Pass 2: parse each arg into an (input, output) pair.
    Pair<?>[] inputs = new Pair[args.length];
    for (int i = 0; i < args.length; i++) {
        int j = args[i].lastIndexOf('/');
        String input;
        Object output;
        if (j == -1) {
            output = outputs.getNoOutput();
            input = args[i];
        } else {
            input = args[i].substring(0, j);
            String outputString = args[i].substring(j + 1);
            if (numeric) {
                output = Long.parseLong(outputString);
            } else {
                output = new BytesRef(outputString);
            }
        }
        inputs[i] = new Pair(new BytesRef(input), output);
    }
    // Builder.add requires inputs in sorted order.
    Arrays.sort(inputs);

    FST<?> fst;
    // Reuse a single scratch builder instead of allocating one per input.
    IntsRefBuilder intsBuilder = new IntsRefBuilder();
    if (numeric) {
        Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            Util.toIntsRef(pair.input, intsBuilder);
            b.add(intsBuilder.get(), (Long) pair.output);
        }
        fst = b.finish();
    } else {
        Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            Util.toIntsRef(pair.input, intsBuilder);
            b.add(intsBuilder.get(), (BytesRef) pair.output);
        }
        fst = b.finish();
    }
    Util.toDot(fst, new PrintWriter(System.out), true, true);
}

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.UserDictionary.java

License:Apache License

/**
 * Builds the user-dictionary FST and lookup tables from parsed feature entries.
 * Each entry is [token, segmentation, readings, pos]; entries are sorted by
 * token first because Builder.add requires inputs in sorted order.
 */
private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(featureEntries, new Comparator<String[]>() {
        @Override
        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;

    for (String[] values : featureEntries) {
        // Collapse runs of spaces so segmentation/readings split cleanly.
        String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
        String[] readings = values[2].replaceAll("  *", " ").split(" ");
        String pos = values[3];

        if (segmentation.length != readings.length) {
            // Fixed message word order: "does not the match number" -> "does not match the number".
            throw new RuntimeException("Illegal user dictionary entry " + values[0]
                    + " - the number of segmentations (" + segmentation.length + ")"
                    + " does not match the number of readings (" + readings.length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.length; i++) {
            wordIdAndLength[i + 1] = segmentation[i].length();
            data.add(readings[i] + INTERNAL_SEPARATOR + pos);
            wordId++;
        }
        // add mapping to FST: token chars (as BYTE2 ints) -> ordinal
        String token = values[0];
        scratch.grow(token.length());
        scratch.setLength(token.length());
        for (int i = 0; i < token.length(); i++) {
            scratch.setIntAt(i, (int) token.charAt(i));
        }
        fstBuilder.add(scratch.get(), ord);
        segmentations.add(wordIdAndLength);
        ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * Rebuilds the weight FST from the (term -> weight) map. SortedMap iteration
 * order satisfies Builder's sorted-input requirement; weights are truncated
 * to their long value.
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
    final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    final BytesRefBuilder termBytes = new BytesRefBuilder();
    final IntsRefBuilder termInts = new IntsRefBuilder();
    for (Map.Entry<String, Double> weight : weights.entrySet()) {
        termBytes.copyChars(weight.getKey());
        builder.add(Util.toIntsRef(termBytes.get(), termInts), weight.getValue().longValue());
    }
    fst = builder.finish();
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
          throws IOException, ParseException {
      this.ignoreCase = ignoreCase;
      this.needsInputCleaning = ignoreCase;
      this.needsOutputCleaning = false; // set if we have an OCONV
      flagLookup.add(new BytesRef()); // no flags -> ord 0

      // The affix stream is spooled to a temp file so it can be read twice:
      // pass 1 detects the encoding, pass 2 parses with that encoding.
      File aff = File.createTempFile("affix", "aff", tempDir);
      OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
      InputStream aff1 = null;
      InputStream aff2 = null;
      try {
          // copy contents of affix stream to temp file
          final byte[] buffer = new byte[1024 * 8];
          int len;
          while ((len = affix.read(buffer)) > 0) {
              out.write(buffer, 0, len);
          }
          out.close();

          // pass 1: get encoding
          aff1 = new BufferedInputStream(new FileInputStream(aff));
          String encoding = getDictionaryEncoding(aff1);

          // pass 2: parse affixes
          CharsetDecoder decoder = getJavaEncoding(encoding);
          aff2 = new BufferedInputStream(new FileInputStream(aff));
          readAffixFile(aff2, decoder);

          // read dictionary entries into a BYTE4 (UTF-32) FST of int-sequence outputs
          IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
          Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
          readDictionaryFiles(dictionaries, decoder, b);
          words = b.finish();
          aliases = null; // no longer needed
          morphAliases = null; // no longer needed
      } finally {
          IOUtils.closeWhileHandlingException(out, aff1, aff2);
          // NOTE(review): delete() result is ignored; a failed delete silently leaves the temp file behind.
          aff.delete();
      }
  }

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
 * Builds an FST mapping each affix string (encoded as UTF-32 ints) to the
 * list of affix ids stored under it. TreeMap iteration supplies keys in
 * sorted order, as required by Builder.add.
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
    Builder<IntsRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
    IntsRefBuilder keyScratch = new IntsRefBuilder();
    for (Map.Entry<String, List<Integer>> affix : affixes.entrySet()) {
        Util.toUTF32(affix.getKey(), keyScratch);
        List<Integer> ids = affix.getValue();
        IntsRef idsRef = new IntsRef(ids.size());
        for (int id : ids) {
            idsRef.ints[idsRef.length++] = id;
        }
        fstBuilder.add(keyScratch.get(), idsRef);
    }
    return fstBuilder.finish();
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
 * Parses {@code num} conversion lines ("KEY from to") into a char-sequence FST.
 *
 * @param reader affix file reader, positioned at the first conversion line
 * @param num number of conversion entries declared in the section header
 * @return FST mapping each input pattern to its replacement
 * @throws ParseException if the file ends early or a line is malformed
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
    Map<String, String> mappings = new TreeMap<>();

    for (int i = 0; i < num; i++) {
        String line = reader.readLine();
        if (line == null) {
            // Header promised more entries than the file contains; fail with a
            // ParseException instead of an NPE from split() below.
            throw new ParseException("unexpected end of file: expected " + num + " conversions, got " + i,
                    reader.getLineNumber());
        }
        String parts[] = line.split("\\s+");
        if (parts.length != 3) {
            throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
        }
        if (mappings.put(parts[1], parts[2]) != null) {
            throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
        }
    }

    // Builder requires sorted input; the TreeMap iterates keys in sorted order.
    Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (Map.Entry<String, String> entry : mappings.entrySet()) {
        Util.toUTF16(entry.getKey(), scratchInts);
        builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
    }

    return builder.finish();
}

From source file:examples.fst.FstTest.java

/**
 * Minimal FST demo: maps three pre-sorted terms to long outputs, then looks
 * up a value by key (Util.get) and a key by value (Util.getByOutput).
 */
public static void main(String[] args) throws IOException {
    // Keys must be provided to Builder in Unicode sorted order.
    final String[] terms = { "cat", "dog", "dogs" };
    final long[] weights = { 5, 7, 12 };

    Builder<Long> fstBuilder = new Builder<Long>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    BytesRefBuilder termScratch = new BytesRefBuilder();
    IntsRefBuilder intsScratch = new IntsRefBuilder();
    for (int i = 0; i < terms.length; i++) {
        termScratch.copyChars(terms[i]);
        fstBuilder.add(Util.toIntsRef(termScratch.toBytesRef(), intsScratch), weights[i]);
    }
    FST<Long> fst = fstBuilder.finish();

    // Forward lookup: key -> output.
    Long value = Util.get(fst, new BytesRef("dog"));
    System.out.println(value); // 7

    // Reverse lookup: only works because outputs are also in sorted order.
    IntsRef key = Util.getByOutput(fst, 12);
    System.out.println(Util.toBytesRef(key, termScratch).utf8ToString()); // dogs

}

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
          throws IOException, ParseException {
      this.ignoreCase = ignoreCase;
      this.needsInputCleaning = ignoreCase;
      this.needsOutputCleaning = false; // set if we have an OCONV
      flagLookup.add(new BytesRef()); // no flags -> ord 0

      // The affix stream is spooled to a temp file so it can be read twice:
      // pass 1 detects the encoding, pass 2 parses with that encoding.
      Path aff = Files.createTempFile(tempDir, "affix", "aff");
      OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
      InputStream aff1 = null;
      InputStream aff2 = null;
      boolean success = false;
      try {
          // copy contents of affix stream to temp file
          final byte[] buffer = new byte[1024 * 8];
          int len;
          while ((len = affix.read(buffer)) > 0) {
              out.write(buffer, 0, len);
          }
          out.close();

          // pass 1: get encoding
          aff1 = new BufferedInputStream(Files.newInputStream(aff));
          String encoding = getDictionaryEncoding(aff1);

          // pass 2: parse affixes
          CharsetDecoder decoder = getJavaEncoding(encoding);
          aff2 = new BufferedInputStream(Files.newInputStream(aff));
          readAffixFile(aff2, decoder);

          // read dictionary entries into a BYTE4 (UTF-32) FST of int-sequence outputs
          IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
          Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
          readDictionaryFiles(dictionaries, decoder, b);
          words = b.finish();
          aliases = null; // no longer needed
          morphAliases = null; // no longer needed
          success = true;
      } finally {
          IOUtils.closeWhileHandlingException(out, aff1, aff2);
          // On success a failed delete should surface; on failure just try our best.
          if (success) {
              Files.delete(aff);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(aff);
          }
      }
  }

From source file:org.elasticsearch.index.fielddata.plain.FSTBytesIndexFieldData.java

License:Apache License

/**
 * Loads field data for one segment: builds an FST over all terms of the
 * field (term bytes -> ordinal) plus a per-document ordinals mapping.
 */
@Override
public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
    AtomicReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    FSTBytesAtomicFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker());
    if (terms == null) {
        // Field has no terms in this segment: return an empty holder but still
        // report its memory footprint to the circuit breaker.
        data = FSTBytesAtomicFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.getMemorySizeInBytes());
        return data;
    }
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    org.apache.lucene.util.fst.Builder<Long> fstBuilder = new org.apache.lucene.util.fst.Builder<Long>(
            INPUT_TYPE.BYTE1, outputs);
    final IntsRef scratch = new IntsRef();

    // Only an unfiltered enumeration knows its term count up front; -1 means unknown.
    final long numTerms;
    if (regex == null && frequency == null) {
        numTerms = terms.size();
    } else {
        numTerms = -1;
    }
    final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat(
            "acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
    OrdinalsBuilder builder = new OrdinalsBuilder(numTerms, reader.maxDoc(), acceptableTransientOverheadRatio);
    boolean success = false;
    try {

        // we don't store an ord 0 in the FST since we could have an empty string in there and FST don't support
        // empty strings twice. ie. them merge fails for long output.
        TermsEnum termsEnum = filter(terms, reader);
        DocsEnum docsEnum = null;
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            final long termOrd = builder.nextOrdinal();
            assert termOrd > 0;
            fstBuilder.add(Util.toIntsRef(term, scratch), (long) termOrd);
            // Record every document containing this term under its ordinal.
            docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
            for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                builder.addDoc(docId);
            }
        }

        FST<Long> fst = fstBuilder.finish();

        final Ordinals ordinals = builder.build(fieldDataType.getSettings());

        data = new FSTBytesAtomicFieldData(fst, ordinals);
        success = true;
        return data;
    } finally {
        // Memory is only reported on success; 'data' is still null on failure.
        if (success) {
            estimator.afterLoad(null, data.getMemorySizeInBytes());
        }
        builder.close();
    }
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
 * and dictionary files.
 * You have to close the provided InputStreams yourself.
 *
 * @param affix InputStream for reading the hunspell affix file (won't be closed).
 * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
        throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.add(new BytesRef()); // no flags -> ord 0

    // The affix stream is spooled to a temp file so it can be read twice:
    // pass 1 detects the encoding, pass 2 parses with that encoding.
    File aff = File.createTempFile("affix", "aff", tempDir);
    OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
    InputStream aff1 = null;
    InputStream aff2 = null;
    try {
        // copy contents of affix stream to temp file
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = affix.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
        out.close();

        // pass 1: get encoding
        aff1 = new BufferedInputStream(new FileInputStream(aff));
        String encoding = getDictionaryEncoding(aff1);

        // pass 2: parse affixes
        CharsetDecoder decoder = getJavaEncoding(encoding);
        aff2 = new BufferedInputStream(new FileInputStream(aff));
        readAffixFile(aff2, decoder);

        // read dictionary entries into a BYTE4 (UTF-32) FST of int-sequence outputs
        IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
        Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
        readDictionaryFiles(dictionaries, decoder, b);
        words = b.finish();
        aliases = null; // no longer needed
    } finally {
        IOUtils.closeWhileHandlingException(out, aff1, aff2);
        // NOTE(review): delete() result is ignored; a failed delete silently leaves the temp file behind.
        aff.delete();
    }
}