Example usage for org.apache.lucene.util.fst Builder Builder

List of usage examples for org.apache.lucene.util.fst Builder Builder

Introduction

On this page you can find example usage for org.apache.lucene.util.fst Builder Builder.

Prototype

public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) 

Source Link

Document

Instantiates an FST/FSA builder without any pruning.

Usage

From source file:BuildFST.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {

    // First pass: decide whether every "input/output" argument carries a
    // non-negative long output (numeric FST) or whether byte-sequence
    // outputs must be used instead.
    boolean numeric = true;
    boolean negative = false;
    for (String arg : args) {
        int slash = arg.lastIndexOf('/');
        if (slash != -1) {
            try {
                negative |= Long.parseLong(arg.substring(slash + 1)) < 0;
            } catch (NumberFormatException nfe) {
                numeric = false;
                break;
            }
        }
    }

    Outputs outputs;
    if (numeric) {
        if (negative) {
            throw new RuntimeException("can only handle numeric outputs >= 0");
        }
        outputs = PositiveIntOutputs.getSingleton();
    } else {
        outputs = ByteSequenceOutputs.getSingleton();
    }

    // Second pass: parse each argument into a (input, output) Pair; an
    // argument without a '/' gets the no-output sentinel.
    Pair<?>[] inputs = new Pair[args.length];
    for (int idx = 0; idx < args.length; idx++) {
        String arg = args[idx];
        int slash = arg.lastIndexOf('/');
        String input;
        Object output;
        if (slash == -1) {
            output = outputs.getNoOutput();
            input = arg;
        } else {
            input = arg.substring(0, slash);
            String outputString = arg.substring(slash + 1);
            if (numeric) {
                output = Long.parseLong(outputString);
            } else {
                output = new BytesRef(outputString);
            }
        }
        inputs[idx] = new Pair(new BytesRef(input), output);
    }
    // Builder requires its inputs to be added in sorted order.
    Arrays.sort(inputs);

    FST<?> fst;
    if (numeric) {
        Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            IntsRefBuilder ints = new IntsRefBuilder();
            Util.toIntsRef(pair.input, ints);
            builder.add(ints.get(), (Long) pair.output);
        }
        fst = builder.finish();
    } else {
        Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            IntsRefBuilder ints = new IntsRefBuilder();
            Util.toIntsRef(pair.input, ints);
            builder.add(ints.get(), (BytesRef) pair.output);
        }
        fst = builder.finish();
    }
    // Emit the resulting automaton as GraphViz "dot" on stdout.
    Util.toDot(fst, new PrintWriter(System.out), true, true);
}

From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.UserDictionary.java

License:Apache License

/**
 * Builds the user-dictionary lookup structures: a char-keyed FST mapping each
 * surface form to an ordinal, plus parallel arrays of per-word feature data
 * and segmentation lengths.
 *
 * @param featureEntries parsed rows: [0]=surface form, [1]=segmentation,
 *                       [2]=readings, [3]=part-of-speech
 * @throws IOException if building the FST fails
 */
private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    // FST inputs must be added in sorted order, so sort by surface form first.
    Collections.sort(featureEntries, new Comparator<String[]>() {
        @Override
        public int compare(String[] left, String[] right) {
            return left[0].compareTo(right[0]);
        }
    });

    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;

    for (String[] values : featureEntries) {
        // Collapse runs of spaces before splitting segmentation/readings.
        String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
        String[] readings = values[2].replaceAll("  *", " ").split(" ");
        String pos = values[3];

        if (segmentation.length != readings.length) {
            // fixed message wording: was "does not the match number of readings"
            throw new RuntimeException("Illegal user dictionary entry " + values[0]
                    + " - the number of segmentations (" + segmentation.length + ")"
                    + " does not match the number of readings (" + readings.length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.length; i++) {
            wordIdAndLength[i + 1] = segmentation[i].length();
            data.add(readings[i] + INTERNAL_SEPARATOR + pos);
            wordId++;
        }
        // add mapping to FST: each UTF-16 char of the surface form is one input label
        String token = values[0];
        scratch.grow(token.length());
        scratch.setLength(token.length());
        for (int i = 0; i < token.length(); i++) {
            scratch.setIntAt(i, (int) token.charAt(i));
        }
        fstBuilder.add(scratch.get(), ord);
        segmentations.add(wordIdAndLength);
        ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * Rebuilds the per-term weight FST from the given map. SortedMap iteration
 * yields keys in sorted order, which the FST Builder requires.
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
    BytesRefBuilder bytes = new BytesRefBuilder();
    IntsRefBuilder ints = new IntsRefBuilder();
    for (Map.Entry<String, Double> weight : weights.entrySet()) {
        bytes.copyChars(weight.getKey());
        // Weights are truncated to their integral part when stored.
        builder.add(Util.toIntsRef(bytes.get(), ints), weight.getValue().longValue());
    }
    fst = builder.finish();
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @param ignoreCase if true, input is cleaned (lower-cased) before lookup
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
          throws IOException, ParseException {
      this.ignoreCase = ignoreCase;
      this.needsInputCleaning = ignoreCase;
      this.needsOutputCleaning = false; // set if we have an OCONV
      flagLookup.add(new BytesRef()); // no flags -> ord 0

      // The affix stream must be read twice (encoding pass, then parse pass),
      // so spool it into a temp file first.
      File aff = File.createTempFile("affix", "aff", tempDir);
      OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
      InputStream aff1 = null;
      InputStream aff2 = null;
      try {
          // copy contents of affix stream to temp file
          final byte[] buffer = new byte[1024 * 8];
          int len;
          while ((len = affix.read(buffer)) > 0) {
              out.write(buffer, 0, len);
          }
          out.close();

          // pass 1: get encoding
          aff1 = new BufferedInputStream(new FileInputStream(aff));
          String encoding = getDictionaryEncoding(aff1);

          // pass 2: parse affixes
          CharsetDecoder decoder = getJavaEncoding(encoding);
          aff2 = new BufferedInputStream(new FileInputStream(aff));
          readAffixFile(aff2, decoder);

          // read dictionary entries into a word FST (UTF-32 input labels)
          IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
          Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
          readDictionaryFiles(dictionaries, decoder, b);
          words = b.finish();
          aliases = null; // no longer needed
          morphAliases = null; // no longer needed
      } finally {
          // best-effort cleanup; temp-file delete failure is deliberately ignored
          IOUtils.closeWhileHandlingException(out, aff1, aff2);
          aff.delete();
      }
  }

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
 * Compiles the affix table into an FST: the key is the affix string encoded
 * as UTF-32 code points, the value is the list of affix ids for that string.
 * TreeMap iteration supplies keys in the sorted order Builder requires.
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
      IntSequenceOutputs fstOutputs = IntSequenceOutputs.getSingleton();
      Builder<IntsRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE4, fstOutputs);
      IntsRefBuilder key = new IntsRefBuilder();
      for (Map.Entry<String, List<Integer>> affix : affixes.entrySet()) {
          Util.toUTF32(affix.getKey(), key);
          List<Integer> ids = affix.getValue();
          IntsRef value = new IntsRef(ids.size());
          for (int id : ids) {
              value.ints[value.length++] = id;
          }
          fstBuilder.add(key.get(), value);
      }
      return fstBuilder.finish();
  }

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
 * Parses a conversion table (ICONV/OCONV) of {@code num} whitespace-separated
 * three-part lines and compiles it into a char-keyed FST mapping each pattern
 * to its replacement string.
 *
 * @param reader source positioned at the first conversion line
 * @param num number of conversion entries to read
 * @return FST from pattern (UTF-16 input labels) to replacement chars
 * @throws IOException if reading fails or the FST cannot be built
 * @throws ParseException if a line is missing or malformed
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
      Map<String, String> mappings = new TreeMap<>();

      for (int i = 0; i < num; i++) {
          String line = reader.readLine();
          // Guard against a truncated file: readLine() returns null at EOF,
          // which previously caused a NullPointerException on line.split below.
          if (line == null) {
              throw new ParseException("premature end of conversion table", reader.getLineNumber());
          }
          String[] parts = line.split("\\s+");
          if (parts.length != 3) {
              throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
          }
          if (mappings.put(parts[1], parts[2]) != null) {
              throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
          }
      }

      // TreeMap iterates in sorted key order, satisfying Builder's requirement.
      Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
      Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
      IntsRefBuilder scratchInts = new IntsRefBuilder();
      for (Map.Entry<String, String> entry : mappings.entrySet()) {
          Util.toUTF16(entry.getKey(), scratchInts);
          builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
      }

      return builder.finish();
  }

From source file:examples.fst.FstTest.java

public static void main(String[] args) throws IOException {
    // Keys must be provided to the Builder in Unicode sorted order!
    String[] keys = { "cat", "dog", "dogs" };
    long[] values = { 5, 7, 12 };

    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs);
    BytesRefBuilder scratchBytes = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    for (int i = 0; i < keys.length; i++) {
        scratchBytes.copyChars(keys[i]);
        builder.add(Util.toIntsRef(scratchBytes.toBytesRef(), scratchInts), values[i]);
    }
    FST<Long> fst = builder.finish();

    // Exact lookup: key -> output.
    Long value = Util.get(fst, new BytesRef("dog"));
    System.out.println(value); // 7

    // Reverse lookup (output -> key) only works because outputs are also in sorted order
    IntsRef key = Util.getByOutput(fst, 12);
    System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs

}

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @param ignoreCase if true, input is cleaned (lower-cased) before lookup
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
          throws IOException, ParseException {
      this.ignoreCase = ignoreCase;
      this.needsInputCleaning = ignoreCase;
      this.needsOutputCleaning = false; // set if we have an OCONV
      flagLookup.add(new BytesRef()); // no flags -> ord 0

      // The affix stream must be read twice (encoding pass, then parse pass),
      // so spool it into a temp file first.
      Path aff = Files.createTempFile(tempDir, "affix", "aff");
      OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
      InputStream aff1 = null;
      InputStream aff2 = null;
      boolean success = false;
      try {
          // copy contents of affix stream to temp file
          final byte[] buffer = new byte[1024 * 8];
          int len;
          while ((len = affix.read(buffer)) > 0) {
              out.write(buffer, 0, len);
          }
          out.close();

          // pass 1: get encoding
          aff1 = new BufferedInputStream(Files.newInputStream(aff));
          String encoding = getDictionaryEncoding(aff1);

          // pass 2: parse affixes
          CharsetDecoder decoder = getJavaEncoding(encoding);
          aff2 = new BufferedInputStream(Files.newInputStream(aff));
          readAffixFile(aff2, decoder);

          // read dictionary entries into a word FST (UTF-32 input labels)
          IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
          Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
          readDictionaryFiles(dictionaries, decoder, b);
          words = b.finish();
          aliases = null; // no longer needed
          morphAliases = null; // no longer needed
          success = true;
      } finally {
          IOUtils.closeWhileHandlingException(out, aff1, aff2);
          // on success a delete failure should surface; on error just clean up quietly
          if (success) {
              Files.delete(aff);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(aff);
          }
      }
  }

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
 * and dictionary files.
 * You have to close the provided InputStreams yourself.
 *
 * @param affix InputStream for reading the hunspell affix file (won't be closed).
 * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
 * @param ignoreCase if true, input is cleaned (lower-cased) before lookup
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
        throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.add(new BytesRef()); // no flags -> ord 0

    // The affix stream must be read twice (encoding pass, then parse pass),
    // so spool it into a temp file first.
    File aff = File.createTempFile("affix", "aff", tempDir);
    OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
    InputStream aff1 = null;
    InputStream aff2 = null;
    try {
        // copy contents of affix stream to temp file
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = affix.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
        out.close();

        // pass 1: get encoding
        aff1 = new BufferedInputStream(new FileInputStream(aff));
        String encoding = getDictionaryEncoding(aff1);

        // pass 2: parse affixes
        CharsetDecoder decoder = getJavaEncoding(encoding);
        aff2 = new BufferedInputStream(new FileInputStream(aff));
        readAffixFile(aff2, decoder);

        // read dictionary entries into a word FST (UTF-32 input labels)
        IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
        Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
        readDictionaryFiles(dictionaries, decoder, b);
        words = b.finish();
        aliases = null; // no longer needed
    } finally {
        // best-effort cleanup; temp-file delete failure is deliberately ignored
        IOUtils.closeWhileHandlingException(out, aff1, aff2);
        aff.delete();
    }
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Compiles the affix table into an FST: the key is the affix string encoded
 * as UTF-32 code points, the value is the list of affix flag characters.
 * TreeMap iteration supplies keys in the sorted order Builder requires.
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException {
    IntSequenceOutputs fstOutputs = IntSequenceOutputs.getSingleton();
    Builder<IntsRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE4, fstOutputs);

    IntsRef key = new IntsRef();
    for (Map.Entry<String, List<Character>> affix : affixes.entrySet()) {
        Util.toUTF32(affix.getKey(), key);
        List<Character> flags = affix.getValue();
        IntsRef value = new IntsRef(flags.size());
        for (char flag : flags) {
            value.ints[value.length++] = flag;
        }
        fstBuilder.add(key, value);
    }
    return fstBuilder.finish();
}