List of usage examples for the org.apache.lucene.util.fst.Builder constructor
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs)
From source file:BuildFST.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {
    // Each arg is "input" or "input/output".  Outputs are treated as numeric
    // (PositiveIntOutputs) only if every output after the last '/' parses as a long;
    // otherwise byte-sequence outputs are used.
    boolean numeric = true;
    boolean negative = false;
    for (String arg : args) {
        int j = arg.lastIndexOf('/');
        if (j != -1) {
            try {
                negative |= Long.parseLong(arg.substring(j + 1)) < 0;
            } catch (NumberFormatException nfe) {
                numeric = false;
                break;
            }
        }
    }

    Outputs outputs;
    if (numeric) {
        if (negative) {
            // PositiveIntOutputs only supports outputs >= 0.
            throw new RuntimeException("can only handle numeric outputs >= 0");
        }
        outputs = PositiveIntOutputs.getSingleton();
    } else {
        outputs = ByteSequenceOutputs.getSingleton();
    }

    // Parse (input, output) pairs; args without '/' map to the no-output sentinel.
    Pair<?>[] inputs = new Pair[args.length];
    for (int i = 0; i < args.length; i++) {
        int j = args[i].lastIndexOf('/');
        String input;
        Object output;
        if (j == -1) {
            output = outputs.getNoOutput();
            input = args[i];
        } else {
            input = args[i].substring(0, j);
            String outputString = args[i].substring(j + 1);
            if (numeric) {
                output = Long.parseLong(outputString);
            } else {
                output = new BytesRef(outputString);
            }
        }
        inputs[i] = new Pair(new BytesRef(input), output);
    }

    // Builder requires its inputs in sorted order.
    Arrays.sort(inputs);

    FST<?> fst;
    // Reuse one scratch IntsRefBuilder instead of allocating a new one per entry.
    IntsRefBuilder intsBuilder = new IntsRefBuilder();
    if (numeric) {
        Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            Util.toIntsRef(pair.input, intsBuilder);
            b.add(intsBuilder.get(), (Long) pair.output);
        }
        fst = b.finish();
    } else {
        Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            Util.toIntsRef(pair.input, intsBuilder);
            b.add(intsBuilder.get(), (BytesRef) pair.output);
        }
        fst = b.finish();
    }

    // Emit the resulting FST in GraphViz dot format on stdout.
    Util.toDot(fst, new PrintWriter(System.out), true, true);
}
From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.UserDictionary.java
License:Apache License
private UserDictionary(List<String[]> featureEntries) throws IOException { int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET; // TODO: should we allow multiple segmentations per input 'phrase'? // the old treemap didn't support this either, and i'm not sure if it's needed/useful? Collections.sort(featureEntries, new Comparator<String[]>() { @Override/*from w ww . ja va 2 s. com*/ public int compare(String[] left, String[] right) { return left[0].compareTo(right[0]); } }); List<String> data = new ArrayList<>(featureEntries.size()); List<int[]> segmentations = new ArrayList<>(featureEntries.size()); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; for (String[] values : featureEntries) { String[] segmentation = values[1].replaceAll(" *", " ").split(" "); String[] readings = values[2].replaceAll(" *", " ").split(" "); String pos = values[3]; if (segmentation.length != readings.length) { throw new RuntimeException("Illegal user dictionary entry " + values[0] + " - the number of segmentations (" + segmentation.length + ")" + " does not the match number of readings (" + readings.length + ")"); } int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length.... 
wordIdAndLength[0] = wordId; for (int i = 0; i < segmentation.length; i++) { wordIdAndLength[i + 1] = segmentation[i].length(); data.add(readings[i] + INTERNAL_SEPARATOR + pos); wordId++; } // add mapping to FST String token = values[0]; scratch.grow(token.length()); scratch.setLength(token.length()); for (int i = 0; i < token.length(); i++) { scratch.setIntAt(i, (int) token.charAt(i)); } fstBuilder.add(scratch.get(), ord); segmentations.add(wordIdAndLength); ord++; } this.fst = new TokenInfoFST(fstBuilder.finish(), false); this.data = data.toArray(new String[data.size()]); this.segmentations = segmentations.toArray(new int[segmentations.size()][]); }
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateFST(SortedMap<String, Double> weights) throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String, Double> entry : weights.entrySet()) { scratchBytes.copyChars(entry.getKey()); fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry.getValue().longValue()); }// ww w. java 2 s .c o m fst = fstBuilder.finish(); }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/**
 * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
 * and dictionary files.
 * You have to close the provided InputStreams yourself.
 *
 * @param affix InputStream for reading the hunspell affix file (won't be closed).
 * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
 * @param ignoreCase whether input/output should be lower-cased during cleaning
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
        throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.add(new BytesRef()); // no flags -> ord 0

    // The affix stream must be read twice (encoding pass + parse pass), so it is
    // first spooled into a temp file that can be reopened.
    File aff = File.createTempFile("affix", "aff", tempDir);
    OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
    InputStream aff1 = null;
    InputStream aff2 = null;
    try {
        // copy contents of affix stream to temp file
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = affix.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
        out.close();

        // pass 1: get encoding
        aff1 = new BufferedInputStream(new FileInputStream(aff));
        String encoding = getDictionaryEncoding(aff1);

        // pass 2: parse affixes
        CharsetDecoder decoder = getJavaEncoding(encoding);
        aff2 = new BufferedInputStream(new FileInputStream(aff));
        readAffixFile(aff2, decoder);

        // read dictionary entries into a word FST (UTF-32 input labels)
        IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
        Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
        readDictionaryFiles(dictionaries, decoder, b);
        words = b.finish();
        aliases = null; // no longer needed
        morphAliases = null; // no longer needed
    } finally {
        IOUtils.closeWhileHandlingException(out, aff1, aff2);
        // NOTE(review): delete() return value is ignored; the temp file may
        // linger if deletion fails.
        aff.delete();
    }
}
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/**
 * Builds an FST mapping each affix pattern to the list of affix ids using it.
 * TreeMap iteration is already key-sorted, as Builder requires.
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
    IntSequenceOutputs fstOutputs = IntSequenceOutputs.getSingleton();
    Builder<IntsRef> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE4, fstOutputs);
    IntsRefBuilder keyScratch = new IntsRefBuilder();
    for (Map.Entry<String, List<Integer>> affix : affixes.entrySet()) {
        // Keys are encoded as UTF-32 code points (BYTE4 input type).
        Util.toUTF32(affix.getKey(), keyScratch);
        List<Integer> ids = affix.getValue();
        IntsRef out = new IntsRef(ids.size());
        for (Integer id : ids) {
            out.ints[out.length++] = id;
        }
        fstBuilder.add(keyScratch.get(), out);
    }
    return fstBuilder.finish();
}
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException { Map<String, String> mappings = new TreeMap<>(); for (int i = 0; i < num; i++) { String line = reader.readLine(); String parts[] = line.split("\\s+"); if (parts.length != 3) { throw new ParseException("invalid syntax: " + line, reader.getLineNumber()); }//from ww w .j av a2s. co m if (mappings.put(parts[1], parts[2]) != null) { throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); } } Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String, String> entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); builder.add(scratchInts.get(), new CharsRef(entry.getValue())); } return builder.finish(); }
From source file:examples.fst.FstTest.java
public static void main(String[] args) throws IOException { // Input values (keys). These must be provided to Builder in Unicode sorted order! String inputValues[] = { "cat", "dog", "dogs" }; long outputValues[] = { 5, 7, 12 }; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (int i = 0; i < inputValues.length; i++) { scratchBytes.copyChars(inputValues[i]); builder.add(Util.toIntsRef(scratchBytes.toBytesRef(), scratchInts), outputValues[i]); }//from w w w. j a v a 2s .c o m FST<Long> fst = builder.finish(); Long value = Util.get(fst, new BytesRef("dog")); System.out.println(value); // 7 // Only works because outputs are also in sorted order IntsRef key = Util.getByOutput(fst, 12); System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs }
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/**
 * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
 * and dictionary files.
 * You have to close the provided InputStreams yourself.
 *
 * @param affix InputStream for reading the hunspell affix file (won't be closed).
 * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
 * @param ignoreCase whether input/output should be lower-cased during cleaning
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
        throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.add(new BytesRef()); // no flags -> ord 0

    // The affix stream must be read twice (encoding pass + parse pass), so it is
    // first spooled into a temp file that can be reopened.
    Path aff = Files.createTempFile(tempDir, "affix", "aff");
    OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
    InputStream aff1 = null;
    InputStream aff2 = null;
    boolean success = false;
    try {
        // copy contents of affix stream to temp file
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = affix.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
        out.close();

        // pass 1: get encoding
        aff1 = new BufferedInputStream(Files.newInputStream(aff));
        String encoding = getDictionaryEncoding(aff1);

        // pass 2: parse affixes
        CharsetDecoder decoder = getJavaEncoding(encoding);
        aff2 = new BufferedInputStream(Files.newInputStream(aff));
        readAffixFile(aff2, decoder);

        // read dictionary entries into a word FST (UTF-32 input labels)
        IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
        Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
        readDictionaryFiles(dictionaries, decoder, b);
        words = b.finish();
        aliases = null; // no longer needed
        morphAliases = null; // no longer needed
        success = true;
    } finally {
        IOUtils.closeWhileHandlingException(out, aff1, aff2);
        // On success delete strictly (surface failures); on an earlier error,
        // delete best-effort so the original exception propagates unmasked.
        if (success) {
            Files.delete(aff);
        } else {
            IOUtils.deleteFilesIgnoringExceptions(aff);
        }
    }
}
From source file:stemmer.Dictionary.java
License:Apache License
/**
 * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
 * and dictionary files.
 * You have to close the provided InputStreams yourself.
 *
 * @param affix InputStream for reading the hunspell affix file (won't be closed).
 * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
 * @param ignoreCase whether input/output should be lower-cased during cleaning
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase)
        throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.add(new BytesRef()); // no flags -> ord 0

    // The affix stream must be read twice (encoding pass + parse pass), so it is
    // first spooled into a temp file that can be reopened.
    File aff = File.createTempFile("affix", "aff", tempDir);
    OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
    InputStream aff1 = null;
    InputStream aff2 = null;
    try {
        // copy contents of affix stream to temp file
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = affix.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
        out.close();

        // pass 1: get encoding
        aff1 = new BufferedInputStream(new FileInputStream(aff));
        String encoding = getDictionaryEncoding(aff1);

        // pass 2: parse affixes
        CharsetDecoder decoder = getJavaEncoding(encoding);
        aff2 = new BufferedInputStream(new FileInputStream(aff));
        readAffixFile(aff2, decoder);

        // read dictionary entries into a word FST (UTF-32 input labels)
        IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
        Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o);
        readDictionaryFiles(dictionaries, decoder, b);
        words = b.finish();
        aliases = null; // no longer needed
    } finally {
        IOUtils.closeWhileHandlingException(out, aff1, aff2);
        // NOTE(review): delete() return value is ignored; the temp file may
        // linger if deletion fails.
        aff.delete();
    }
}
From source file:stemmer.Dictionary.java
License:Apache License
private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRef scratch = new IntsRef(); for (Map.Entry<String, List<Character>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Character> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Character c : entries) { output.ints[output.length++] = c; }// www . j a v a 2 s. c o m builder.add(scratch, output); } return builder.finish(); }