List of usage examples for org.apache.lucene.util IntsRefBuilder IntsRefBuilder
public IntsRefBuilder()
From source file:BuildFST.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public static void main(String[] args) throws IOException { boolean numeric = true; boolean negative = false; for (int i = 0; i < args.length; i++) { int j = args[i].lastIndexOf('/'); if (j != -1) { try { negative |= Long.parseLong(args[i].substring(j + 1)) < 0; } catch (NumberFormatException nfe) { numeric = false;//from w w w. ja v a 2 s . co m break; } } } Outputs outputs; if (numeric) { if (negative) { throw new RuntimeException("can only handle numeric outputs >= 0"); } outputs = PositiveIntOutputs.getSingleton(); } else { outputs = ByteSequenceOutputs.getSingleton(); } Pair<?>[] inputs = new Pair[args.length]; for (int i = 0; i < args.length; i++) { int j = args[i].lastIndexOf('/'); String input; Object output; if (j == -1) { output = outputs.getNoOutput(); input = args[i]; } else { input = args[i].substring(0, j); String outputString = args[i].substring(j + 1); if (numeric) { output = Long.parseLong(outputString); } else { output = new BytesRef(outputString); } } inputs[i] = new Pair(new BytesRef(input), output); } Arrays.sort(inputs); FST<?> fst; if (numeric) { Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); for (Pair pair : inputs) { IntsRefBuilder intsBuilder = new IntsRefBuilder(); Util.toIntsRef(pair.input, intsBuilder); b.add(intsBuilder.get(), (Long) pair.output); } fst = b.finish(); } else { Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs); for (Pair pair : inputs) { IntsRefBuilder intsBuilder = new IntsRefBuilder(); Util.toIntsRef(pair.input, intsBuilder); b.add(intsBuilder.get(), (BytesRef) pair.output); } fst = b.finish(); } Util.toDot(fst, new PrintWriter(System.out), true, true); }
From source file:com.github.cstoku.neologd.unidic.lucene.analysis.ja.dict.UserDictionary.java
License:Apache License
private UserDictionary(List<String[]> featureEntries) throws IOException { int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET; // TODO: should we allow multiple segmentations per input 'phrase'? // the old treemap didn't support this either, and i'm not sure if it's needed/useful? Collections.sort(featureEntries, new Comparator<String[]>() { @Override/* w ww .j a v a 2 s . co m*/ public int compare(String[] left, String[] right) { return left[0].compareTo(right[0]); } }); List<String> data = new ArrayList<>(featureEntries.size()); List<int[]> segmentations = new ArrayList<>(featureEntries.size()); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; for (String[] values : featureEntries) { String[] segmentation = values[1].replaceAll(" *", " ").split(" "); String[] readings = values[2].replaceAll(" *", " ").split(" "); String pos = values[3]; if (segmentation.length != readings.length) { throw new RuntimeException("Illegal user dictionary entry " + values[0] + " - the number of segmentations (" + segmentation.length + ")" + " does not the match number of readings (" + readings.length + ")"); } int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length.... wordIdAndLength[0] = wordId; for (int i = 0; i < segmentation.length; i++) { wordIdAndLength[i + 1] = segmentation[i].length(); data.add(readings[i] + INTERNAL_SEPARATOR + pos); wordId++; } // add mapping to FST String token = values[0]; scratch.grow(token.length()); scratch.setLength(token.length()); for (int i = 0; i < token.length(); i++) { scratch.setIntAt(i, (int) token.charAt(i)); } fstBuilder.add(scratch.get(), ord); segmentations.add(wordIdAndLength); ord++; } this.fst = new TokenInfoFST(fstBuilder.finish(), false); this.data = data.toArray(new String[data.size()]); this.segmentations = segmentations.toArray(new int[segmentations.size()][]); }
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateFST(SortedMap<String, Double> weights) throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String, Double> entry : weights.entrySet()) { scratchBytes.copyChars(entry.getKey()); fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry.getValue().longValue()); }/*w ww . j a v a 2s .com*/ fst = fstBuilder.finish(); }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Integer> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Integer c : entries) { output.ints[output.length++] = c; }/*from ww w. ja v a2 s . co m*/ builder.add(scratch.get(), output); } return builder.finish(); }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException { Map<String, String> mappings = new TreeMap<>(); for (int i = 0; i < num; i++) { String line = reader.readLine(); String parts[] = line.split("\\s+"); if (parts.length != 3) { throw new ParseException("invalid syntax: " + line, reader.getLineNumber()); }//from ww w. j a v a 2 s . c om if (mappings.put(parts[1], parts[2]) != null) { throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); } } Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String, String> entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); builder.add(scratchInts.get(), new CharsRef(entry.getValue())); } return builder.finish(); }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/** * Reads the dictionary file through the provided InputStreams, building up the words map */*from w w w .ja v a2s . c o m*/ * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); StringBuilder sb = new StringBuilder(); File unsorted = File.createTempFile("unsorted", "dat", tempDir); ByteSequencesWriter writer = new ByteSequencesWriter(unsorted); boolean success = false; try { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); String line = lines.readLine(); // first line is number of entries (approximately, sometimes) while ((line = lines.readLine()) != null) { // wild and unpredictable code comment rules if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') { continue; } line = unescapeEntry(line); // if we havent seen any stem exceptions, try to parse one if (hasStemExceptions == false) { int morphStart = line.indexOf(MORPH_SEPARATOR); if (morphStart >= 0 && morphStart < line.length()) { hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null; } } if (needsInputCleaning) { int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { flagSep = line.indexOf(MORPH_SEPARATOR); } if (flagSep == -1) { CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); sb.append(cleansed); } sb.append(line.substring(flagSep)); writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); } } else { writer.write(line.getBytes(StandardCharsets.UTF_8)); } } } success = true; } finally { if (success) { IOUtils.close(writer); } else { IOUtils.closeWhileHandlingException(writer); } } File sorted = File.createTempFile("sorted", "dat", tempDir); OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() { BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); @Override public int compare(BytesRef o1, BytesRef o2) { scratch1.bytes = o1.bytes; scratch1.offset = o1.offset; scratch1.length = o1.length; for (int i = scratch1.length - 1; i >= 0; i--) { if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) { scratch1.length = i; break; } } scratch2.bytes = o2.bytes; scratch2.offset = o2.offset; scratch2.length = o2.length; for (int i = scratch2.length - 1; i >= 0; i--) { if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) { scratch2.length = i; break; } } int cmp = scratch1.compareTo(scratch2); if (cmp == 0) { // tie break on whole row return o1.compareTo(o2); } else { return cmp; } } }); sorter.sort(unsorted, sorted); unsorted.delete(); ByteSequencesReader reader = new ByteSequencesReader(sorted); BytesRefBuilder scratchLine = new BytesRefBuilder(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... but they must be parsed differently String currentEntry = null; IntsRefBuilder currentOrds = new IntsRefBuilder(); String line; while (reader.read(scratchLine)) { line = scratchLine.get().utf8ToString(); String entry; char wordForm[]; int end; int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; end = line.indexOf(MORPH_SEPARATOR); entry = line.substring(0, end); } else { end = line.indexOf(MORPH_SEPARATOR); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(wordForm); entry = line.substring(0, flagSep); } // we possibly have morphological data int stemExceptionID = 0; if (hasStemExceptions && end + 1 < line.length()) { String stemException = parseStemException(line.substring(end + 1)); if (stemException != null) { if (stemExceptionCount == stemExceptions.length) { int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); stemExceptions = Arrays.copyOf(stemExceptions, newSize); } stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form stemExceptions[stemExceptionCount++] = stemException; } } int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry); if (cmp < 0) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; } // finalize current entry, and switch "current" if necessary if (cmp > 0 && currentEntry != null) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); } // swap current if (cmp > 0 || currentEntry == null) { currentEntry = entry; currentOrds = new IntsRefBuilder(); // must be this way } if (hasStemExceptions) { currentOrds.append(ord); currentOrds.append(stemExceptionID); } else { currentOrds.append(ord); } } } // finalize last entry Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); reader.close(); sorted.delete(); }
From source file:examples.fst.FstTest.java
public static void main(String[] args) throws IOException { // Input values (keys). These must be provided to Builder in Unicode sorted order! String inputValues[] = { "cat", "dog", "dogs" }; long outputValues[] = { 5, 7, 12 }; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (int i = 0; i < inputValues.length; i++) { scratchBytes.copyChars(inputValues[i]); builder.add(Util.toIntsRef(scratchBytes.toBytesRef(), scratchInts), outputValues[i]); }//from w w w. j av a 2 s . c o m FST<Long> fst = builder.finish(); Long value = Util.get(fst, new BytesRef("dog")); System.out.println(value); // 7 // Only works because outputs are also in sorted order IntsRef key = Util.getByOutput(fst, 12); System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs }
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/** * Reads the dictionary file through the provided InputStreams, building up the words map *// w w w . j a v a2s . c o m * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); StringBuilder sb = new StringBuilder(); Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat"); try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); String line = lines.readLine(); // first line is number of entries (approximately, sometimes) while ((line = lines.readLine()) != null) { // wild and unpredictable code comment rules if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') { continue; } line = unescapeEntry(line); // if we havent seen any stem exceptions, try to parse one if (hasStemExceptions == false) { int morphStart = line.indexOf(MORPH_SEPARATOR); if (morphStart >= 0 && morphStart < line.length()) { hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null; } } if (needsInputCleaning) { int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { flagSep = line.indexOf(MORPH_SEPARATOR); } if (flagSep == -1) { CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); sb.append(cleansed); } sb.append(line.substring(flagSep)); writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); } } else { writer.write(line.getBytes(StandardCharsets.UTF_8)); } } } } Path sorted = Files.createTempFile(tempDir, "sorted", "dat"); OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() { BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); @Override public int compare(BytesRef o1, BytesRef o2) { scratch1.bytes = o1.bytes; scratch1.offset = o1.offset; scratch1.length = o1.length; for (int i = scratch1.length - 1; i >= 0; i--) { if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) { scratch1.length = i; break; } } scratch2.bytes = o2.bytes; scratch2.offset = o2.offset; scratch2.length = o2.length; for (int i = scratch2.length - 1; i >= 0; i--) { if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) { scratch2.length = i; break; } } int cmp = scratch1.compareTo(scratch2); if (cmp == 0) { // tie break on whole row return o1.compareTo(o2); } else { return cmp; } } }); boolean success = false; try { sorter.sort(unsorted, sorted); success = true; } finally { if (success) { Files.delete(unsorted); } else { IOUtils.deleteFilesIgnoringExceptions(unsorted); } } boolean success2 = false; ByteSequencesReader reader = new ByteSequencesReader(sorted); try { BytesRefBuilder scratchLine = new BytesRefBuilder(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... but they must be parsed differently String currentEntry = null; IntsRefBuilder currentOrds = new IntsRefBuilder(); String line; while (reader.read(scratchLine)) { line = scratchLine.get().utf8ToString(); String entry; char wordForm[]; int end; int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; end = line.indexOf(MORPH_SEPARATOR); entry = line.substring(0, end); } else { end = line.indexOf(MORPH_SEPARATOR); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(wordForm); entry = line.substring(0, flagSep); } // we possibly have morphological data int stemExceptionID = 0; if (hasStemExceptions && end + 1 < line.length()) { String stemException = parseStemException(line.substring(end + 1)); if (stemException != null) { if (stemExceptionCount == stemExceptions.length) { int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); stemExceptions = Arrays.copyOf(stemExceptions, newSize); } stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form stemExceptions[stemExceptionCount++] = stemException; } } int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry); if (cmp < 0) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; } // finalize current entry, and switch "current" if necessary if (cmp > 0 && currentEntry != null) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); } // swap current if (cmp > 0 || currentEntry == null) { currentEntry = entry; currentOrds = new IntsRefBuilder(); // must be this way } if (hasStemExceptions) { currentOrds.append(ord); currentOrds.append(stemExceptionID); } else { currentOrds.append(ord); } } } // finalize last entry Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); success2 = true; } finally { IOUtils.closeWhileHandlingException(reader); if (success2) { Files.delete(sorted); } else { IOUtils.deleteFilesIgnoringExceptions(sorted); } } }