List of usage examples for org.apache.lucene.util.fst.Util.toUTF32
public static IntsRef toUTF32(CharSequence s, IntsRefBuilder scratch)
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Integer> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Integer c : entries) { output.ints[output.length++] = c; }//from ww w . ja v a2s. co m builder.add(scratch.get(), output); } return builder.finish(); }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/** * Reads the dictionary file through the provided InputStreams, building up the words map *// w w w . j a v a2 s. c o m * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); StringBuilder sb = new StringBuilder(); File unsorted = File.createTempFile("unsorted", "dat", tempDir); ByteSequencesWriter writer = new ByteSequencesWriter(unsorted); boolean success = false; try { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); String line = lines.readLine(); // first line is number of entries (approximately, sometimes) while ((line = lines.readLine()) != null) { // wild and unpredictable code comment rules if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') { continue; } line = unescapeEntry(line); // if we havent seen any stem exceptions, try to parse one if (hasStemExceptions == false) { int morphStart = line.indexOf(MORPH_SEPARATOR); if (morphStart >= 0 && morphStart < line.length()) { hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null; } } if (needsInputCleaning) { int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { flagSep = line.indexOf(MORPH_SEPARATOR); } if (flagSep == -1) { CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); sb.append(cleansed); } sb.append(line.substring(flagSep)); 
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); } } else { writer.write(line.getBytes(StandardCharsets.UTF_8)); } } } success = true; } finally { if (success) { IOUtils.close(writer); } else { IOUtils.closeWhileHandlingException(writer); } } File sorted = File.createTempFile("sorted", "dat", tempDir); OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() { BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); @Override public int compare(BytesRef o1, BytesRef o2) { scratch1.bytes = o1.bytes; scratch1.offset = o1.offset; scratch1.length = o1.length; for (int i = scratch1.length - 1; i >= 0; i--) { if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) { scratch1.length = i; break; } } scratch2.bytes = o2.bytes; scratch2.offset = o2.offset; scratch2.length = o2.length; for (int i = scratch2.length - 1; i >= 0; i--) { if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) { scratch2.length = i; break; } } int cmp = scratch1.compareTo(scratch2); if (cmp == 0) { // tie break on whole row return o1.compareTo(o2); } else { return cmp; } } }); sorter.sort(unsorted, sorted); unsorted.delete(); ByteSequencesReader reader = new ByteSequencesReader(sorted); BytesRefBuilder scratchLine = new BytesRefBuilder(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... 
but they must be parsed differently String currentEntry = null; IntsRefBuilder currentOrds = new IntsRefBuilder(); String line; while (reader.read(scratchLine)) { line = scratchLine.get().utf8ToString(); String entry; char wordForm[]; int end; int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; end = line.indexOf(MORPH_SEPARATOR); entry = line.substring(0, end); } else { end = line.indexOf(MORPH_SEPARATOR); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(wordForm); entry = line.substring(0, flagSep); } // we possibly have morphological data int stemExceptionID = 0; if (hasStemExceptions && end + 1 < line.length()) { String stemException = parseStemException(line.substring(end + 1)); if (stemException != null) { if (stemExceptionCount == stemExceptions.length) { int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); stemExceptions = Arrays.copyOf(stemExceptions, newSize); } stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form stemExceptions[stemExceptionCount++] = stemException; } } int cmp = currentEntry == null ? 
1 : entry.compareTo(currentEntry); if (cmp < 0) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; } // finalize current entry, and switch "current" if necessary if (cmp > 0 && currentEntry != null) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); } // swap current if (cmp > 0 || currentEntry == null) { currentEntry = entry; currentOrds = new IntsRefBuilder(); // must be this way } if (hasStemExceptions) { currentOrds.append(ord); currentOrds.append(stemExceptionID); } else { currentOrds.append(ord); } } } // finalize last entry Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); reader.close(); sorted.delete(); }
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/** * Reads the dictionary file through the provided InputStreams, building up the words map */*from w w w .java 2 s. c o m*/ * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); StringBuilder sb = new StringBuilder(); Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat"); try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); String line = lines.readLine(); // first line is number of entries (approximately, sometimes) while ((line = lines.readLine()) != null) { // wild and unpredictable code comment rules if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') { continue; } line = unescapeEntry(line); // if we havent seen any stem exceptions, try to parse one if (hasStemExceptions == false) { int morphStart = line.indexOf(MORPH_SEPARATOR); if (morphStart >= 0 && morphStart < line.length()) { hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null; } } if (needsInputCleaning) { int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { flagSep = line.indexOf(MORPH_SEPARATOR); } if (flagSep == -1) { CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); sb.append(cleansed); } sb.append(line.substring(flagSep)); 
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); } } else { writer.write(line.getBytes(StandardCharsets.UTF_8)); } } } } Path sorted = Files.createTempFile(tempDir, "sorted", "dat"); OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() { BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); @Override public int compare(BytesRef o1, BytesRef o2) { scratch1.bytes = o1.bytes; scratch1.offset = o1.offset; scratch1.length = o1.length; for (int i = scratch1.length - 1; i >= 0; i--) { if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) { scratch1.length = i; break; } } scratch2.bytes = o2.bytes; scratch2.offset = o2.offset; scratch2.length = o2.length; for (int i = scratch2.length - 1; i >= 0; i--) { if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) { scratch2.length = i; break; } } int cmp = scratch1.compareTo(scratch2); if (cmp == 0) { // tie break on whole row return o1.compareTo(o2); } else { return cmp; } } }); boolean success = false; try { sorter.sort(unsorted, sorted); success = true; } finally { if (success) { Files.delete(unsorted); } else { IOUtils.deleteFilesIgnoringExceptions(unsorted); } } boolean success2 = false; ByteSequencesReader reader = new ByteSequencesReader(sorted); try { BytesRefBuilder scratchLine = new BytesRefBuilder(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... 
but they must be parsed differently String currentEntry = null; IntsRefBuilder currentOrds = new IntsRefBuilder(); String line; while (reader.read(scratchLine)) { line = scratchLine.get().utf8ToString(); String entry; char wordForm[]; int end; int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; end = line.indexOf(MORPH_SEPARATOR); entry = line.substring(0, end); } else { end = line.indexOf(MORPH_SEPARATOR); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(wordForm); entry = line.substring(0, flagSep); } // we possibly have morphological data int stemExceptionID = 0; if (hasStemExceptions && end + 1 < line.length()) { String stemException = parseStemException(line.substring(end + 1)); if (stemException != null) { if (stemExceptionCount == stemExceptions.length) { int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); stemExceptions = Arrays.copyOf(stemExceptions, newSize); } stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form stemExceptions[stemExceptionCount++] = stemException; } } int cmp = currentEntry == null ? 
1 : entry.compareTo(currentEntry); if (cmp < 0) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; } // finalize current entry, and switch "current" if necessary if (cmp > 0 && currentEntry != null) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); } // swap current if (cmp > 0 || currentEntry == null) { currentEntry = entry; currentOrds = new IntsRefBuilder(); // must be this way } if (hasStemExceptions) { currentOrds.append(ord); currentOrds.append(stemExceptionID); } else { currentOrds.append(ord); } } } // finalize last entry Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); success2 = true; } finally { IOUtils.closeWhileHandlingException(reader); if (success2) { Files.delete(sorted); } else { IOUtils.deleteFilesIgnoringExceptions(sorted); } } }
From source file:stemmer.Dictionary.java
License:Apache License
/**
 * Builds an FST whose inputs are the affix strings (as UTF-32 code points) and whose
 * outputs are the character lists associated with them, each character widened to an int.
 *
 * <p>Entries are added in the iteration order of the supplied {@code TreeMap}.
 *
 * @param affixes affix string to list of character values; every list element must be
 *     non-null because it is unboxed
 * @return the FST produced by {@code Builder.finish()}
 * @throws IOException if the underlying FST builder fails
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException {
  Builder<IntsRef> fstBuilder =
      new Builder<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
  // Reused for every key; toUTF32 overwrites its content on each call.
  IntsRef keyScratch = new IntsRef();

  for (Map.Entry<String, List<Character>> affix : affixes.entrySet()) {
    Util.toUTF32(affix.getKey(), keyScratch);

    List<Character> ids = affix.getValue();
    // A fresh output per key: the builder is handed this instance directly.
    IntsRef value = new IntsRef(ids.size());
    int filled = 0;
    for (Character id : ids) {
      value.ints[filled++] = id;
    }
    value.length = filled;

    fstBuilder.add(keyScratch, value);
  }
  return fstBuilder.finish();
}
From source file:stemmer.Dictionary.java
License:Apache License
/** * Reads the dictionary file through the provided InputStreams, building up the words map * * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */// w ww .j av a 2 s .c o m private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { BytesRef flagsScratch = new BytesRef(); IntsRef scratchInts = new IntsRef(); StringBuilder sb = new StringBuilder(); File unsorted = File.createTempFile("unsorted", "dat", tempDir); ByteSequencesWriter writer = new ByteSequencesWriter(unsorted); boolean success = false; try { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); String line = lines.readLine(); // first line is number of entries (approximately, sometimes) while ((line = lines.readLine()) != null) { line = unescapeEntry(line); if (needsInputCleaning) { int flagSep = line.lastIndexOf(FLAG_SEPARATOR); if (flagSep == -1) { CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); sb.append(cleansed); } sb.append(line.substring(flagSep)); writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); } } else { writer.write(line.getBytes(StandardCharsets.UTF_8)); } } } success = true; } finally { if (success) { IOUtils.close(writer); } else { IOUtils.closeWhileHandlingException(writer); } } File sorted = File.createTempFile("sorted", "dat", tempDir); OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() { BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); @Override public int compare(BytesRef o1, BytesRef o2) { scratch1.bytes = o1.bytes; 
scratch1.offset = o1.offset; scratch1.length = o1.length; for (int i = scratch1.length - 1; i >= 0; i--) { if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR) { scratch1.length = i; break; } } scratch2.bytes = o2.bytes; scratch2.offset = o2.offset; scratch2.length = o2.length; for (int i = scratch2.length - 1; i >= 0; i--) { if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR) { scratch2.length = i; break; } } int cmp = scratch1.compareTo(scratch2); if (cmp == 0) { // tie break on whole row return o1.compareTo(o2); } else { return cmp; } } }); sorter.sort(unsorted, sorted); unsorted.delete(); ByteSequencesReader reader = new ByteSequencesReader(sorted); BytesRef scratchLine = new BytesRef(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... but they must be parsed differently String currentEntry = null; IntsRef currentOrds = new IntsRef(); String line; while (reader.read(scratchLine)) { line = scratchLine.utf8ToString(); String entry; char wordForm[]; int flagSep = line.lastIndexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; entry = line; } else { // note, there can be comments (morph description) after a flag. // we should really look for any whitespace: currently just tab and space int end = line.indexOf('\t', flagSep); if (end == -1) end = line.length(); int end2 = line.indexOf(' ', flagSep); if (end2 == -1) end2 = line.length(); end = Math.min(end, end2); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(wordForm); entry = line.substring(0, flagSep); } int cmp = currentEntry == null ? 
1 : entry.compareTo(currentEntry); if (cmp < 0) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; } // finalize current entry, and switch "current" if necessary if (cmp > 0 && currentEntry != null) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts, currentOrds); } // swap current if (cmp > 0 || currentEntry == null) { currentEntry = entry; currentOrds = new IntsRef(); // must be this way } currentOrds.grow(currentOrds.length + 1); currentOrds.ints[currentOrds.length++] = ord; } } // finalize last entry Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts, currentOrds); reader.close(); sorted.delete(); }