List of usage examples for org.apache.lucene.util.fst IntSequenceOutputs getSingleton
public static IntSequenceOutputs getSingleton()
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files./*ww w . j av a 2 s. co m*/ * You have to close the provided InputStreams yourself. * * @param affix InputStream for reading the hunspell affix file (won't be closed). * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed). * @throws IOException Can be thrown while reading from the InputStreams * @throws ParseException Can be thrown if the content of the files does not meet expected formats */ public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException { this.ignoreCase = ignoreCase; this.needsInputCleaning = ignoreCase; this.needsOutputCleaning = false; // set if we have an OCONV flagLookup.add(new BytesRef()); // no flags -> ord 0 File aff = File.createTempFile("affix", "aff", tempDir); OutputStream out = new BufferedOutputStream(new FileOutputStream(aff)); InputStream aff1 = null; InputStream aff2 = null; try { // copy contents of affix stream to temp file final byte[] buffer = new byte[1024 * 8]; int len; while ((len = affix.read(buffer)) > 0) { out.write(buffer, 0, len); } out.close(); // pass 1: get encoding aff1 = new BufferedInputStream(new FileInputStream(aff)); String encoding = getDictionaryEncoding(aff1); // pass 2: parse affixes CharsetDecoder decoder = getJavaEncoding(encoding); aff2 = new BufferedInputStream(new FileInputStream(aff)); readAffixFile(aff2, decoder); // read dictionary entries IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o); readDictionaryFiles(dictionaries, decoder, b); words = b.finish(); aliases = null; // no longer needed morphAliases = null; // no longer needed } finally { IOUtils.closeWhileHandlingException(out, aff1, aff2); aff.delete(); } }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/**
 * Builds an FST from the given affix map.
 *
 * <p>Each map key is converted with {@code Util.toUTF32} and used as the FST input; the
 * integers in the associated list are copied, in list order, into an {@code IntsRef} that
 * becomes the output for that key. Entries are added in the map's iteration order.
 *
 * @param affixes map from affix string to the integers recorded for it
 * @return the finished FST
 * @throws IOException if adding to or finishing the FST builder fails
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
    final Builder<IntsRef> fstBuilder =
            new Builder<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
    final IntsRefBuilder keyScratch = new IntsRefBuilder();

    for (Map.Entry<String, List<Integer>> affix : affixes.entrySet()) {
        Util.toUTF32(affix.getKey(), keyScratch);

        final List<Integer> values = affix.getValue();
        // Allocated with capacity values.size(); length is advanced as each value is copied in.
        final IntsRef packed = new IntsRef(values.size());
        for (int value : values) {
            packed.ints[packed.length] = value;
            packed.length++;
        }

        fstBuilder.add(keyScratch.get(), packed);
    }

    return fstBuilder.finish();
}
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files.//ww w.j a v a 2 s. c o m * You have to close the provided InputStreams yourself. * * @param affix InputStream for reading the hunspell affix file (won't be closed). * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed). * @throws IOException Can be thrown while reading from the InputStreams * @throws ParseException Can be thrown if the content of the files does not meet expected formats */ public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException { this.ignoreCase = ignoreCase; this.needsInputCleaning = ignoreCase; this.needsOutputCleaning = false; // set if we have an OCONV flagLookup.add(new BytesRef()); // no flags -> ord 0 Path aff = Files.createTempFile(tempDir, "affix", "aff"); OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff)); InputStream aff1 = null; InputStream aff2 = null; boolean success = false; try { // copy contents of affix stream to temp file final byte[] buffer = new byte[1024 * 8]; int len; while ((len = affix.read(buffer)) > 0) { out.write(buffer, 0, len); } out.close(); // pass 1: get encoding aff1 = new BufferedInputStream(Files.newInputStream(aff)); String encoding = getDictionaryEncoding(aff1); // pass 2: parse affixes CharsetDecoder decoder = getJavaEncoding(encoding); aff2 = new BufferedInputStream(Files.newInputStream(aff)); readAffixFile(aff2, decoder); // read dictionary entries IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o); readDictionaryFiles(dictionaries, decoder, b); words = b.finish(); aliases = null; // no longer needed morphAliases = null; // no longer needed success = true; } finally { IOUtils.closeWhileHandlingException(out, aff1, aff2); if (success) { Files.delete(aff); } else { 
IOUtils.deleteFilesIgnoringExceptions(aff); } } }
From source file:stemmer.Dictionary.java
License:Apache License
/** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files.//from www . j av a2 s . c o m * You have to close the provided InputStreams yourself. * * @param affix InputStream for reading the hunspell affix file (won't be closed). * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed). * @throws IOException Can be thrown while reading from the InputStreams * @throws ParseException Can be thrown if the content of the files does not meet expected formats */ public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException { this.ignoreCase = ignoreCase; this.needsInputCleaning = ignoreCase; this.needsOutputCleaning = false; // set if we have an OCONV flagLookup.add(new BytesRef()); // no flags -> ord 0 File aff = File.createTempFile("affix", "aff", tempDir); OutputStream out = new BufferedOutputStream(new FileOutputStream(aff)); InputStream aff1 = null; InputStream aff2 = null; try { // copy contents of affix stream to temp file final byte[] buffer = new byte[1024 * 8]; int len; while ((len = affix.read(buffer)) > 0) { out.write(buffer, 0, len); } out.close(); // pass 1: get encoding aff1 = new BufferedInputStream(new FileInputStream(aff)); String encoding = getDictionaryEncoding(aff1); // pass 2: parse affixes CharsetDecoder decoder = getJavaEncoding(encoding); aff2 = new BufferedInputStream(new FileInputStream(aff)); readAffixFile(aff2, decoder); // read dictionary entries IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); Builder<IntsRef> b = new Builder<>(FST.INPUT_TYPE.BYTE4, o); readDictionaryFiles(dictionaries, decoder, b); words = b.finish(); aliases = null; // no longer needed } finally { IOUtils.closeWhileHandlingException(out, aff1, aff2); aff.delete(); } }
From source file:stemmer.Dictionary.java
License:Apache License
private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRef scratch = new IntsRef(); for (Map.Entry<String, List<Character>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Character> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Character c : entries) { output.ints[output.length++] = c; }//from w w w. j a v a2s. com builder.add(scratch, output); } return builder.finish(); }