Example usage for org.apache.lucene.util.fst Util toUTF32

List of usage examples for org.apache.lucene.util.fst Util toUTF32

Introduction

This page lists example usages of org.apache.lucene.util.fst.Util.toUTF32.

Prototype

public static IntsRef toUTF32(CharSequence s, IntsRefBuilder scratch) 

Source Link

Document

Decodes the Unicode codepoints from the provided CharSequence and places them in the provided scratch IntsRef, which must not be null, returning it.

Usage

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Builds an FST whose inputs are the affix strings (as UTF-32 code points) and whose
   * outputs are the associated integer lists packed into an {@link IntsRef}.
   *
   * @param affixes affix string to list of integers, iterated in the map's key order
   * @return the finished FST as returned by the builder
   * @throws IOException if the builder fails while adding an entry or finishing
   */
private FST<IntsRef> affixFST(TreeMap<String, List<Integer>> affixes) throws IOException {
      final Builder<IntsRef> fstBuilder =
              new Builder<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());
      // One scratch buffer is reused for every key.
      final IntsRefBuilder keyScratch = new IntsRefBuilder();
      for (Map.Entry<String, List<Integer>> affix : affixes.entrySet()) {
          Util.toUTF32(affix.getKey(), keyScratch);
          final List<Integer> values = affix.getValue();
          // Sized exactly for this entry; length starts at 0 and grows as values are copied in.
          final IntsRef packed = new IntsRef(values.size());
          for (Integer value : values) {
              packed.ints[packed.length] = value;
              packed.length++;
          }
          fstBuilder.add(keyScratch.get(), packed);
      }
      return fstBuilder.finish();
  }

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *//  w  w w . j  a v  a2 s.  c  o m
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      File unsorted = File.createTempFile("unsorted", "dat", tempDir);
      ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
      boolean success = false;
      try {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
          success = true;
      } finally {
          if (success) {
              IOUtils.close(writer);
          } else {
              IOUtils.closeWhileHandlingException(writer);
          }
      }
      File sorted = File.createTempFile("sorted", "dat", tempDir);

      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      sorter.sort(unsorted, sorted);
      unsorted.delete();

      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      BytesRefBuilder scratchLine = new BytesRefBuilder();

      // TODO: the flags themselves can be double-chars (long) or also numeric
      // either way the trick is to encode them as char... but they must be parsed differently

      String currentEntry = null;
      IntsRefBuilder currentOrds = new IntsRefBuilder();

      String line;
      while (reader.read(scratchLine)) {
          line = scratchLine.get().utf8ToString();
          String entry;
          char wordForm[];
          int end;

          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
              wordForm = NOFLAGS;
              end = line.indexOf(MORPH_SEPARATOR);
              entry = line.substring(0, end);
          } else {
              end = line.indexOf(MORPH_SEPARATOR);
              String flagPart = line.substring(flagSep + 1, end);
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              wordForm = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(wordForm);
              entry = line.substring(0, flagSep);
          }
          // we possibly have morphological data
          int stemExceptionID = 0;
          if (hasStemExceptions && end + 1 < line.length()) {
              String stemException = parseStemException(line.substring(end + 1));
              if (stemException != null) {
                  if (stemExceptionCount == stemExceptions.length) {
                      int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                              RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                      stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                  }
                  stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                  stemExceptions[stemExceptionCount++] = stemException;
              }
          }

          int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
          if (cmp < 0) {
              throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
          } else {
              encodeFlags(flagsScratch, wordForm);
              int ord = flagLookup.add(flagsScratch.get());
              if (ord < 0) {
                  // already exists in our hash
                  ord = (-ord) - 1;
              }
              // finalize current entry, and switch "current" if necessary
              if (cmp > 0 && currentEntry != null) {
                  Util.toUTF32(currentEntry, scratchInts);
                  words.add(scratchInts.get(), currentOrds.get());
              }
              // swap current
              if (cmp > 0 || currentEntry == null) {
                  currentEntry = entry;
                  currentOrds = new IntsRefBuilder(); // must be this way
              }
              if (hasStemExceptions) {
                  currentOrds.append(ord);
                  currentOrds.append(stemExceptionID);
              } else {
                  currentOrds.append(ord);
              }
          }
      }

      // finalize last entry
      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), currentOrds.get());

      reader.close();
      sorted.delete();
  }

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   */*from  w  w w  .java  2  s.  c o  m*/
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat");
      try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
      }
      Path sorted = Files.createTempFile(tempDir, "sorted", "dat");

      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      boolean success = false;
      try {
          sorter.sort(unsorted, sorted);
          success = true;
      } finally {
          if (success) {
              Files.delete(unsorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(unsorted);
          }
      }

      boolean success2 = false;
      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      try {
          BytesRefBuilder scratchLine = new BytesRefBuilder();

          // TODO: the flags themselves can be double-chars (long) or also numeric
          // either way the trick is to encode them as char... but they must be parsed differently

          String currentEntry = null;
          IntsRefBuilder currentOrds = new IntsRefBuilder();

          String line;
          while (reader.read(scratchLine)) {
              line = scratchLine.get().utf8ToString();
              String entry;
              char wordForm[];
              int end;

              int flagSep = line.indexOf(FLAG_SEPARATOR);
              if (flagSep == -1) {
                  wordForm = NOFLAGS;
                  end = line.indexOf(MORPH_SEPARATOR);
                  entry = line.substring(0, end);
              } else {
                  end = line.indexOf(MORPH_SEPARATOR);
                  String flagPart = line.substring(flagSep + 1, end);
                  if (aliasCount > 0) {
                      flagPart = getAliasValue(Integer.parseInt(flagPart));
                  }

                  wordForm = flagParsingStrategy.parseFlags(flagPart);
                  Arrays.sort(wordForm);
                  entry = line.substring(0, flagSep);
              }
              // we possibly have morphological data
              int stemExceptionID = 0;
              if (hasStemExceptions && end + 1 < line.length()) {
                  String stemException = parseStemException(line.substring(end + 1));
                  if (stemException != null) {
                      if (stemExceptionCount == stemExceptions.length) {
                          int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                                  RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                          stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                      }
                      stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                      stemExceptions[stemExceptionCount++] = stemException;
                  }
              }

              int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
              if (cmp < 0) {
                  throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
              } else {
                  encodeFlags(flagsScratch, wordForm);
                  int ord = flagLookup.add(flagsScratch.get());
                  if (ord < 0) {
                      // already exists in our hash
                      ord = (-ord) - 1;
                  }
                  // finalize current entry, and switch "current" if necessary
                  if (cmp > 0 && currentEntry != null) {
                      Util.toUTF32(currentEntry, scratchInts);
                      words.add(scratchInts.get(), currentOrds.get());
                  }
                  // swap current
                  if (cmp > 0 || currentEntry == null) {
                      currentEntry = entry;
                      currentOrds = new IntsRefBuilder(); // must be this way
                  }
                  if (hasStemExceptions) {
                      currentOrds.append(ord);
                      currentOrds.append(stemExceptionID);
                  } else {
                      currentOrds.append(ord);
                  }
              }
          }

          // finalize last entry
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
          success2 = true;
      } finally {
          IOUtils.closeWhileHandlingException(reader);
          if (success2) {
              Files.delete(sorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(sorted);
          }
      }
  }

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Builds an FST whose inputs are the affix strings (as UTF-32 code points) and whose
 * outputs are the associated character lists, each char widened to an int and packed
 * into an {@link IntsRef}.
 *
 * @param affixes affix string to list of characters, iterated in the map's key order
 * @return the finished FST as returned by the builder
 * @throws IOException if the builder fails while adding an entry or finishing
 */
private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException {
    final Builder<IntsRef> fstBuilder =
            new Builder<>(FST.INPUT_TYPE.BYTE4, IntSequenceOutputs.getSingleton());

    // One scratch buffer is reused for every key.
    final IntsRef keyScratch = new IntsRef();
    for (Map.Entry<String, List<Character>> affix : affixes.entrySet()) {
        Util.toUTF32(affix.getKey(), keyScratch);
        final List<Character> values = affix.getValue();
        // Sized exactly for this entry; length starts at 0 and grows as values are copied in.
        final IntsRef packed = new IntsRef(values.size());
        for (Character value : values) {
            packed.ints[packed.length] = value;
            packed.length++;
        }
        fstBuilder.add(keyScratch, packed);
    }
    return fstBuilder.finish();
}

From source file:stemmer.Dictionary.java

License:Apache License

/**
 * Reads the dictionary file through the provided InputStreams, building up the words map.
 *
 * <p>Every line after the first is written to a temporary file, that file is sorted offline,
 * and the sorted rows are read back so that consecutive rows sharing the same entry are added
 * to {@code words} as one key with all of their ordinals appended. Both temporary files are
 * removed on success and on failure.
 *
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @param words builder receiving each distinct entry (as UTF-32 code points) and its ordinals
 * @throws IOException Can be thrown while reading from the file
 * @throws IllegalArgumentException if the sorted rows are not in ascending entry order
 */
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
        throws IOException {
    BytesRef flagsScratch = new BytesRef();
    IntsRef scratchInts = new IntsRef();

    StringBuilder sb = new StringBuilder();

    File unsorted = File.createTempFile("unsorted", "dat", tempDir);
    ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
    boolean success = false;
    try {
        for (InputStream dictionary : dictionaries) {
            // NOTE(review): the reader is not closed here; closing it would also close the
            // supplied stream, which presumably belongs to the caller -- confirm.
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            lines.readLine(); // first line is number of entries (approximately, sometimes)

            String line;
            while ((line = lines.readLine()) != null) {
                line = unescapeEntry(line);
                if (needsInputCleaning) {
                    int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        // clean only the text before the separator, then re-attach the rest verbatim
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        if (cleansed != sb) {
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        success = true;
    } finally {
        if (success) {
            IOUtils.close(writer);
        } else {
            IOUtils.closeWhileHandlingException(writer);
            unsorted.delete(); // best effort: do not leave a partial temp file behind
        }
    }

    OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
        BytesRef scratch1 = new BytesRef();
        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            int cmp = beforeLastFlagSeparator(o1, scratch1).compareTo(beforeLastFlagSeparator(o2, scratch2));
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            }
            return cmp;
        }

        /**
         * Points {@code view} at the bytes of {@code row}, truncated just before the last
         * FLAG_SEPARATOR byte if one is present. No bytes are copied.
         */
        private BytesRef beforeLastFlagSeparator(BytesRef row, BytesRef view) {
            view.bytes = row.bytes;
            view.offset = row.offset;
            view.length = row.length;
            for (int i = view.length - 1; i >= 0; i--) {
                if (view.bytes[view.offset + i] == FLAG_SEPARATOR) {
                    view.length = i;
                    break;
                }
            }
            return view;
        }
    });

    File sorted = null;
    boolean sortSucceeded = false;
    try {
        sorted = File.createTempFile("sorted", "dat", tempDir);
        sorter.sort(unsorted, sorted);
        sortSucceeded = true;
    } finally {
        // the unsorted spool is no longer needed whether or not sorting worked
        unsorted.delete();
        if (!sortSucceeded && sorted != null) {
            sorted.delete();
        }
    }

    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    boolean readSucceeded = false;
    try {
        BytesRef scratchLine = new BytesRef();

        // TODO: the flags themselves can be double-chars (long) or also numeric
        // either way the trick is to encode them as char... but they must be parsed differently

        String currentEntry = null;
        IntsRef currentOrds = new IntsRef();

        while (reader.read(scratchLine)) {
            String line = scratchLine.utf8ToString();
            String entry;
            char[] wordForm;

            int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
            if (flagSep == -1) {
                wordForm = NOFLAGS;
                entry = line;
            } else {
                // note, there can be comments (morph description) after a flag.
                // we should really look for any whitespace: currently just tab and space
                int end = line.indexOf('\t', flagSep);
                if (end == -1) {
                    end = line.length();
                }
                int end2 = line.indexOf(' ', flagSep);
                if (end2 == -1) {
                    end2 = line.length();
                }
                end = Math.min(end, end2);

                String flagPart = line.substring(flagSep + 1, end);
                if (aliasCount > 0) {
                    flagPart = getAliasValue(Integer.parseInt(flagPart));
                }

                wordForm = flagParsingStrategy.parseFlags(flagPart);
                Arrays.sort(wordForm);
                entry = line.substring(0, flagSep);
            }

            int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
            if (cmp < 0) {
                throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
            }

            encodeFlags(flagsScratch, wordForm);
            int ord = flagLookup.add(flagsScratch);
            if (ord < 0) {
                // already exists in our hash
                ord = (-ord) - 1;
            }
            // finalize current entry, and switch "current" if necessary
            if (cmp > 0 && currentEntry != null) {
                Util.toUTF32(currentEntry, scratchInts);
                words.add(scratchInts, currentOrds);
            }
            // swap current
            if (cmp > 0 || currentEntry == null) {
                currentEntry = entry;
                currentOrds = new IntsRef(); // must be this way
            }
            currentOrds.grow(currentOrds.length + 1);
            currentOrds.ints[currentOrds.length++] = ord;
        }

        // finalize last entry; currentEntry is still null when the sorted file held no rows
        // (for example a dictionary consisting only of its header line), so there is nothing to add
        if (currentEntry != null) {
            Util.toUTF32(currentEntry, scratchInts);
            words.add(scratchInts, currentOrds);
        }
        readSucceeded = true;
    } finally {
        try {
            if (readSucceeded) {
                IOUtils.close(reader);
            } else {
                // keep the original exception; a failure in close() must not mask it
                IOUtils.closeWhileHandlingException(reader);
            }
        } finally {
            sorted.delete();
        }
    }
}