Example usage for org.apache.lucene.util IntsRefBuilder append

List of usage examples for org.apache.lucene.util IntsRefBuilder append

Introduction

On this page you can find example usages of the append method of org.apache.lucene.util.IntsRefBuilder.

Prototype

public void append(int i) 

Source Link

Document

Append the provided int to this buffer.

Usage

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *//from  w ww.j a va 2s  .  c o m
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      File unsorted = File.createTempFile("unsorted", "dat", tempDir);
      ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
      boolean success = false;
      try {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
          success = true;
      } finally {
          if (success) {
              IOUtils.close(writer);
          } else {
              IOUtils.closeWhileHandlingException(writer);
          }
      }
      File sorted = File.createTempFile("sorted", "dat", tempDir);

      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      sorter.sort(unsorted, sorted);
      unsorted.delete();

      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      BytesRefBuilder scratchLine = new BytesRefBuilder();

      // TODO: the flags themselves can be double-chars (long) or also numeric
      // either way the trick is to encode them as char... but they must be parsed differently

      String currentEntry = null;
      IntsRefBuilder currentOrds = new IntsRefBuilder();

      String line;
      while (reader.read(scratchLine)) {
          line = scratchLine.get().utf8ToString();
          String entry;
          char wordForm[];
          int end;

          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
              wordForm = NOFLAGS;
              end = line.indexOf(MORPH_SEPARATOR);
              entry = line.substring(0, end);
          } else {
              end = line.indexOf(MORPH_SEPARATOR);
              String flagPart = line.substring(flagSep + 1, end);
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              wordForm = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(wordForm);
              entry = line.substring(0, flagSep);
          }
          // we possibly have morphological data
          int stemExceptionID = 0;
          if (hasStemExceptions && end + 1 < line.length()) {
              String stemException = parseStemException(line.substring(end + 1));
              if (stemException != null) {
                  if (stemExceptionCount == stemExceptions.length) {
                      int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                              RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                      stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                  }
                  stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                  stemExceptions[stemExceptionCount++] = stemException;
              }
          }

          int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
          if (cmp < 0) {
              throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
          } else {
              encodeFlags(flagsScratch, wordForm);
              int ord = flagLookup.add(flagsScratch.get());
              if (ord < 0) {
                  // already exists in our hash
                  ord = (-ord) - 1;
              }
              // finalize current entry, and switch "current" if necessary
              if (cmp > 0 && currentEntry != null) {
                  Util.toUTF32(currentEntry, scratchInts);
                  words.add(scratchInts.get(), currentOrds.get());
              }
              // swap current
              if (cmp > 0 || currentEntry == null) {
                  currentEntry = entry;
                  currentOrds = new IntsRefBuilder(); // must be this way
              }
              if (hasStemExceptions) {
                  currentOrds.append(ord);
                  currentOrds.append(stemExceptionID);
              } else {
                  currentOrds.append(ord);
              }
          }
      }

      // finalize last entry
      Util.toUTF32(currentEntry, scratchInts);
      words.add(scratchInts.get(), currentOrds.get());

      reader.close();
      sorted.delete();
  }

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Reads the dictionary file through the provided InputStreams, building up the words map
   */*w ww  .  ja  v  a 2  s.com*/
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
          throws IOException {
      BytesRefBuilder flagsScratch = new BytesRefBuilder();
      IntsRefBuilder scratchInts = new IntsRefBuilder();

      StringBuilder sb = new StringBuilder();

      Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat");
      try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
          for (InputStream dictionary : dictionaries) {
              BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
              String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

              while ((line = lines.readLine()) != null) {
                  // wild and unpredictable code comment rules
                  if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#'
                          || line.charAt(0) == '\t') {
                      continue;
                  }
                  line = unescapeEntry(line);
                  // if we havent seen any stem exceptions, try to parse one
                  if (hasStemExceptions == false) {
                      int morphStart = line.indexOf(MORPH_SEPARATOR);
                      if (morphStart >= 0 && morphStart < line.length()) {
                          hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                      }
                  }
                  if (needsInputCleaning) {
                      int flagSep = line.indexOf(FLAG_SEPARATOR);
                      if (flagSep == -1) {
                          flagSep = line.indexOf(MORPH_SEPARATOR);
                      }
                      if (flagSep == -1) {
                          CharSequence cleansed = cleanInput(line, sb);
                          writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                      } else {
                          String text = line.substring(0, flagSep);
                          CharSequence cleansed = cleanInput(text, sb);
                          if (cleansed != sb) {
                              sb.setLength(0);
                              sb.append(cleansed);
                          }
                          sb.append(line.substring(flagSep));
                          writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                      }
                  } else {
                      writer.write(line.getBytes(StandardCharsets.UTF_8));
                  }
              }
          }
      }
      Path sorted = Files.createTempFile(tempDir, "sorted", "dat");

      OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
          BytesRef scratch1 = new BytesRef();
          BytesRef scratch2 = new BytesRef();

          @Override
          public int compare(BytesRef o1, BytesRef o2) {
              scratch1.bytes = o1.bytes;
              scratch1.offset = o1.offset;
              scratch1.length = o1.length;

              for (int i = scratch1.length - 1; i >= 0; i--) {
                  if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
                          || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                      scratch1.length = i;
                      break;
                  }
              }

              scratch2.bytes = o2.bytes;
              scratch2.offset = o2.offset;
              scratch2.length = o2.length;

              for (int i = scratch2.length - 1; i >= 0; i--) {
                  if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
                          || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                      scratch2.length = i;
                      break;
                  }
              }

              int cmp = scratch1.compareTo(scratch2);
              if (cmp == 0) {
                  // tie break on whole row
                  return o1.compareTo(o2);
              } else {
                  return cmp;
              }
          }
      });
      boolean success = false;
      try {
          sorter.sort(unsorted, sorted);
          success = true;
      } finally {
          if (success) {
              Files.delete(unsorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(unsorted);
          }
      }

      boolean success2 = false;
      ByteSequencesReader reader = new ByteSequencesReader(sorted);
      try {
          BytesRefBuilder scratchLine = new BytesRefBuilder();

          // TODO: the flags themselves can be double-chars (long) or also numeric
          // either way the trick is to encode them as char... but they must be parsed differently

          String currentEntry = null;
          IntsRefBuilder currentOrds = new IntsRefBuilder();

          String line;
          while (reader.read(scratchLine)) {
              line = scratchLine.get().utf8ToString();
              String entry;
              char wordForm[];
              int end;

              int flagSep = line.indexOf(FLAG_SEPARATOR);
              if (flagSep == -1) {
                  wordForm = NOFLAGS;
                  end = line.indexOf(MORPH_SEPARATOR);
                  entry = line.substring(0, end);
              } else {
                  end = line.indexOf(MORPH_SEPARATOR);
                  String flagPart = line.substring(flagSep + 1, end);
                  if (aliasCount > 0) {
                      flagPart = getAliasValue(Integer.parseInt(flagPart));
                  }

                  wordForm = flagParsingStrategy.parseFlags(flagPart);
                  Arrays.sort(wordForm);
                  entry = line.substring(0, flagSep);
              }
              // we possibly have morphological data
              int stemExceptionID = 0;
              if (hasStemExceptions && end + 1 < line.length()) {
                  String stemException = parseStemException(line.substring(end + 1));
                  if (stemException != null) {
                      if (stemExceptionCount == stemExceptions.length) {
                          int newSize = ArrayUtil.oversize(stemExceptionCount + 1,
                                  RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                          stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                      }
                      stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
                      stemExceptions[stemExceptionCount++] = stemException;
                  }
              }

              int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
              if (cmp < 0) {
                  throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
              } else {
                  encodeFlags(flagsScratch, wordForm);
                  int ord = flagLookup.add(flagsScratch.get());
                  if (ord < 0) {
                      // already exists in our hash
                      ord = (-ord) - 1;
                  }
                  // finalize current entry, and switch "current" if necessary
                  if (cmp > 0 && currentEntry != null) {
                      Util.toUTF32(currentEntry, scratchInts);
                      words.add(scratchInts.get(), currentOrds.get());
                  }
                  // swap current
                  if (cmp > 0 || currentEntry == null) {
                      currentEntry = entry;
                      currentOrds = new IntsRefBuilder(); // must be this way
                  }
                  if (hasStemExceptions) {
                      currentOrds.append(ord);
                      currentOrds.append(stemExceptionID);
                  } else {
                      currentOrds.append(ord);
                  }
              }
          }

          // finalize last entry
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
          success2 = true;
      } finally {
          IOUtils.closeWhileHandlingException(reader);
          if (success2) {
              Files.delete(sorted);
          } else {
              IOUtils.deleteFilesIgnoringExceptions(sorted);
          }
      }
  }