Example usage for org.apache.lucene.util ArrayUtil grow

List of usage examples for org.apache.lucene.util ArrayUtil grow

Introduction

On this page you can find example usages of org.apache.lucene.util.ArrayUtil.grow. Note that grow is overloaded for several array types (char[], byte[], int[], long[], float[], double[]); the examples below exercise different overloads, not only the char[] prototype shown.

Prototype

public static char[] grow(char[] array, int minSize) 

Source Link

Document

Returns an array whose size is at least minSize, generally over-allocating exponentially.

Usage

From source file:com.flonkings.ml.utils.IOUtils.java

License:Apache License

/**
 * Loads a CSV data set in which every column except the last holds a feature
 * value and the last column holds the label.
 *
 * <p>Blank lines are skipped. The caller owns {@code stream} and is
 * responsible for closing it. Rows may have differing widths; each feature
 * row is stored with exactly the width found on its line.
 *
 * @param stream CSV input, decoded as UTF-8
 * @return a DataSet holding the parsed feature rows and label vector,
 *         trimmed to the number of rows actually read
 * @throws IOException if reading from the stream fails
 * @throws NumberFormatException if a cell is not a parseable double
 */
public static DataSet loadCSVDataSet(InputStream stream) throws IOException {
    // StandardCharsets.UTF_8 instead of Charset.forName("UTF-8"): no by-name
    // lookup, no UnsupportedCharsetException path, no string-typo risk.
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8));
    String line;
    double[][] data = new double[1][];
    double[] labels = new double[1];
    int offset = 0;
    while ((line = reader.readLine()) != null) {
        // A blank line would previously crash on Double.parseDouble(""); skip it.
        if (line.isEmpty()) {
            continue;
        }
        String[] split = line.split(",");
        // Grow both parallel arrays together when capacity is exhausted;
        // ArrayUtil.grow over-allocates, so growth is amortized O(1).
        if (labels.length == offset) {
            labels = ArrayUtil.grow(labels, offset + 1);
            double[][] newData = new double[labels.length][];
            System.arraycopy(data, 0, newData, 0, data.length);
            data = newData;
        }
        data[offset] = new double[split.length - 1];
        for (int i = 0; i < split.length - 1; i++) {
            data[offset][i] = Double.parseDouble(split[i]);
        }
        // The last column is the label.
        labels[offset++] = Double.parseDouble(split[split.length - 1]);
    }
    // Trim the over-allocated arrays down to the rows actually read.
    return new DataSet(shrink(data, offset), shrink(labels, offset));
}

From source file:com.lucure.core.codec.LucurePostingsWriter.java

License:Apache License

/**
 * Buffers one position for the current term/document, together with its
 * optional payload bytes and character offsets; once BLOCK_SIZE positions
 * have accumulated, encodes and flushes them as a packed block.
 *
 * @param position    absolute position of this occurrence (delta-encoded here)
 * @param payload     payload for this position, or null/empty for none
 * @param startOffset start character offset (used only when fieldHasOffsets)
 * @param endOffset   end character offset (used only when fieldHasOffsets)
 * @throws IOException if flushing a completed block to the outputs fails
 */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    // Positions are stored as deltas against the previous position in this doc.
    posDeltaBuffer[posBufferUpto] = position - lastPosition;
    if (fieldHasPayloads) {
        if (payload == null || payload.length == 0) {
            // no payload
            payloadLengthBuffer[posBufferUpto] = 0;
        } else {
            payloadLengthBuffer[posBufferUpto] = payload.length;
            // Grow the shared payload byte buffer before appending this payload.
            if (payloadByteUpto + payload.length > payloadBytes.length) {
                payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length);
            }
            System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length);
            payloadByteUpto += payload.length;
        }
    }

    if (fieldHasOffsets) {
        assert startOffset >= lastStartOffset;
        assert endOffset >= startOffset;
        // Offsets are stored as (start-delta, length) pairs.
        offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset;
        offsetLengthBuffer[posBufferUpto] = endOffset - startOffset;
        lastStartOffset = startOffset;
    }

    posBufferUpto++;
    lastPosition = position;
    if (posBufferUpto == BLOCK_SIZE) {
        // A full block is buffered: bulk-encode the deltas and flush.
        forUtil.writeBlock(posDeltaBuffer, encoded, posOut);

        if (fieldHasPayloads) {
            forUtil.writeBlock(payloadLengthBuffer, encoded, payOut);
            payOut.writeVInt(payloadByteUpto);
            payOut.writeBytes(payloadBytes, 0, payloadByteUpto);
            payloadByteUpto = 0;
        }
        if (fieldHasOffsets) {
            forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut);
            forUtil.writeBlock(offsetLengthBuffer, encoded, payOut);
        }
        posBufferUpto = 0;
    }
}

From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java

License:Apache License

/**
 * Positions this enum so that (per the in-line note below) the first
 * subsequent call to next() returns the first term after {@code target}.
 * Descends into sub-blocks whose label is a prefix of the target, and rolls
 * the frame cursor back one entry when the scan overshoots the target.
 *
 * @param target the term to position just before
 * @throws IOException if reading a block from the underlying input fails
 */
private void seekToStartTerm(BytesRef target) throws IOException {
    // Must be called with only the root frame pushed.
    assert currentFrame.ord == 0;
    if (term.length < target.length) {
        term.bytes = ArrayUtil.grow(term.bytes, target.length);
    }
    FST.Arc<BytesRef> arc = arcs[0];
    assert arc == currentFrame.arc;

    for (int idx = 0; idx <= target.length; idx++) {

        while (true) {
            // Snapshot the frame's cursor so we can roll back one entry if we
            // scan past the target (see the cmp > 0 branch below).
            final int savNextEnt = currentFrame.nextEnt;
            final int savePos = currentFrame.suffixesReader.getPosition();
            final int saveStartBytePos = currentFrame.startBytePos;
            final int saveSuffix = currentFrame.suffix;
            final long saveLastSubFP = currentFrame.lastSubFP;
            final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
            final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm;

            final boolean isSubBlock = currentFrame.next();

            // Materialize the current entry (shared prefix + suffix) into term.
            term.length = currentFrame.prefix + currentFrame.suffix;
            if (term.bytes.length < term.length) {
                term.bytes = ArrayUtil.grow(term.bytes, term.length);
            }
            System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes,
                    currentFrame.prefix, currentFrame.suffix);

            if (isSubBlock && StringHelper.startsWith(target, term)) {
                // Recurse
                currentFrame = pushFrame(getState());
                break;
            } else {
                final int cmp = term.compareTo(target);
                if (cmp < 0) {
                    if (currentFrame.nextEnt == currentFrame.entCount) {
                        if (!currentFrame.isLastInFloor) {
                            // Advance to next floor block
                            currentFrame.loadNextFloorBlock();
                            continue;
                        } else {
                            // Last floor block exhausted: nothing >= target here.
                            return;
                        }
                    }
                    continue;
                } else if (cmp == 0) {
                    // Exact hit; skip it if it is a disallowed auto-prefix term.
                    if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) {
                        continue;
                    }
                    return;
                } else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) {
                    // Fallback to prior entry: the semantics of
                    // this method is that the first call to
                    // next() will return the term after the
                    // requested term
                    currentFrame.nextEnt = savNextEnt;
                    currentFrame.lastSubFP = saveLastSubFP;
                    currentFrame.startBytePos = saveStartBytePos;
                    currentFrame.suffix = saveSuffix;
                    currentFrame.suffixesReader.setPosition(savePos);
                    currentFrame.termState.termBlockOrd = saveTermBlockOrd;
                    currentFrame.isAutoPrefixTerm = saveIsAutoPrefixTerm;
                    System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes,
                            currentFrame.prefix, currentFrame.suffix);
                    term.length = currentFrame.prefix + currentFrame.suffix;
                    // If the last entry was a block we don't
                    // need to bother recursing and pushing to
                    // the last term under it because the first
                    // next() will simply skip the frame anyway
                    return;
                }
            }
        }
    }

    // One of the branches above must return; falling out of the loop is a bug.
    assert false;
}

From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java

License:Apache License

/**
 * Copies the current frame's suffix bytes into {@code term}, placing them
 * immediately after the shared prefix, and updates the term length.
 */
private void copyTerm() {
    final int prefixLen = currentFrame.prefix;
    final int suffixLen = currentFrame.suffix;
    final int totalLen = prefixLen + suffixLen;
    // Make sure the term buffer can hold prefix + suffix before copying.
    if (term.bytes.length < totalLen) {
        term.bytes = ArrayUtil.grow(term.bytes, totalLen);
    }
    System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, prefixLen, suffixLen);
    term.length = totalLen;
}

From source file:com.rocana.lucene.codec.v1.RocanaStats.java

License:Apache License

/**
 * Records statistics for one visited block: total/floor/non-floor counters,
 * the per-prefix-length histogram, and accumulated suffix/stats byte sizes.
 *
 * @param frame   the frame whose block is being entered
 * @param isFloor whether this block belongs to a floor-split series
 */
void startBlock(RocanaSegmentTermsEnumFrame frame, boolean isFloor) {
    totalBlockCount++;
    if (!isFloor) {
        nonFloorBlockCount++;
    } else {
        // The original (first) floor block counts once; every floor
        // sub-block, including the first, is tallied below.
        if (frame.fp == frame.fpOrig) {
            floorBlockCount++;
        }
        floorSubBlockCount++;
    }

    final int prefixLen = frame.prefix;
    // Histogram of block counts keyed by prefix length; grow on demand.
    if (blockCountByPrefixLen.length <= prefixLen) {
        blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1 + prefixLen);
    }
    blockCountByPrefixLen[prefixLen]++;
    startBlockCount++;
    totalBlockSuffixBytes += frame.suffixesReader.length();
    totalBlockStatsBytes += frame.statsReader.length();
}

From source file:elhuyar.bilakit.Dictionary.java

License:Apache License

/**
   * Parses a specific affix rule putting the result into the provided affix map.
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
   * @param seenStrips map from strip string -&gt; ordinal, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four elements
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      // NOTE(review): reference (==) comparison — this assumes callers pass the
      // SUFFIX_CONDITION_REGEX_PATTERN constant itself, not an equal copy; confirm.
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      int numLines = Integer.parseInt(args[3]);
      // Each affix is encoded as four shorts (8 bytes), hence the << 3.
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              // When aliases are in use, the flag part is an alias index, not flags.
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue: an unclosed character class
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              // Pattern ordinals are written as shorts (shifted left by 1 below).
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strip strings and assign each a stable ordinal
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but it's a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          // Fixed 8-byte record: flag, strip ordinal, pattern ordinal, append flags ordinal.
          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          // Suffixes are keyed reversed so lookup can proceed from the word's end.
          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }

From source file:elhuyar.bilakit.Stemmer.java

License:Apache License

/**
 * Finds the stem(s) of the provided word.
 *
 * @param word characters of the word to stem
 * @param length number of valid characters in {@code word}
 * @return list of stems found for the word
 */
public List<CharsRef> stem(char word[], int length) {
    if (dictionary.needsInputCleaning) {
        // Run the input through the dictionary's cleaning rules first.
        scratchSegment.setLength(0);
        scratchSegment.append(word, 0, length);
        CharSequence normalized = dictionary.cleanInput(scratchSegment, segment);
        scratchBuffer = ArrayUtil.grow(scratchBuffer, normalized.length());
        length = segment.length();
        segment.getChars(0, length, scratchBuffer, 0);
        word = scratchBuffer;
    }

    final int caseType = caseOf(word, length);
    if (caseType == UPPER_CASE) {
        // All-caps word: union the exact, title-case, and lowercase lookups.
        caseFoldTitle(word, length);
        caseFoldLower(titleBuffer, length);
        List<CharsRef> stems = doStem(word, length, false);
        stems.addAll(doStem(titleBuffer, length, true));
        stems.addAll(doStem(lowerBuffer, length, true));
        return stems;
    }
    if (caseType == TITLE_CASE) {
        // Title-cased word: union the exact and lowercase lookups.
        caseFoldLower(word, length);
        List<CharsRef> stems = doStem(word, length, false);
        stems.addAll(doStem(lowerBuffer, length, true));
        return stems;
    }
    // Any other casing: exact match only.
    return doStem(word, length, false);
}

From source file:elhuyar.bilakit.Stemmer.java

License:Apache License

/**
 * Writes the title-case variant of {@code word} into {@code titleBuffer}:
 * the first character is kept as-is, every following character is case-folded.
 */
private void caseFoldTitle(char word[], int length) {
    titleBuffer = ArrayUtil.grow(titleBuffer, length);
    // Single pass: copy the leading char unchanged, fold the rest.
    for (int pos = 0; pos < length; pos++) {
        titleBuffer[pos] = pos == 0 ? word[pos] : dictionary.caseFold(word[pos]);
    }
}

From source file:elhuyar.bilakit.Stemmer.java

License:Apache License

/**
 * Writes the lowercase variant of a title-cased {@code word} into
 * {@code lowerBuffer}: the word is copied and only its first character is
 * case-folded (the remainder is assumed already lowercase).
 */
private void caseFoldLower(char word[], int length) {
    lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
    for (int pos = 0; pos < length; pos++) {
        lowerBuffer[pos] = word[pos];
    }
    // Fold just the leading character, matching the original unconditional fold.
    lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
}

From source file:hunspell_stemmer.Dictionary.java

License:Apache License

/**
   * Parses a specific affix rule putting the result into the provided affix map.
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
   * @param seenStrips map from strip string -&gt; ordinal, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four elements
   */
  private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
          String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
          throws IOException, ParseException {

      BytesRefBuilder scratch = new BytesRefBuilder();
      StringBuilder sb = new StringBuilder();
      String args[] = header.split("\\s+");

      boolean crossProduct = args[2].equals("Y");
      // NOTE(review): reference (==) comparison — this assumes callers pass the
      // SUFFIX_CONDITION_REGEX_PATTERN constant itself, not an equal copy; confirm.
      boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;

      int numLines = Integer.parseInt(args[3]);
      // Each affix is encoded as four shorts (8 bytes), hence the << 3.
      affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
      ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);

      for (int i = 0; i < numLines; i++) {
          assert affixWriter.getPosition() == currentAffix << 3;
          String line = reader.readLine();
          String ruleArgs[] = line.split("\\s+");

          // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
          // condition is optional
          if (ruleArgs.length < 4) {
              throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                      reader.getLineNumber());
          }

          char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
          String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
          String affixArg = ruleArgs[3];
          char appendFlags[] = null;

          // first: parse continuation classes out of affix
          int flagSep = affixArg.lastIndexOf('/');
          if (flagSep != -1) {
              String flagPart = affixArg.substring(flagSep + 1);
              affixArg = affixArg.substring(0, flagSep);

              // When aliases are in use, the flag part is an alias index, not flags.
              if (aliasCount > 0) {
                  flagPart = getAliasValue(Integer.parseInt(flagPart));
              }

              appendFlags = flagParsingStrategy.parseFlags(flagPart);
              Arrays.sort(appendFlags);
              twoStageAffix = true;
          }
          // zero affix -> empty string
          if ("0".equals(affixArg)) {
              affixArg = "";
          }

          String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
          // at least the gascon affix file has this issue: an unclosed character class
          if (condition.startsWith("[") && condition.indexOf(']') == -1) {
              condition = condition + "]";
          }
          // "dash hasn't got special meaning" (we must escape it)
          if (condition.indexOf('-') >= 0) {
              condition = escapeDash(condition);
          }

          final String regex;
          if (".".equals(condition)) {
              regex = ".*"; // Zero condition is indicated by dot
          } else if (condition.equals(strip)) {
              regex = ".*"; // TODO: optimize this better:
                            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
                            // but this is complicated...
          } else {
              regex = String.format(Locale.ROOT, conditionPattern, condition);
          }

          // deduplicate patterns
          Integer patternIndex = seenPatterns.get(regex);
          if (patternIndex == null) {
              patternIndex = patterns.size();
              // Pattern ordinals are written as shorts (shifted left by 1 below).
              if (patternIndex > Short.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many patterns, please report this to dev@lucene.apache.org");
              }
              seenPatterns.put(regex, patternIndex);
              CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                      new RegExp(regex, RegExp.NONE).toAutomaton());
              patterns.add(pattern);
          }

          // deduplicate strip strings and assign each a stable ordinal
          Integer stripOrd = seenStrips.get(strip);
          if (stripOrd == null) {
              stripOrd = seenStrips.size();
              seenStrips.put(strip, stripOrd);
              if (stripOrd > Character.MAX_VALUE) {
                  throw new UnsupportedOperationException(
                          "Too many unique strips, please report this to dev@lucene.apache.org");
              }
          }

          if (appendFlags == null) {
              appendFlags = NOFLAGS;
          }

          encodeFlags(scratch, appendFlags);
          int appendFlagsOrd = flagLookup.add(scratch.get());
          if (appendFlagsOrd < 0) {
              // already exists in our hash
              appendFlagsOrd = (-appendFlagsOrd) - 1;
          } else if (appendFlagsOrd > Short.MAX_VALUE) {
              // this limit is probably flexible, but it's a good sanity check too
              throw new UnsupportedOperationException(
                      "Too many unique append flags, please report this to dev@lucene.apache.org");
          }

          // Fixed 8-byte record: flag, strip ordinal, pattern ordinal, append flags ordinal.
          affixWriter.writeShort((short) flag);
          affixWriter.writeShort((short) stripOrd.intValue());
          // encode crossProduct into patternIndex
          int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
          affixWriter.writeShort((short) patternOrd);
          affixWriter.writeShort((short) appendFlagsOrd);

          if (needsInputCleaning) {
              CharSequence cleaned = cleanInput(affixArg, sb);
              affixArg = cleaned.toString();
          }

          // Suffixes are keyed reversed so lookup can proceed from the word's end.
          if (isSuffix) {
              affixArg = new StringBuilder(affixArg).reverse().toString();
          }

          List<Integer> list = affixes.get(affixArg);
          if (list == null) {
              list = new ArrayList<>();
              affixes.put(affixArg, list);
          }
          list.add(currentAffix);
          currentAffix++;
      }
  }