List of usage examples for org.apache.lucene.util.ArrayUtil#grow
public static char[] grow(char[] array, int minSize)
From source file:com.flonkings.ml.utils.IOUtils.java
License:Apache License
public static DataSet loadCSVDataSet(InputStream stream) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader(stream, Charset.forName("UTF-8"))); String line;//w w w . j a v a2 s . c o m double[][] data = new double[1][]; double[] labels = new double[1]; int offset = 0; while ((line = reader.readLine()) != null) { String[] split = line.split(","); if (labels.length == offset) { labels = ArrayUtil.grow(labels, offset + 1); double[][] newData = new double[labels.length][]; System.arraycopy(data, 0, newData, 0, data.length); data = newData; } data[offset] = new double[split.length - 1]; for (int i = 0; i < split.length - 1; i++) { data[offset][i] = Double.parseDouble(split[i]); } labels[offset++] = Double.parseDouble(split[split.length - 1]); } return new DataSet(shrink(data, offset), shrink(labels, offset)); }
From source file:com.lucure.core.codec.LucurePostingsWriter.java
License:Apache License
/** Add a new position & payload */ @Override// w ww.j a va 2 s . c o m public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { // if (DEBUG) { // System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: "")); // } posDeltaBuffer[posBufferUpto] = position - lastPosition; if (fieldHasPayloads) { if (payload == null || payload.length == 0) { // no payload payloadLengthBuffer[posBufferUpto] = 0; } else { payloadLengthBuffer[posBufferUpto] = payload.length; if (payloadByteUpto + payload.length > payloadBytes.length) { payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); } System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); payloadByteUpto += payload.length; } } if (fieldHasOffsets) { assert startOffset >= lastStartOffset; assert endOffset >= startOffset; offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; lastStartOffset = startOffset; } posBufferUpto++; lastPosition = position; if (posBufferUpto == BLOCK_SIZE) { // if (DEBUG) { // System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer()); // } forUtil.writeBlock(posDeltaBuffer, encoded, posOut); if (fieldHasPayloads) { forUtil.writeBlock(payloadLengthBuffer, encoded, payOut); payOut.writeVInt(payloadByteUpto); payOut.writeBytes(payloadBytes, 0, payloadByteUpto); payloadByteUpto = 0; } if (fieldHasOffsets) { forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut); forUtil.writeBlock(offsetLengthBuffer, encoded, payOut); } posBufferUpto = 0; } }
From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java
License:Apache License
/**
 * Positions this enum just before {@code target}, so that the first call to next()
 * returns the first term at or after the requested term. Walks the term blocks,
 * saving frame state before each next() so it can roll back when it overshoots.
 */
private void seekToStartTerm(BytesRef target) throws IOException {
    assert currentFrame.ord == 0;
    if (term.length < target.length) {
        term.bytes = ArrayUtil.grow(term.bytes, target.length);
    }
    FST.Arc<BytesRef> arc = arcs[0];
    assert arc == currentFrame.arc;
    for (int idx = 0; idx <= target.length; idx++) {
        while (true) {
            // Snapshot the frame's cursor state so we can restore it if the entry
            // we advance to turns out to be past the target.
            final int savNextEnt = currentFrame.nextEnt;
            final int savePos = currentFrame.suffixesReader.getPosition();
            final int saveStartBytePos = currentFrame.startBytePos;
            final int saveSuffix = currentFrame.suffix;
            final long saveLastSubFP = currentFrame.lastSubFP;
            final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
            final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm;
            final boolean isSubBlock = currentFrame.next();
            // Rebuild the current term = frame prefix + the entry's suffix bytes.
            term.length = currentFrame.prefix + currentFrame.suffix;
            if (term.bytes.length < term.length) {
                term.bytes = ArrayUtil.grow(term.bytes, term.length);
            }
            System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix,
                    currentFrame.suffix);
            if (isSubBlock && StringHelper.startsWith(target, term)) {
                // Recurse: target lies inside this sub-block, descend into it.
                currentFrame = pushFrame(getState());
                break;
            } else {
                final int cmp = term.compareTo(target);
                if (cmp < 0) {
                    // Still before the target; keep scanning this block.
                    if (currentFrame.nextEnt == currentFrame.entCount) {
                        if (!currentFrame.isLastInFloor) {
                            // Advance to next floor block
                            currentFrame.loadNextFloorBlock();
                            continue;
                        } else {
                            // Block exhausted: next() will move past it naturally.
                            return;
                        }
                    }
                    continue;
                } else if (cmp == 0) {
                    // Exact hit; skip it only if it is an auto-prefix term we must hide.
                    if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) {
                        continue;
                    }
                    return;
                } else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) {
                    // Fallback to prior entry: the semantics of
                    // this method is that the first call to
                    // next() will return the term after the
                    // requested term
                    currentFrame.nextEnt = savNextEnt;
                    currentFrame.lastSubFP = saveLastSubFP;
                    currentFrame.startBytePos = saveStartBytePos;
                    currentFrame.suffix = saveSuffix;
                    currentFrame.suffixesReader.setPosition(savePos);
                    currentFrame.termState.termBlockOrd = saveTermBlockOrd;
                    currentFrame.isAutoPrefixTerm = saveIsAutoPrefixTerm;
                    System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes,
                            currentFrame.prefix, currentFrame.suffix);
                    term.length = currentFrame.prefix + currentFrame.suffix;
                    // If the last entry was a block we don't
                    // need to bother recursing and pushing to
                    // the last term under it because the first
                    // next() will simply skip the frame anyway
                    return;
                }
            }
        }
    }
    // The loop must return or recurse before running off the end of the target.
    assert false;
}
From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java
License:Apache License
private void copyTerm() { final int len = currentFrame.prefix + currentFrame.suffix; if (term.bytes.length < len) { term.bytes = ArrayUtil.grow(term.bytes, len); }//from w w w.j a v a 2 s . c om System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); term.length = len; }
From source file:com.rocana.lucene.codec.v1.RocanaStats.java
License:Apache License
/**
 * Records statistics for one term block as it is entered: block-type counters,
 * the per-prefix-length histogram, and the suffix/stats byte totals.
 */
void startBlock(RocanaSegmentTermsEnumFrame frame, boolean isFloor) {
    totalBlockCount++;
    if (!isFloor) {
        nonFloorBlockCount++;
    } else {
        // A floor frame whose file pointer still equals its original pointer is
        // the first (root) floor block; every floor frame counts as a sub-block.
        if (frame.fp == frame.fpOrig) {
            floorBlockCount++;
        }
        floorSubBlockCount++;
    }
    // Grow the histogram so frame.prefix is a valid index before bumping it.
    if (blockCountByPrefixLen.length <= frame.prefix) {
        blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1 + frame.prefix);
    }
    blockCountByPrefixLen[frame.prefix]++;
    startBlockCount++;
    totalBlockSuffixBytes += frame.suffixesReader.length();
    totalBlockStatsBytes += frame.statsReader.length();
}
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/**
 * Parses a specific affix rule putting the result into the provided affix map.
 *
 * @param affixes Map where the result of the parsing will be put
 * @param header Header line of the affix rule
 * @param reader LineNumberReader to read the content of the rule from
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition
 *        regex pattern
 * @param seenPatterns map from condition -&gt; index of patterns, for deduplication
 * @param seenStrips map from strip string -&gt; ordinal, for deduplication
 * @throws IOException Can be thrown while reading the rule
 * @throws ParseException if a rule line has fewer than four elements
 */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
        String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
        throws IOException, ParseException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String args[] = header.split("\\s+");
    boolean crossProduct = args[2].equals("Y");
    // NOTE: intentional reference comparison — the caller passes one of two shared
    // pattern constants, so identity distinguishes suffix from prefix parsing.
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
    int numLines = Integer.parseInt(args[3]);
    // Each rule is encoded as 4 shorts (8 bytes, hence << 3): flag, stripOrd,
    // patternOrd (with crossProduct in the low bit), appendFlagsOrd.
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        String ruleArgs[] = line.split("\\s+");
        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                    reader.getLineNumber());
        }
        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        // "0" means "strip nothing".
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char appendFlags[] = null;
        // first: parse continuation classes out of affix (the part after '/')
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);
            if (aliasCount > 0) {
                // Flag part is an AF alias index; resolve it to the real flags.
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }
            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }
        // zero affix -> empty string
        if ("0".equals(affixArg)) {
            affixArg = "";
        }
        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue: unclosed character class
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }
        final String regex;
        if (".".equals(condition)) {
            regex = ".*"; // Zero condition is indicated by dot
        } else if (condition.equals(strip)) {
            regex = ".*"; // TODO: optimize this better:
            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
            // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }
        // deduplicate patterns: identical condition regexes share one automaton
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            // Pattern ordinals are stored as shorts below, so enforce the bound here.
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                    new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }
        // Deduplicate strip strings the same way.
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }
        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }
        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch.get());
        if (appendFlagsOrd < 0) {
            // already exists in our hash; convert the insertion code back to an ordinal
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException(
                    "Too many unique append flags, please report this to dev@lucene.apache.org");
        }
        // Serialize the rule as 4 shorts, matching the 8-byte stride reserved above.
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex (low bit)
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);
        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }
        if (isSuffix) {
            // Suffixes are indexed reversed so lookups can share a prefix-style trie.
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }
        List<Integer> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }
        list.add(currentAffix);
        currentAffix++;
    }
}
From source file:elhuyar.bilakit.Stemmer.java
License:Apache License
/**
 * Find the stem(s) of the provided word.
 *
 * <p>Depending on the word's case pattern, the union of stems for the exact,
 * title-cased, and lower-cased variants is returned, so that e.g. an
 * all-uppercase word still matches its dictionary form.
 *
 * @param word Word to find the stems for (may be replaced by a cleaned copy)
 * @param length number of valid chars in {@code word}
 * @return List of stems for the word
 */
public List<CharsRef> stem(char word[], int length) {
    if (dictionary.needsInputCleaning) {
        // Run the dictionary's input-cleaning conversion; the cleaned text is
        // copied into scratchBuffer, which then replaces the caller's array locally.
        scratchSegment.setLength(0);
        scratchSegment.append(word, 0, length);
        CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
        scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
        length = segment.length();
        segment.getChars(0, length, scratchBuffer, 0);
        word = scratchBuffer;
    }
    int caseType = caseOf(word, length);
    if (caseType == UPPER_CASE) {
        // upper: union exact, title, lower
        caseFoldTitle(word, length);
        caseFoldLower(titleBuffer, length);
        List<CharsRef> list = doStem(word, length, false);
        list.addAll(doStem(titleBuffer, length, true));
        list.addAll(doStem(lowerBuffer, length, true));
        return list;
    } else if (caseType == TITLE_CASE) {
        // title: union exact, lower
        caseFoldLower(word, length);
        List<CharsRef> list = doStem(word, length, false);
        list.addAll(doStem(lowerBuffer, length, true));
        return list;
    } else {
        // exact match only
        return doStem(word, length, false);
    }
}
From source file:elhuyar.bilakit.Stemmer.java
License:Apache License
/**
 * Writes the title-case variant of {@code word} into {@code titleBuffer}:
 * the first character keeps its original case, the rest are case-folded.
 */
private void caseFoldTitle(char word[], int length) {
    titleBuffer = ArrayUtil.grow(titleBuffer, length);
    System.arraycopy(word, 0, titleBuffer, 0, length);
    // Start at index 1: the leading character is left untouched for the title form.
    int pos = 1;
    while (pos < length) {
        titleBuffer[pos] = dictionary.caseFold(titleBuffer[pos]);
        pos++;
    }
}
From source file:elhuyar.bilakit.Stemmer.java
License:Apache License
/**
 * Writes the fully lower-case variant of the (already title-cased) {@code word}
 * into {@code lowerBuffer}.
 */
private void caseFoldLower(char word[], int length) {
    lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
    System.arraycopy(word, 0, lowerBuffer, 0, length);
    // Only the first character can still differ from the title-cased input; fold it.
    lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
}
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/**
 * Parses a specific affix rule putting the result into the provided affix map.
 *
 * @param affixes Map where the result of the parsing will be put
 * @param header Header line of the affix rule
 * @param reader LineNumberReader to read the content of the rule from
 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition
 *        regex pattern
 * @param seenPatterns map from condition -&gt; index of patterns, for deduplication
 * @param seenStrips map from strip string -&gt; ordinal, for deduplication
 * @throws IOException Can be thrown while reading the rule
 * @throws ParseException if a rule line has fewer than four elements
 */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader,
        String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips)
        throws IOException, ParseException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String args[] = header.split("\\s+");
    boolean crossProduct = args[2].equals("Y");
    // NOTE: intentional reference comparison — the caller passes one of two shared
    // pattern constants, so identity distinguishes suffix from prefix parsing.
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
    int numLines = Integer.parseInt(args[3]);
    // Each rule is encoded as 4 shorts (8 bytes, hence << 3): flag, stripOrd,
    // patternOrd (with crossProduct in the low bit), appendFlagsOrd.
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        String ruleArgs[] = line.split("\\s+");
        // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line,
                    reader.getLineNumber());
        }
        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        // "0" means "strip nothing".
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char appendFlags[] = null;
        // first: parse continuation classes out of affix (the part after '/')
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);
            if (aliasCount > 0) {
                // Flag part is an AF alias index; resolve it to the real flags.
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }
            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }
        // zero affix -> empty string
        if ("0".equals(affixArg)) {
            affixArg = "";
        }
        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue: unclosed character class
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }
        final String regex;
        if (".".equals(condition)) {
            regex = ".*"; // Zero condition is indicated by dot
        } else if (condition.equals(strip)) {
            regex = ".*"; // TODO: optimize this better:
            // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
            // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }
        // deduplicate patterns: identical condition regexes share one automaton
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            // Pattern ordinals are stored as shorts below, so enforce the bound here.
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(
                    new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }
        // Deduplicate strip strings the same way.
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException(
                        "Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }
        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }
        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch.get());
        if (appendFlagsOrd < 0) {
            // already exists in our hash; convert the insertion code back to an ordinal
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException(
                    "Too many unique append flags, please report this to dev@lucene.apache.org");
        }
        // Serialize the rule as 4 shorts, matching the 8-byte stride reserved above.
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex (low bit)
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);
        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }
        if (isSuffix) {
            // Suffixes are indexed reversed so lookups can share a prefix-style trie.
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }
        List<Integer> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }
        list.add(currentAffix);
        currentAffix++;
    }
}