List of usage examples for org.apache.lucene.store ByteArrayDataOutput getPosition
public int getPosition()
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * /*from w w w . j ava 2 s .c o m*/ * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRefBuilder scratch = new BytesRefBuilder(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; // first: parse continuation classes out of affix int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // zero affix -> empty string if ("0".equals(affixArg)) { affixArg = ""; } String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... } else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but its a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Integer> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add(currentAffix); currentAffix++; } }
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * //w w w.ja v a 2 s .co m * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRefBuilder scratch = new BytesRefBuilder(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; // first: parse continuation classes out of affix int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // zero affix -> empty string if ("0".equals(affixArg)) { affixArg = ""; } String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... } else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but it's a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Integer> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add(currentAffix); currentAffix++; } }
From source file:org.elasticsearch.common.util.ByteUtilsTests.java
License:Apache License
public void testVLong() throws IOException { final long[] data = new long[atLeast(1000)]; for (int i = 0; i < data.length; ++i) { switch (randomInt(4)) { case 0:/*from w w w .j a v a 2s. co m*/ data[i] = 0; break; case 1: data[i] = Long.MAX_VALUE; break; case 2: data[i] = Long.MIN_VALUE; break; case 3: data[i] = randomInt(1 << randomIntBetween(2, 30)); break; case 4: data[i] = randomLong(); break; default: throw new AssertionError(); } } final byte[] encoded = new byte[ByteUtils.MAX_BYTES_VLONG * data.length]; ByteArrayDataOutput out = new ByteArrayDataOutput(encoded); for (int i = 0; i < data.length; ++i) { final int pos = out.getPosition(); ByteUtils.writeVLong(out, data[i]); if (data[i] < 0) { assertEquals(ByteUtils.MAX_BYTES_VLONG, out.getPosition() - pos); } } final ByteArrayDataInput in = new ByteArrayDataInput(encoded); for (int i = 0; i < data.length; ++i) { assertEquals(data[i], ByteUtils.readVLong(in)); } }
From source file:org.elasticsearch.index.mapper.BinaryRangeUtil.java
License:Apache License
static BytesRef encodeLongRanges(Set<RangeFieldMapper.Range> ranges) throws IOException { List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges); Comparator<RangeFieldMapper.Range> fromComparator = Comparator .comparingLong(range -> ((Number) range.from).longValue()); Comparator<RangeFieldMapper.Range> toComparator = Comparator .comparingLong(range -> ((Number) range.to).longValue()); sortedRanges.sort(fromComparator.thenComparing(toComparator)); final byte[] encoded = new byte[5 + (9 * 2) * sortedRanges.size()]; ByteArrayDataOutput out = new ByteArrayDataOutput(encoded); out.writeVInt(sortedRanges.size());//from ww w .ja v a2 s . co m for (RangeFieldMapper.Range range : sortedRanges) { byte[] encodedFrom = encodeLong(((Number) range.from).longValue()); out.writeBytes(encodedFrom, encodedFrom.length); byte[] encodedTo = encodeLong(((Number) range.to).longValue()); out.writeBytes(encodedTo, encodedTo.length); } return new BytesRef(encoded, 0, out.getPosition()); }
From source file:org.elasticsearch.index.mapper.BinaryRangeUtil.java
License:Apache License
static BytesRef encodeDoubleRanges(Set<RangeFieldMapper.Range> ranges) throws IOException { List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges); Comparator<RangeFieldMapper.Range> fromComparator = Comparator .comparingDouble(range -> ((Number) range.from).doubleValue()); Comparator<RangeFieldMapper.Range> toComparator = Comparator .comparingDouble(range -> ((Number) range.to).doubleValue()); sortedRanges.sort(fromComparator.thenComparing(toComparator)); final byte[] encoded = new byte[5 + (8 * 2) * sortedRanges.size()]; ByteArrayDataOutput out = new ByteArrayDataOutput(encoded); out.writeVInt(sortedRanges.size());//from w ww .j a v a 2 s. c o m for (RangeFieldMapper.Range range : sortedRanges) { byte[] encodedFrom = encodeDouble(((Number) range.from).doubleValue()); out.writeBytes(encodedFrom, encodedFrom.length); byte[] encodedTo = encodeDouble(((Number) range.to).doubleValue()); out.writeBytes(encodedTo, encodedTo.length); } return new BytesRef(encoded, 0, out.getPosition()); }
From source file:org.elasticsearch.index.mapper.BinaryRangeUtil.java
License:Apache License
static BytesRef encodeFloatRanges(Set<RangeFieldMapper.Range> ranges) throws IOException { List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges); Comparator<RangeFieldMapper.Range> fromComparator = Comparator .comparingDouble(range -> ((Number) range.from).floatValue()); Comparator<RangeFieldMapper.Range> toComparator = Comparator .comparingDouble(range -> ((Number) range.to).floatValue()); sortedRanges.sort(fromComparator.thenComparing(toComparator)); final byte[] encoded = new byte[5 + (4 * 2) * sortedRanges.size()]; ByteArrayDataOutput out = new ByteArrayDataOutput(encoded); out.writeVInt(sortedRanges.size());// w ww.j a v a 2 s . c o m for (RangeFieldMapper.Range range : sortedRanges) { byte[] encodedFrom = encodeFloat(((Number) range.from).floatValue()); out.writeBytes(encodedFrom, encodedFrom.length); byte[] encodedTo = encodeFloat(((Number) range.to).floatValue()); out.writeBytes(encodedTo, encodedTo.length); } return new BytesRef(encoded, 0, out.getPosition()); }
From source file:stemmer.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * //w w w .ja va 2 s. c o m * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Character>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRef scratch = new BytesRef(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // TODO: add test and fix zero-affix handling! String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... } else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but its a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Character> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add((char) currentAffix); currentAffix++; } }