Usage examples for the org.apache.lucene.store.ByteArrayDataOutput constructor
public ByteArrayDataOutput(byte[] bytes, int offset, int len)
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * /*w w w.j a v a2 s .c o m*/ * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRefBuilder scratch = new BytesRefBuilder(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; // first: parse continuation classes out of affix int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // zero affix -> empty string if ("0".equals(affixArg)) { affixArg = ""; } String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... 
} else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but its a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Integer> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add(currentAffix); currentAffix++; } }
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * // www.j av a2 s .c o m * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRefBuilder scratch = new BytesRefBuilder(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; // first: parse continuation classes out of affix int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // zero affix -> empty string if ("0".equals(affixArg)) { affixArg = ""; } String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... 
} else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but it's a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Integer> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add(currentAffix); currentAffix++; } }
From source file:stemmer.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * //from w w w . j a va2s . co m * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Character>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRef scratch = new BytesRef(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // TODO: add test and fix zero-affix handling! String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... 
} else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but its a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Character> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add((char) currentAffix); currentAffix++; } }