List of usage examples for org.apache.lucene.util BytesRefBuilder get
public BytesRef get()
From source file:com.github.flaxsearch.util.BytesRefUtils.java
License:Apache License
private static Function<String, BytesRef> getDecoder(String type) { switch (type.toLowerCase(Locale.ROOT)) { case "base64": return s -> new BytesRef(Base64.getUrlDecoder().decode(s.getBytes(Charset.defaultCharset()))); case "utf8": return BytesRef::new; case "int": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(s), 0, builder); return builder.get(); };//from w w w.ja va2s . c o m case "long": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.longToPrefixCoded(Long.parseLong(s), 0, builder); return builder.get(); }; case "float": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, builder); return builder.get(); }; case "double": return s -> { BytesRefBuilder builder = new BytesRefBuilder(); LegacyNumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, builder); return builder.get(); }; default: throw new IllegalArgumentException("Unknown decoder type: " + type); } }
From source file:com.google.gerrit.lucene.QueryBuilder.java
License:Apache License
/**
 * Creates a {@link Term} on field {@code name} whose bytes are the prefix-coded
 * form of {@code value}, encoded with a shift of 0.
 *
 * @param name field name of the term
 * @param value integer value to encode
 * @return the term carrying the encoded value
 */
private static Term intTerm(String name, int value) {
    final int shift = 0;
    final BytesRefBuilder prefixCoded = new BytesRefBuilder();
    NumericUtils.intToPrefixCodedBytes(value, shift, prefixCoded);
    return new Term(name, prefixCoded.get());
}
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateFST(SortedMap<String, Double> weights) throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String, Double> entry : weights.entrySet()) { scratchBytes.copyChars(entry.getKey()); fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry.getValue().longValue()); }// w w w. jav a 2 s . c om fst = fstBuilder.finish(); }
From source file:com.querydsl.lucene5.LuceneSerializer.java
License:Apache License
/**
 * Encodes a number as prefix-coded bytes (shift 0), choosing the encoding by runtime type:
 * <ul>
 *   <li>{@code Integer}, {@code Byte}, {@code Short} - encoded from {@code intValue()}</li>
 *   <li>{@code Double}, {@code BigDecimal} - {@code doubleValue()} mapped to a sortable long</li>
 *   <li>{@code Long}, {@code BigInteger} - encoded from {@code longValue()}</li>
 *   <li>{@code Float} - {@code floatValue()} mapped to a sortable int</li>
 * </ul>
 *
 * @param number value to encode
 * @return the encoded bytes
 * @throws IllegalArgumentException if the runtime type of {@code number} is none of the above
 */
private BytesRef convertNumber(Number number) {
    final BytesRefBuilder encoded = new BytesRefBuilder();

    if (number instanceof Integer || number instanceof Byte || number instanceof Short) {
        NumericUtils.intToPrefixCoded(number.intValue(), 0, encoded);
    } else if (number instanceof Double || number instanceof BigDecimal) {
        final long sortableBits = NumericUtils.doubleToSortableLong(number.doubleValue());
        NumericUtils.longToPrefixCoded(sortableBits, 0, encoded);
    } else if (number instanceof Long || number instanceof BigInteger) {
        NumericUtils.longToPrefixCoded(number.longValue(), 0, encoded);
    } else if (number instanceof Float) {
        final int sortableBits = NumericUtils.floatToSortableInt(number.floatValue());
        NumericUtils.intToPrefixCoded(sortableBits, 0, encoded);
    } else {
        throw new IllegalArgumentException("Unsupported numeric type " + number.getClass().getName());
    }

    return encoded.get();
}
From source file:com.stratio.cassandra.lucene.key.TokenMapper.java
License:Apache License
/** * Returns the {@link BytesRef} indexing value of the specified Murmur3 partitioning {@link Token}. * * @param token a Murmur3 token/* w ww .ja v a 2s .co m*/ * @return the {@code token}'s indexing value */ private static BytesRef bytesRef(Token token) { Long value = value(token); BytesRefBuilder bytesRef = new BytesRefBuilder(); NumericUtils.longToPrefixCoded(value, 0, bytesRef); return bytesRef.get(); }
From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java
License:Apache License
@Override public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim) throws IOException { TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val); try {// ww w.j av a 2s. co m ts.reset(); CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class); String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE; while (ts.incrementToken()) { if (matchType.equals(typeAtt.type())) { BytesRefBuilder ret = new BytesRefBuilder(); ret.copyChars(termAtt.toString()); if (!strict || appendExtraDelim) { ret.append(delimBytes, 0, delimBytes.length); } return ret.get(); } } return new BytesRef(BytesRef.EMPTY_BYTES); } finally { ts.close(); } }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * // ww w . j a v a2s .c o m * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRefBuilder scratch = new BytesRefBuilder(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; // first: parse continuation classes out of affix int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // zero affix -> empty string if ("0".equals(affixArg)) { affixArg = ""; } String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... 
} else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but its a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Integer> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add(currentAffix); currentAffix++; } }
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/** * Reads the dictionary file through the provided InputStreams, building up the words map *//from w ww . ja va2 s .c o m * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); StringBuilder sb = new StringBuilder(); File unsorted = File.createTempFile("unsorted", "dat", tempDir); ByteSequencesWriter writer = new ByteSequencesWriter(unsorted); boolean success = false; try { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); String line = lines.readLine(); // first line is number of entries (approximately, sometimes) while ((line = lines.readLine()) != null) { // wild and unpredictable code comment rules if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') { continue; } line = unescapeEntry(line); // if we havent seen any stem exceptions, try to parse one if (hasStemExceptions == false) { int morphStart = line.indexOf(MORPH_SEPARATOR); if (morphStart >= 0 && morphStart < line.length()) { hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null; } } if (needsInputCleaning) { int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { flagSep = line.indexOf(MORPH_SEPARATOR); } if (flagSep == -1) { CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); sb.append(cleansed); } sb.append(line.substring(flagSep)); 
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); } } else { writer.write(line.getBytes(StandardCharsets.UTF_8)); } } } success = true; } finally { if (success) { IOUtils.close(writer); } else { IOUtils.closeWhileHandlingException(writer); } } File sorted = File.createTempFile("sorted", "dat", tempDir); OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() { BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); @Override public int compare(BytesRef o1, BytesRef o2) { scratch1.bytes = o1.bytes; scratch1.offset = o1.offset; scratch1.length = o1.length; for (int i = scratch1.length - 1; i >= 0; i--) { if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) { scratch1.length = i; break; } } scratch2.bytes = o2.bytes; scratch2.offset = o2.offset; scratch2.length = o2.length; for (int i = scratch2.length - 1; i >= 0; i--) { if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) { scratch2.length = i; break; } } int cmp = scratch1.compareTo(scratch2); if (cmp == 0) { // tie break on whole row return o1.compareTo(o2); } else { return cmp; } } }); sorter.sort(unsorted, sorted); unsorted.delete(); ByteSequencesReader reader = new ByteSequencesReader(sorted); BytesRefBuilder scratchLine = new BytesRefBuilder(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... 
but they must be parsed differently String currentEntry = null; IntsRefBuilder currentOrds = new IntsRefBuilder(); String line; while (reader.read(scratchLine)) { line = scratchLine.get().utf8ToString(); String entry; char wordForm[]; int end; int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; end = line.indexOf(MORPH_SEPARATOR); entry = line.substring(0, end); } else { end = line.indexOf(MORPH_SEPARATOR); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(wordForm); entry = line.substring(0, flagSep); } // we possibly have morphological data int stemExceptionID = 0; if (hasStemExceptions && end + 1 < line.length()) { String stemException = parseStemException(line.substring(end + 1)); if (stemException != null) { if (stemExceptionCount == stemExceptions.length) { int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); stemExceptions = Arrays.copyOf(stemExceptions, newSize); } stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form stemExceptions[stemExceptionCount++] = stemException; } } int cmp = currentEntry == null ? 
1 : entry.compareTo(currentEntry); if (cmp < 0) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; } // finalize current entry, and switch "current" if necessary if (cmp > 0 && currentEntry != null) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); } // swap current if (cmp > 0 || currentEntry == null) { currentEntry = entry; currentOrds = new IntsRefBuilder(); // must be this way } if (hasStemExceptions) { currentOrds.append(ord); currentOrds.append(stemExceptionID); } else { currentOrds.append(ord); } } } // finalize last entry Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); reader.close(); sorted.delete(); }
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/** * Parses a specific affix rule putting the result into the provided affix map * //w w w . j a va 2 s. c o m * @param affixes Map where the result of the parsing will be put * @param header Header line of the affix rule * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException { BytesRefBuilder scratch = new BytesRefBuilder(); StringBuilder sb = new StringBuilder(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = Integer.parseInt(args[3]); affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); for (int i = 0; i < numLines; i++) { assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); // from the manpage: PFX flag stripping prefix [condition [morphological_fields...]] // condition is optional if (ruleArgs.length < 4) { throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber()); } char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]; String affixArg = ruleArgs[3]; char appendFlags[] = null; // first: parse continuation classes out of affix int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); affixArg = affixArg.substring(0, flagSep); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); twoStageAffix = true; } // zero affix -> empty string if ("0".equals(affixArg)) { affixArg = ""; } String condition = ruleArgs.length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue if (condition.startsWith("[") && condition.indexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.indexOf('-') >= 0) { condition = escapeDash(condition); } final String regex; if (".".equals(condition)) { regex = ".*"; // Zero condition is indicated by dot } else if (condition.equals(strip)) { regex = ".*"; // TODO: optimize this better: // if we remove 'strip' from condition, we don't have to append 'strip' to check it...! // but this is complicated... 
} else { regex = String.format(Locale.ROOT, conditionPattern, condition); } // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); if (patternIndex > Short.MAX_VALUE) { throw new UnsupportedOperationException( "Too many patterns, please report this to dev@lucene.apache.org"); } seenPatterns.put(regex, patternIndex); CharacterRunAutomaton pattern = new CharacterRunAutomaton( new RegExp(regex, RegExp.NONE).toAutomaton()); patterns.add(pattern); } Integer stripOrd = seenStrips.get(strip); if (stripOrd == null) { stripOrd = seenStrips.size(); seenStrips.put(strip, stripOrd); if (stripOrd > Character.MAX_VALUE) { throw new UnsupportedOperationException( "Too many unique strips, please report this to dev@lucene.apache.org"); } } if (appendFlags == null) { appendFlags = NOFLAGS; } encodeFlags(scratch, appendFlags); int appendFlagsOrd = flagLookup.add(scratch.get()); if (appendFlagsOrd < 0) { // already exists in our hash appendFlagsOrd = (-appendFlagsOrd) - 1; } else if (appendFlagsOrd > Short.MAX_VALUE) { // this limit is probably flexible, but it's a good sanity check too throw new UnsupportedOperationException( "Too many unique append flags, please report this to dev@lucene.apache.org"); } affixWriter.writeShort((short) flag); affixWriter.writeShort((short) stripOrd.intValue()); // encode crossProduct into patternIndex int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0); affixWriter.writeShort((short) patternOrd); affixWriter.writeShort((short) appendFlagsOrd); if (needsInputCleaning) { CharSequence cleaned = cleanInput(affixArg, sb); affixArg = cleaned.toString(); } if (isSuffix) { affixArg = new StringBuilder(affixArg).reverse().toString(); } List<Integer> list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } list.add(currentAffix); currentAffix++; } }
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/** * Reads the dictionary file through the provided InputStreams, building up the words map */* w ww . j a v a 2 s. co m*/ * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException { BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); StringBuilder sb = new StringBuilder(); Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat"); try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { for (InputStream dictionary : dictionaries) { BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); String line = lines.readLine(); // first line is number of entries (approximately, sometimes) while ((line = lines.readLine()) != null) { // wild and unpredictable code comment rules if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') { continue; } line = unescapeEntry(line); // if we havent seen any stem exceptions, try to parse one if (hasStemExceptions == false) { int morphStart = line.indexOf(MORPH_SEPARATOR); if (morphStart >= 0 && morphStart < line.length()) { hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null; } } if (needsInputCleaning) { int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { flagSep = line.indexOf(MORPH_SEPARATOR); } if (flagSep == -1) { CharSequence cleansed = cleanInput(line, sb); writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8)); } else { String text = line.substring(0, flagSep); CharSequence cleansed = cleanInput(text, sb); if (cleansed != sb) { sb.setLength(0); sb.append(cleansed); } sb.append(line.substring(flagSep)); 
writer.write(sb.toString().getBytes(StandardCharsets.UTF_8)); } } else { writer.write(line.getBytes(StandardCharsets.UTF_8)); } } } } Path sorted = Files.createTempFile(tempDir, "sorted", "dat"); OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() { BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); @Override public int compare(BytesRef o1, BytesRef o2) { scratch1.bytes = o1.bytes; scratch1.offset = o1.offset; scratch1.length = o1.length; for (int i = scratch1.length - 1; i >= 0; i--) { if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) { scratch1.length = i; break; } } scratch2.bytes = o2.bytes; scratch2.offset = o2.offset; scratch2.length = o2.length; for (int i = scratch2.length - 1; i >= 0; i--) { if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) { scratch2.length = i; break; } } int cmp = scratch1.compareTo(scratch2); if (cmp == 0) { // tie break on whole row return o1.compareTo(o2); } else { return cmp; } } }); boolean success = false; try { sorter.sort(unsorted, sorted); success = true; } finally { if (success) { Files.delete(unsorted); } else { IOUtils.deleteFilesIgnoringExceptions(unsorted); } } boolean success2 = false; ByteSequencesReader reader = new ByteSequencesReader(sorted); try { BytesRefBuilder scratchLine = new BytesRefBuilder(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... 
but they must be parsed differently String currentEntry = null; IntsRefBuilder currentOrds = new IntsRefBuilder(); String line; while (reader.read(scratchLine)) { line = scratchLine.get().utf8ToString(); String entry; char wordForm[]; int end; int flagSep = line.indexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; end = line.indexOf(MORPH_SEPARATOR); entry = line.substring(0, end); } else { end = line.indexOf(MORPH_SEPARATOR); String flagPart = line.substring(flagSep + 1, end); if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(wordForm); entry = line.substring(0, flagSep); } // we possibly have morphological data int stemExceptionID = 0; if (hasStemExceptions && end + 1 < line.length()) { String stemException = parseStemException(line.substring(end + 1)); if (stemException != null) { if (stemExceptionCount == stemExceptions.length) { int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); stemExceptions = Arrays.copyOf(stemExceptions, newSize); } stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form stemExceptions[stemExceptionCount++] = stemException; } } int cmp = currentEntry == null ? 
1 : entry.compareTo(currentEntry); if (cmp < 0) { throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) { // already exists in our hash ord = (-ord) - 1; } // finalize current entry, and switch "current" if necessary if (cmp > 0 && currentEntry != null) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); } // swap current if (cmp > 0 || currentEntry == null) { currentEntry = entry; currentOrds = new IntsRefBuilder(); // must be this way } if (hasStemExceptions) { currentOrds.append(ord); currentOrds.append(stemExceptionID); } else { currentOrds.append(ord); } } } // finalize last entry Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); success2 = true; } finally { IOUtils.closeWhileHandlingException(reader); if (success2) { Files.delete(sorted); } else { IOUtils.deleteFilesIgnoringExceptions(sorted); } } }