List of usage examples for the org.apache.lucene.util.IntsRef constructor
public IntsRef()
From source file:org.elasticsearch.index.fielddata.plain.FSTBytesIndexFieldData.java
License:Apache License
/**
 * Eagerly loads FST-based bytes field data for a single segment.
 *
 * Builds an FST mapping each term's bytes to a positive ordinal while
 * simultaneously recording, via an OrdinalsBuilder, which documents contain
 * each term. Memory usage is reported to the circuit breaker after a
 * successful load (and for the empty-field fast path).
 *
 * @param context the segment reader context to load field data from
 * @return the loaded atomic field data; an empty instance if the field has no terms
 * @throws Exception if reading terms or building the FST/ordinals fails
 */
@Override public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
    AtomicReader reader = context.reader();
    Terms terms = reader.terms(getFieldNames().indexName());
    FSTBytesAtomicFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker());
    if (terms == null) {
        // Field is absent in this segment: return an empty placeholder, still
        // reported to the breaker for consistent accounting.
        data = FSTBytesAtomicFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.getMemorySizeInBytes());
        return data;
    }
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    org.apache.lucene.util.fst.Builder<Long> fstBuilder = new org.apache.lucene.util.fst.Builder<Long>(
            INPUT_TYPE.BYTE1, outputs);
    final IntsRef scratch = new IntsRef();
    final long numTerms;
    if (regex == null && frequency == null) {
        numTerms = terms.size();
    } else {
        // Terms will be filtered below, so the final count is unknown up front.
        numTerms = -1;
    }
    final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat(
            "acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
    OrdinalsBuilder builder = new OrdinalsBuilder(numTerms, reader.maxDoc(), acceptableTransientOverheadRatio);
    boolean success = false;
    try {
        // we don't store an ord 0 in the FST since we could have an empty string in there and FST don't support
        // empty strings twice. ie. them merge fails for long output.
        TermsEnum termsEnum = filter(terms, reader);
        DocsEnum docsEnum = null;
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            final long termOrd = builder.nextOrdinal();
            assert termOrd > 0; // ord 0 is reserved, see note above
            fstBuilder.add(Util.toIntsRef(term, scratch), (long) termOrd);
            // Record which documents contain this term; frequencies are not needed.
            docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
            for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                builder.addDoc(docId);
            }
        }
        FST<Long> fst = fstBuilder.finish();
        final Ordinals ordinals = builder.build(fieldDataType.getSettings());
        data = new FSTBytesAtomicFieldData(fst, ordinals);
        success = true;
        return data;
    } finally {
        // Only report memory once the structure was fully built; the builder is
        // closed regardless to release its transient resources.
        if (success) {
            estimator.afterLoad(null, data.getMemorySizeInBytes());
        }
        builder.close();
    }
}
From source file:org.meresco.lucene.search.MerescoTaxonomyFacetCounts.java
License:Open Source License
/**
 * Tallies facet ordinal counts for every matching document.
 *
 * For each segment, binds every configured OrdinalsReader to that segment,
 * then for each hit reads its ordinals and increments the corresponding
 * slots in {@code values}. Finally rolls counts up the taxonomy hierarchy.
 *
 * @param matchingDocs per-segment matching documents to count
 * @throws IOException if reading ordinals from the index fails
 */
private final void count(List<MatchingDocs> matchingDocs) throws IOException {
    final IntsRef ordinals = new IntsRef();
    final int readerCount = this.ordinalsReaders.size();
    final OrdinalsReader.OrdinalsSegmentReader[] segmentReaders = new OrdinalsReader.OrdinalsSegmentReader[readerCount];
    for (MatchingDocs hits : matchingDocs) {
        // Re-bind each ordinals reader to the current segment's context.
        for (int r = 0; r < readerCount; r++) {
            segmentReaders[r] = this.ordinalsReaders.get(r).getReader(hits.context);
        }
        DocIdSetIterator iterator = hits.bits.iterator();
        for (int docId = iterator.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = iterator.nextDoc()) {
            for (OrdinalsReader.OrdinalsSegmentReader segmentReader : segmentReaders) {
                segmentReader.get(docId, ordinals);
                for (int i = 0; i < ordinals.length; i++) {
                    values[ordinals.ints[ordinals.offset + i]]++;
                }
            }
        }
    }
    rollup();
}
From source file:stemmer.Dictionary.java
License:Apache License
private FST<IntsRef> affixFST(TreeMap<String, List<Character>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRef scratch = new IntsRef(); for (Map.Entry<String, List<Character>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Character> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Character c : entries) { output.ints[output.length++] = c; }//from ww w .j ava2 s .co m builder.add(scratch, output); } return builder.finish(); }
From source file:stemmer.Dictionary.java
License:Apache License
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException { Map<String, String> mappings = new TreeMap<>(); for (int i = 0; i < num; i++) { String line = reader.readLine(); String parts[] = line.split("\\s+"); if (parts.length != 3) { throw new ParseException("invalid syntax: " + line, reader.getLineNumber()); }//from w w w. j av a 2s . com if (mappings.put(parts[1], parts[2]) != null) { throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); } } Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRef scratchInts = new IntsRef(); for (Map.Entry<String, String> entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); builder.add(scratchInts, new CharsRef(entry.getValue())); } return builder.finish(); }
From source file:stemmer.Dictionary.java
License:Apache License
/**
 * Reads the dictionary file through the provided InputStreams, building up the words map.
 *
 * Entries are normalized (unescaped and optionally cleaned), written to a
 * temp file, externally sorted by the word part of each row (ignoring the
 * trailing flag section), then replayed in order so that all flag-set
 * ordinals of a given word can be added to {@code words} as one IntsRef.
 *
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @param words FST builder receiving each word's flag ordinals
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
        throws IOException {
    BytesRef flagsScratch = new BytesRef();
    IntsRef scratchInts = new IntsRef();
    StringBuilder sb = new StringBuilder();
    File unsorted = File.createTempFile("unsorted", "dat", tempDir);
    ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
    boolean success = false;
    try {
        // Pass 1: normalize every entry and spill it to the temp file.
        for (InputStream dictionary : dictionaries) {
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
            while ((line = lines.readLine()) != null) {
                line = unescapeEntry(line);
                if (needsInputCleaning) {
                    int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        // No flags: clean the whole line.
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        // Clean only the word part, then re-append the flag section untouched.
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        if (cleansed != sb) {
                            // cleanInput may return its input unchanged; copy it into sb.
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        success = true;
    } finally {
        // On failure, close without masking the original exception.
        if (success) {
            IOUtils.close(writer);
        } else {
            IOUtils.closeWhileHandlingException(writer);
        }
    }
    File sorted = File.createTempFile("sorted", "dat", tempDir);
    // Sort rows by the word part only (bytes before the last FLAG_SEPARATOR),
    // so all flag variants of the same word become adjacent.
    OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
        BytesRef scratch1 = new BytesRef();
        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            // View o1 truncated at its last FLAG_SEPARATOR (if any).
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;
            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }
            // Same truncated view for o2.
            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;
            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }
            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    sorter.sort(unsorted, sorted);
    unsorted.delete();
    // Pass 2: replay the sorted rows, grouping consecutive identical words.
    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    BytesRef scratchLine = new BytesRef();
    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    String currentEntry = null;
    IntsRef currentOrds = new IntsRef();
    String line;
    while (reader.read(scratchLine)) {
        line = scratchLine.utf8ToString();
        String entry;
        char wordForm[];
        int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
        if (flagSep == -1) {
            wordForm = NOFLAGS;
            entry = line;
        } else {
            // note, there can be comments (morph description) after a flag.
            // we should really look for any whitespace: currently just tab and space
            int end = line.indexOf('\t', flagSep);
            if (end == -1)
                end = line.length();
            int end2 = line.indexOf(' ', flagSep);
            if (end2 == -1)
                end2 = line.length();
            end = Math.min(end, end2);
            String flagPart = line.substring(flagSep + 1, end);
            if (aliasCount > 0) {
                // Numeric flag aliases (AF lines) are resolved to their real flags.
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }
            wordForm = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(wordForm);
            entry = line.substring(0, flagSep);
        }
        // cmp > 0 means a new word group starts; cmp == 0 extends the current group.
        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
        if (cmp < 0) {
            // The offline sort guarantees order; a violation indicates corruption.
            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
        } else {
            // Deduplicate flag sets through the flagLookup hash.
            encodeFlags(flagsScratch, wordForm);
            int ord = flagLookup.add(flagsScratch);
            if (ord < 0) {
                // already exists in our hash
                ord = (-ord) - 1;
            }
            // finalize current entry, and switch "current" if necessary
            if (cmp > 0 && currentEntry != null) {
                Util.toUTF32(currentEntry, scratchInts);
                words.add(scratchInts, currentOrds);
            }
            // swap current
            if (cmp > 0 || currentEntry == null) {
                currentEntry = entry;
                currentOrds = new IntsRef(); // must be this way
            }
            currentOrds.grow(currentOrds.length + 1);
            currentOrds.ints[currentOrds.length++] = ord;
        }
    }
    // finalize last entry
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts, currentOrds);
    reader.close();
    sorted.delete();
}