List of usage examples for org.apache.lucene.util BytesRef compareTo
@Override public int compareTo(BytesRef other)
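BytesRef.compareTo orders two byte sequences lexicographically, comparing each byte as an unsigned value, which matches the order of terms in a Lucene index. As a quick orientation before the project examples below, here is a minimal self-contained sketch; the class name and sample strings are illustrative only:

import org.apache.lucene.util.BytesRef;

public class BytesRefCompareDemo {
  public static void main(String[] args) {
    BytesRef a = new BytesRef("apple");
    BytesRef b = new BytesRef("banana");

    // Negative, zero, or positive, like any Comparable:
    System.out.println(a.compareTo(b) < 0);  // true: "apple" sorts before "banana"
    System.out.println(a.compareTo(a) == 0); // true: equal contents compare as 0
    System.out.println(b.compareTo(a) > 0);  // true: reversing the operands flips the sign
  }
}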
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
@Override
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();

  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()
  iwc.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      PostingsFormat p = getCodec().postingsFormat();
      if (p instanceof PerFieldPostingsFormat) {
        p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      if (p instanceof RocanaPerFieldPostingsFormat) {
        p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      final PostingsFormat defaultPostingsFormat = p;

      final Thread mainThread = Thread.currentThread();

      if (field.equals("body")) {
        // A PF that counts up some stats and then in
        // the end we verify the stats match what the
        // final IndexReader says, just to exercise the
        // new freedom of iterating the postings more
        // than once at flush/merge:
        return new PostingsFormat(defaultPostingsFormat.getName()) {
          @Override
          public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
            final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

            return new FieldsConsumer() {
              @Override
              public void write(Fields fields) throws IOException {
                fieldsConsumer.write(fields);

                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                // We only use one thread for flushing
                // in this test:
                assert isMerge || Thread.currentThread() == mainThread;

                // We iterate the provided TermsEnum
                // twice, so we exercise this new freedom
                // with the inverted API; if
                // addOnSecondPass is true, we add up
                // term stats on the 2nd iteration:
                boolean addOnSecondPass = random().nextBoolean();

                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                // Gather our own stats:
                Terms terms = fields.terms("body");
                assert terms != null;

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docs = null;
                while (termsEnum.next() != null) {
                  BytesRef term = termsEnum.term();
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }

                  String termString = term.utf8ToString();

                  // During merge we should only see terms
                  // we had already seen during a
                  // previous flush:
                  assertTrue(isMerge == false || termFreqs.containsKey(termString));

                  if (isMerge == false) {
                    if (addOnSecondPass == false) {
                      TermFreqs tf = termFreqs.get(termString);
                      if (tf == null) {
                        tf = new TermFreqs();
                        termFreqs.put(termString, tf);
                      }
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    } else if (termFreqs.containsKey(termString) == false) {
                      // Add placeholder (2nd pass will
                      // set its counts):
                      termFreqs.put(termString, new TermFreqs());
                    }
                  }
                }

                // Also test seeking the TermsEnum:
                for (String term : termFreqs.keySet()) {
                  if (termsEnum.seekExact(new BytesRef(term))) {
                    // TODO: also sometimes ask for payloads/offsets?
                    boolean noPositions = random().nextBoolean();
                    if (noPositions) {
                      docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                    } else {
                      docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                    }

                    int docFreq = 0;
                    long totalTermFreq = 0;
                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                      docFreq++;
                      totalTermFreq += docs.freq();
                      int limit = TestUtil.nextInt(random(), 1, docs.freq());
                      if (!noPositions) {
                        for (int i = 0; i < limit; i++) {
                          docs.nextPosition();
                        }
                      }
                    }

                    if (isMerge == false && addOnSecondPass) {
                      TermFreqs tf = termFreqs.get(term);
                      assert tf != null;
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    }

                    //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                  }
                }

                // Also test seekCeil
                for (int iter = 0; iter < 10; iter++) {
                  BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                  SeekStatus status = termsEnum.seekCeil(term);
                  if (status == SeekStatus.NOT_FOUND) {
                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                  }
                }
              }

              @Override
              public void close() throws IOException {
                fieldsConsumer.close();
              }
            };
          }

          @Override
          public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
            return defaultPostingsFormat.fieldsProducer(state);
          }
        };
      } else {
        return defaultPostingsFormat;
      }
    }
  });

  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    w.addDocument(doc);
    bytesIndexed += RamUsageTester.sizeOf(doc);
  }

  IndexReader r = w.getReader();
  w.close();

  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);

  r.close();
  dir.close();
}
From source file:com.rocana.lucene.codec.v1.RocanaSegmentTermsEnum.java
License:Apache License
@Override
public void seekExact(BytesRef target, TermState otherState) {
  // if (DEBUG) {
  //   System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + target.utf8ToString() + " " + target + " state=" + otherState);
  // }
  assert clearEOF();
  if (target.compareTo(term.get()) != 0 || !termExists) {
    assert otherState != null && otherState instanceof BlockTermState;
    currentFrame = staticFrame;
    currentFrame.state.copyFrom(otherState);
    term.copyBytes(target);
    currentFrame.metaDataUpto = currentFrame.getTermBlockOrd();
    assert currentFrame.metaDataUpto > 0;
    validIndexPrefix = 0;
  } else {
    // if (DEBUG) {
    //   System.out.println("  skip seek: already on target state=" + currentFrame.state);
    // }
  }
}
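In this example compareTo serves as an equality check: when the enum is already positioned on the target term (and the term exists), the whole seek is skipped.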
From source file:elhuyar.bilakit.Dictionary.java
License:Apache License
/**
 * Reads the dictionary file through the provided InputStreams, building up the words map
 *
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
  BytesRefBuilder flagsScratch = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  StringBuilder sb = new StringBuilder();

  File unsorted = File.createTempFile("unsorted", "dat", tempDir);
  ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
  boolean success = false;
  try {
    for (InputStream dictionary : dictionaries) {
      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
      String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
      while ((line = lines.readLine()) != null) {
        // wild and unpredictable code comment rules
        if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
          continue;
        }
        line = unescapeEntry(line);
        // if we haven't seen any stem exceptions, try to parse one
        if (hasStemExceptions == false) {
          int morphStart = line.indexOf(MORPH_SEPARATOR);
          if (morphStart >= 0 && morphStart < line.length()) {
            hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
          }
        }
        if (needsInputCleaning) {
          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
            flagSep = line.indexOf(MORPH_SEPARATOR);
          }
          if (flagSep == -1) {
            CharSequence cleansed = cleanInput(line, sb);
            writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
          } else {
            String text = line.substring(0, flagSep);
            CharSequence cleansed = cleanInput(text, sb);
            if (cleansed != sb) {
              sb.setLength(0);
              sb.append(cleansed);
            }
            sb.append(line.substring(flagSep));
            writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
          }
        } else {
          writer.write(line.getBytes(StandardCharsets.UTF_8));
        }
      }
    }
    success = true;
  } finally {
    if (success) {
      IOUtils.close(writer);
    } else {
      IOUtils.closeWhileHandlingException(writer);
    }
  }

  File sorted = File.createTempFile("sorted", "dat", tempDir);

  OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
    BytesRef scratch1 = new BytesRef();
    BytesRef scratch2 = new BytesRef();

    @Override
    public int compare(BytesRef o1, BytesRef o2) {
      scratch1.bytes = o1.bytes;
      scratch1.offset = o1.offset;
      scratch1.length = o1.length;
      for (int i = scratch1.length - 1; i >= 0; i--) {
        if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
            || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
          scratch1.length = i;
          break;
        }
      }

      scratch2.bytes = o2.bytes;
      scratch2.offset = o2.offset;
      scratch2.length = o2.length;
      for (int i = scratch2.length - 1; i >= 0; i--) {
        if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
            || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
          scratch2.length = i;
          break;
        }
      }

      int cmp = scratch1.compareTo(scratch2);
      if (cmp == 0) {
        // tie break on whole row
        return o1.compareTo(o2);
      } else {
        return cmp;
      }
    }
  });
  sorter.sort(unsorted, sorted);
  unsorted.delete();

  ByteSequencesReader reader = new ByteSequencesReader(sorted);
  BytesRefBuilder scratchLine = new BytesRefBuilder();

  // TODO: the flags themselves can be double-chars (long) or also numeric;
  // either way the trick is to encode them as char... but they must be parsed differently

  String currentEntry = null;
  IntsRefBuilder currentOrds = new IntsRefBuilder();

  String line;
  while (reader.read(scratchLine)) {
    line = scratchLine.get().utf8ToString();
    String entry;
    char wordForm[];
    int end;

    int flagSep = line.indexOf(FLAG_SEPARATOR);
    if (flagSep == -1) {
      wordForm = NOFLAGS;
      end = line.indexOf(MORPH_SEPARATOR);
      entry = line.substring(0, end);
    } else {
      end = line.indexOf(MORPH_SEPARATOR);
      String flagPart = line.substring(flagSep + 1, end);
      if (aliasCount > 0) {
        flagPart = getAliasValue(Integer.parseInt(flagPart));
      }
      wordForm = flagParsingStrategy.parseFlags(flagPart);
      Arrays.sort(wordForm);
      entry = line.substring(0, flagSep);
    }

    // we possibly have morphological data
    int stemExceptionID = 0;
    if (hasStemExceptions && end + 1 < line.length()) {
      String stemException = parseStemException(line.substring(end + 1));
      if (stemException != null) {
        if (stemExceptionCount == stemExceptions.length) {
          int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
          stemExceptions = Arrays.copyOf(stemExceptions, newSize);
        }
        stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
        stemExceptions[stemExceptionCount++] = stemException;
      }
    }

    int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
    if (cmp < 0) {
      throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
    } else {
      encodeFlags(flagsScratch, wordForm);
      int ord = flagLookup.add(flagsScratch.get());
      if (ord < 0) {
        // already exists in our hash
        ord = (-ord) - 1;
      }
      // finalize current entry, and switch "current" if necessary
      if (cmp > 0 && currentEntry != null) {
        Util.toUTF32(currentEntry, scratchInts);
        words.add(scratchInts.get(), currentOrds.get());
      }
      // swap current
      if (cmp > 0 || currentEntry == null) {
        currentEntry = entry;
        currentOrds = new IntsRefBuilder(); // must be this way
      }
      if (hasStemExceptions) {
        currentOrds.append(ord);
        currentOrds.append(stemExceptionID);
      } else {
        currentOrds.append(ord);
      }
    }
  }

  // finalize last entry
  Util.toUTF32(currentEntry, scratchInts);
  words.add(scratchInts.get(), currentOrds.get());

  reader.close();
  sorted.delete();
}
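Note the comparator design in the OfflineSorter above, which the next example shares: rows are compared on the key with the trailing flag/morphology part stripped off, and compareTo on the whole row is used only as a tie-break so the sort stays total and deterministic.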
From source file:hunspell_stemmer.Dictionary.java
License:Apache License
/**
 * Reads the dictionary file through the provided InputStreams, building up the words map
 *
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
  BytesRefBuilder flagsScratch = new BytesRefBuilder();
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  StringBuilder sb = new StringBuilder();

  Path unsorted = Files.createTempFile(tempDir, "unsorted", "dat");
  try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
    for (InputStream dictionary : dictionaries) {
      BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
      String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
      while ((line = lines.readLine()) != null) {
        // wild and unpredictable code comment rules
        if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
          continue;
        }
        line = unescapeEntry(line);
        // if we haven't seen any stem exceptions, try to parse one
        if (hasStemExceptions == false) {
          int morphStart = line.indexOf(MORPH_SEPARATOR);
          if (morphStart >= 0 && morphStart < line.length()) {
            hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
          }
        }
        if (needsInputCleaning) {
          int flagSep = line.indexOf(FLAG_SEPARATOR);
          if (flagSep == -1) {
            flagSep = line.indexOf(MORPH_SEPARATOR);
          }
          if (flagSep == -1) {
            CharSequence cleansed = cleanInput(line, sb);
            writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
          } else {
            String text = line.substring(0, flagSep);
            CharSequence cleansed = cleanInput(text, sb);
            if (cleansed != sb) {
              sb.setLength(0);
              sb.append(cleansed);
            }
            sb.append(line.substring(flagSep));
            writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
          }
        } else {
          writer.write(line.getBytes(StandardCharsets.UTF_8));
        }
      }
    }
  }

  Path sorted = Files.createTempFile(tempDir, "sorted", "dat");

  OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
    BytesRef scratch1 = new BytesRef();
    BytesRef scratch2 = new BytesRef();

    @Override
    public int compare(BytesRef o1, BytesRef o2) {
      scratch1.bytes = o1.bytes;
      scratch1.offset = o1.offset;
      scratch1.length = o1.length;
      for (int i = scratch1.length - 1; i >= 0; i--) {
        if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
            || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
          scratch1.length = i;
          break;
        }
      }

      scratch2.bytes = o2.bytes;
      scratch2.offset = o2.offset;
      scratch2.length = o2.length;
      for (int i = scratch2.length - 1; i >= 0; i--) {
        if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
            || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
          scratch2.length = i;
          break;
        }
      }

      int cmp = scratch1.compareTo(scratch2);
      if (cmp == 0) {
        // tie break on whole row
        return o1.compareTo(o2);
      } else {
        return cmp;
      }
    }
  });

  boolean success = false;
  try {
    sorter.sort(unsorted, sorted);
    success = true;
  } finally {
    if (success) {
      Files.delete(unsorted);
    } else {
      IOUtils.deleteFilesIgnoringExceptions(unsorted);
    }
  }

  boolean success2 = false;
  ByteSequencesReader reader = new ByteSequencesReader(sorted);
  try {
    BytesRefBuilder scratchLine = new BytesRefBuilder();

    // TODO: the flags themselves can be double-chars (long) or also numeric;
    // either way the trick is to encode them as char... but they must be parsed differently

    String currentEntry = null;
    IntsRefBuilder currentOrds = new IntsRefBuilder();

    String line;
    while (reader.read(scratchLine)) {
      line = scratchLine.get().utf8ToString();
      String entry;
      char wordForm[];
      int end;

      int flagSep = line.indexOf(FLAG_SEPARATOR);
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        end = line.indexOf(MORPH_SEPARATOR);
        entry = line.substring(0, end);
      } else {
        end = line.indexOf(MORPH_SEPARATOR);
        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }
        wordForm = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(wordForm);
        entry = line.substring(0, flagSep);
      }

      // we possibly have morphological data
      int stemExceptionID = 0;
      if (hasStemExceptions && end + 1 < line.length()) {
        String stemException = parseStemException(line.substring(end + 1));
        if (stemException != null) {
          if (stemExceptionCount == stemExceptions.length) {
            int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
            stemExceptions = Arrays.copyOf(stemExceptions, newSize);
          }
          stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form
          stemExceptions[stemExceptionCount++] = stemException;
        }
      }

      int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
      if (cmp < 0) {
        throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
      } else {
        encodeFlags(flagsScratch, wordForm);
        int ord = flagLookup.add(flagsScratch.get());
        if (ord < 0) {
          // already exists in our hash
          ord = (-ord) - 1;
        }
        // finalize current entry, and switch "current" if necessary
        if (cmp > 0 && currentEntry != null) {
          Util.toUTF32(currentEntry, scratchInts);
          words.add(scratchInts.get(), currentOrds.get());
        }
        // swap current
        if (cmp > 0 || currentEntry == null) {
          currentEntry = entry;
          currentOrds = new IntsRefBuilder(); // must be this way
        }
        if (hasStemExceptions) {
          currentOrds.append(ord);
          currentOrds.append(stemExceptionID);
        } else {
          currentOrds.append(ord);
        }
      }
    }

    // finalize last entry
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts.get(), currentOrds.get());
    success2 = true;
  } finally {
    IOUtils.closeWhileHandlingException(reader);
    if (success2) {
      Files.delete(sorted);
    } else {
      IOUtils.deleteFilesIgnoringExceptions(sorted);
    }
  }
}
From source file:org.apache.atlas.catalog.query.TermRangeQueryExpression.java
License:Apache License
private boolean compareLowerBound(BytesRef valueBytes) {
  return m_lowerTerm == null || (m_lowerInclusive
      ? valueBytes.compareTo(m_lowerTerm) > 0
      : valueBytes.compareTo(m_lowerTerm) >= 0);
}
From source file:org.apache.atlas.catalog.query.TermRangeQueryExpression.java
License:Apache License
private boolean compareUpperBound(BytesRef valueBytes) {
  return m_upperTerm == null || (m_upperInclusive
      ? valueBytes.compareTo(m_upperTerm) < 0
      : valueBytes.compareTo(m_upperTerm) <= 0);
}
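Together these two helpers implement a term-range test. The following self-contained sketch shows the same pattern in one method; the names (inRange, lower, upper, and the inclusivity flags) are illustrative rather than Atlas API, and the inclusivity handling follows the usual convention for range checks:

import org.apache.lucene.util.BytesRef;

final class RangeCheck {

  // Returns true when value falls inside the bounds; a null bound means
  // that side is unbounded. With lowerInclusive/upperInclusive set, the
  // bound value itself is accepted.
  static boolean inRange(BytesRef value,
                         BytesRef lower, boolean lowerInclusive,
                         BytesRef upper, boolean upperInclusive) {
    if (lower != null) {
      int cmp = value.compareTo(lower);
      if (lowerInclusive ? cmp < 0 : cmp <= 0) {
        return false;
      }
    }
    if (upper != null) {
      int cmp = value.compareTo(upper);
      if (upperInclusive ? cmp > 0 : cmp >= 0) {
        return false;
      }
    }
    return true;
  }
}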
From source file:org.apache.solr.handler.component.TermsComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
  SolrParams params = rb.req.getParams();
  if (!params.getBool(TermsParams.TERMS, false)) return;

  String[] fields = params.getParams(TermsParams.TERMS_FIELD);

  NamedList<Object> termsResult = new SimpleOrderedMap<Object>();
  rb.rsp.add("terms", termsResult);

  if (fields == null || fields.length == 0) return;

  int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
  if (limit < 0) {
    limit = Integer.MAX_VALUE;
  }

  String lowerStr = params.get(TermsParams.TERMS_LOWER);
  String upperStr = params.get(TermsParams.TERMS_UPPER);
  boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
  boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
  boolean sort = !TermsParams.TERMS_SORT_INDEX
      .equals(params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
  int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
  int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
  if (freqmax < 0) {
    freqmax = Integer.MAX_VALUE;
  }
  String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
  String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
  Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

  boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

  final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
  Fields lfields = indexReader.fields();

  for (String field : fields) {
    NamedList<Integer> fieldTerms = new NamedList<Integer>();
    termsResult.add(field, fieldTerms);

    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
      // no terms for this field
      continue;
    }

    FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
    if (ft == null) ft = new StrField();

    // prefix must currently be text
    BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

    BytesRef upperBytes = null;
    if (upperStr != null) {
      upperBytes = new BytesRef();
      ft.readableToIndexed(upperStr, upperBytes);
    }

    BytesRef lowerBytes;
    if (lowerStr == null) {
      // If no lower bound was specified, use the prefix
      lowerBytes = prefixBytes;
    } else {
      lowerBytes = new BytesRef();
      if (raw) {
        // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
        // perhaps we detect if the FieldType is non-character and expect hex if so?
        lowerBytes = new BytesRef(lowerStr);
      } else {
        lowerBytes = new BytesRef();
        ft.readableToIndexed(lowerStr, lowerBytes);
      }
    }

    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term = null;

    if (lowerBytes != null) {
      if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
        termsEnum = null;
      } else {
        term = termsEnum.term();
        // Only advance the enum if we are excluding the lower bound and the lower Term actually matches
        if (lowerIncl == false && term.equals(lowerBytes)) {
          term = termsEnum.next();
        }
      }
    } else {
      // position termsEnum on first term
      term = termsEnum.next();
    }

    int i = 0;
    BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
    CharsRef external = new CharsRef();
    while (term != null && (i < limit || sort)) {
      boolean externalized = false; // did we fill in "external" yet for this term?

      // stop if the prefix doesn't match
      if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

      if (pattern != null) {
        // indexed text or external text?
        // TODO: support "raw" mode?
        ft.indexedToReadable(term, external);
        externalized = true;
        if (!pattern.matcher(external).matches()) {
          term = termsEnum.next();
          continue;
        }
      }

      if (upperBytes != null) {
        int upperCmp = term.compareTo(upperBytes);
        // if we are past the upper term, or equal to it (when don't include upper) then stop.
        if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
      }

      // This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
      int docFreq = termsEnum.docFreq();
      if (docFreq >= freqmin && docFreq <= freqmax) {
        // add the term to the list
        if (sort) {
          queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), docFreq));
        } else {
          // TODO: handle raw somehow
          if (!externalized) {
            ft.indexedToReadable(term, external);
          }
          fieldTerms.add(external.toString(), docFreq);
          i++;
        }
      }

      term = termsEnum.next();
    }

    if (sort) {
      for (CountPair<BytesRef, Integer> item : queue) {
        if (i >= limit) break;
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
        i++;
      }
    }
  }
}
From source file:org.apache.solr.request.PerSegmentSingleValuedFaceting.java
License:Apache License
NamedList<Integer> getFacetCounts(Executor executor) throws IOException {
  CompletionService<SegFacet> completionService = new ExecutorCompletionService<SegFacet>(executor);

  // reuse the translation logic to go from top level set to per-segment set
  baseSet = docs.getTopFilter();

  final List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();

  // The list of pending tasks that aren't immediately submitted
  // TODO: Is there a completion service, or a delegating executor that can
  // limit the number of concurrent tasks submitted to a bigger executor?
  LinkedList<Callable<SegFacet>> pending = new LinkedList<Callable<SegFacet>>();

  int threads = nThreads <= 0 ? Integer.MAX_VALUE : nThreads;

  for (final AtomicReaderContext leave : leaves) {
    final SegFacet segFacet = new SegFacet(leave);

    Callable<SegFacet> task = new Callable<SegFacet>() {
      @Override
      public SegFacet call() throws Exception {
        segFacet.countTerms();
        return segFacet;
      }
    };

    // TODO: if limiting threads, submit by largest segment first?
    if (--threads >= 0) {
      completionService.submit(task);
    } else {
      pending.add(task);
    }
  }

  // now merge the per-segment results
  PriorityQueue<SegFacet> queue = new PriorityQueue<SegFacet>(leaves.size()) {
    @Override
    protected boolean lessThan(SegFacet a, SegFacet b) {
      return a.tempBR.compareTo(b.tempBR) < 0;
    }
  };

  boolean hasMissingCount = false;
  int missingCount = 0;
  for (int i = 0, c = leaves.size(); i < c; i++) {
    SegFacet seg = null;

    try {
      Future<SegFacet> future = completionService.take();
      seg = future.get();
      if (!pending.isEmpty()) {
        completionService.submit(pending.removeFirst());
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    } catch (ExecutionException e) {
      Throwable cause = e.getCause();
      if (cause instanceof RuntimeException) {
        throw (RuntimeException) cause;
      } else {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
            "Error in per-segment faceting on field: " + fieldName, cause);
      }
    }

    if (seg.startTermIndex < seg.endTermIndex) {
      if (seg.startTermIndex == -1) {
        hasMissingCount = true;
        missingCount += seg.counts[0];
        seg.pos = 0;
      } else {
        seg.pos = seg.startTermIndex;
      }
      if (seg.pos < seg.endTermIndex) {
        seg.tenum = seg.si.termsEnum();
        seg.tenum.seekExact(seg.pos);
        seg.tempBR = seg.tenum.term();
        queue.add(seg);
      }
    }
  }

  FacetCollector collector;
  if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
    collector = new CountSortedFacetCollector(offset, limit, mincount);
  } else {
    collector = new IndexSortedFacetCollector(offset, limit, mincount);
  }

  BytesRef val = new BytesRef();

  while (queue.size() > 0) {
    SegFacet seg = queue.top();

    // we will normally end up advancing the term enum for this segment
    // while still using "val", so we need to make a copy since the BytesRef
    // may be shared across calls.
    val.copyBytes(seg.tempBR);

    int count = 0;

    do {
      count += seg.counts[seg.pos - seg.startTermIndex];

      // TODO: OPTIMIZATION...
      // if mincount>0 then seg.pos++ can skip ahead to the next non-zero entry.
      seg.pos++;
      if (seg.pos >= seg.endTermIndex) {
        queue.pop();
        seg = queue.top();
      } else {
        seg.tempBR = seg.tenum.next();
        seg = queue.updateTop();
      }
    } while (seg != null && val.compareTo(seg.tempBR) == 0);

    boolean stop = collector.collect(val, count);
    if (stop) break;
  }

  NamedList<Integer> res = collector.getFacetCounts();

  // convert labels to readable form
  FieldType ft = searcher.getSchema().getFieldType(fieldName);
  int sz = res.size();
  for (int i = 0; i < sz; i++) {
    res.setName(i, ft.indexedToReadable(res.getName(i)));
  }

  if (missing) {
    if (!hasMissingCount) {
      missingCount = SimpleFacets.getFieldMissingCount(searcher, docs, fieldName);
    }
    res.add(null, missingCount);
  }

  return res;
}
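The PriorityQueue here drives a k-way merge of the per-segment term enums: compareTo in lessThan decides which segment currently holds the smallest term, and the inner do/while keeps accumulating counts while the top segment's term compares equal to val.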
From source file:org.apache.solr.uninverting.TestFieldCacheSortRandom.java
License:Apache License
private void testRandomStringSort(SortField.Type type) throws Exception {
  Random random = new Random(random().nextLong());

  final int NUM_DOCS = atLeast(100);
  final Directory dir = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random, dir);
  final boolean allowDups = random.nextBoolean();
  final Set<String> seen = new HashSet<>();
  final int maxLength = TestUtil.nextInt(random, 5, 100);
  if (VERBOSE) {
    System.out.println("TEST: NUM_DOCS=" + NUM_DOCS + " maxLength=" + maxLength + " allowDups=" + allowDups);
  }

  int numDocs = 0;
  final List<BytesRef> docValues = new ArrayList<>();
  // TODO: deletions
  while (numDocs < NUM_DOCS) {
    final Document doc = new Document();

    // 10% of the time, the document is missing the value:
    final BytesRef br;
    if (random().nextInt(10) != 7) {
      final String s;
      if (random.nextBoolean()) {
        s = TestUtil.randomSimpleString(random, maxLength);
      } else {
        s = TestUtil.randomUnicodeString(random, maxLength);
      }

      if (!allowDups) {
        if (seen.contains(s)) {
          continue;
        }
        seen.add(s);
      }

      if (VERBOSE) {
        System.out.println("  " + numDocs + ": s=" + s);
      }

      doc.add(new StringField("stringdv", s, Field.Store.NO));
      docValues.add(new BytesRef(s));
    } else {
      br = null;
      if (VERBOSE) {
        System.out.println("  " + numDocs + ": <missing>");
      }
      docValues.add(null);
    }

    doc.add(new IntPoint("id", numDocs));
    doc.add(new StoredField("id", numDocs));
    writer.addDocument(doc);
    numDocs++;

    if (random.nextInt(40) == 17) {
      // force flush
      writer.getReader().close();
    }
  }

  Map<String, UninvertingReader.Type> mapping = new HashMap<>();
  mapping.put("stringdv", Type.SORTED);
  mapping.put("id", Type.INTEGER_POINT);
  final IndexReader r = UninvertingReader.wrap(writer.getReader(), mapping);
  writer.close();
  if (VERBOSE) {
    System.out.println("  reader=" + r);
  }

  final IndexSearcher s = newSearcher(r, false);
  final int ITERS = atLeast(100);
  for (int iter = 0; iter < ITERS; iter++) {
    final boolean reverse = random.nextBoolean();

    final TopFieldDocs hits;
    final SortField sf;
    final boolean sortMissingLast;
    final boolean missingIsNull;
    sf = new SortField("stringdv", type, reverse);
    sortMissingLast = random().nextBoolean();
    missingIsNull = true;

    if (sortMissingLast) {
      sf.setMissingValue(SortField.STRING_LAST);
    }

    final Sort sort;
    if (random.nextBoolean()) {
      sort = new Sort(sf);
    } else {
      sort = new Sort(sf, SortField.FIELD_DOC);
    }
    final int hitCount = TestUtil.nextInt(random, 1, r.maxDoc() + 20);
    final RandomQuery f = new RandomQuery(random.nextLong(), random.nextFloat(), docValues);
    int queryType = random.nextInt(2);
    if (queryType == 0) {
      hits = s.search(new ConstantScoreQuery(f), hitCount, sort, random.nextBoolean(), random.nextBoolean());
    } else {
      hits = s.search(f, hitCount, sort, random.nextBoolean(), random.nextBoolean());
    }

    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter + " " + hits.totalHits + " hits; topN=" + hitCount
          + "; reverse=" + reverse + "; sortMissingLast=" + sortMissingLast + " sort=" + sort);
    }

    // Compute expected results:
    Collections.sort(f.matchValues, new Comparator<BytesRef>() {
      @Override
      public int compare(BytesRef a, BytesRef b) {
        if (a == null) {
          if (b == null) {
            return 0;
          }
          if (sortMissingLast) {
            return 1;
          } else {
            return -1;
          }
        } else if (b == null) {
          if (sortMissingLast) {
            return -1;
          } else {
            return 1;
          }
        } else {
          return a.compareTo(b);
        }
      }
    });

    if (reverse) {
      Collections.reverse(f.matchValues);
    }
    final List<BytesRef> expected = f.matchValues;
    if (VERBOSE) {
      System.out.println("  expected:");
      for (int idx = 0; idx < expected.size(); idx++) {
        BytesRef br = expected.get(idx);
        if (br == null && missingIsNull == false) {
          br = new BytesRef();
        }
        System.out.println("    " + idx + ": " + (br == null ? "<missing>" : br.utf8ToString()));
        if (idx == hitCount - 1) {
          break;
        }
      }
    }

    if (VERBOSE) {
      System.out.println("  actual:");
      for (int hitIDX = 0; hitIDX < hits.scoreDocs.length; hitIDX++) {
        final FieldDoc fd = (FieldDoc) hits.scoreDocs[hitIDX];
        BytesRef br = (BytesRef) fd.fields[0];
        System.out.println("    " + hitIDX + ": " + (br == null ? "<missing>" : br.utf8ToString())
            + " id=" + s.doc(fd.doc).get("id"));
      }
    }

    for (int hitIDX = 0; hitIDX < hits.scoreDocs.length; hitIDX++) {
      final FieldDoc fd = (FieldDoc) hits.scoreDocs[hitIDX];
      BytesRef br = expected.get(hitIDX);
      if (br == null && missingIsNull == false) {
        br = new BytesRef();
      }

      // Normally, the old codecs (that don't support
      // docsWithField via doc values) will always return
      // an empty BytesRef for the missing case; however,
      // if all docs in a given segment were missing, in
      // that case it will return null! So we must map
      // null here, too:
      BytesRef br2 = (BytesRef) fd.fields[0];
      if (br2 == null && missingIsNull == false) {
        br2 = new BytesRef();
      }

      assertEquals(br, br2);
    }
  }

  r.close();
  dir.close();
}
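The inline comparator in this test shows the usual null-handling idiom around compareTo: missing values are ordered explicitly first or last, and compareTo is only called once both sides are known to be non-null.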
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.range.BinaryRangeAggregator.java
License:Apache License
private static int compare(BytesRef a, BytesRef b, int m) {
  return a == null
      ? (b == null ? 0 : -m)
      : (b == null ? m : a.compareTo(b));
}
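The sign multiplier m controls where null sorts: with m = 1 a null compares as less than any non-null value, and passing m = -1 flips that, with compareTo again handling only the non-null case.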