List of usage examples for org.apache.lucene.util StringHelper startsWith
public static boolean startsWith(BytesRef ref, BytesRef prefix)
true iff the ref starts with the given prefix. From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java
License:Apache License
private void seekToStartTerm(BytesRef target) throws IOException { assert currentFrame.ord == 0; if (term.length < target.length) { term.bytes = ArrayUtil.grow(term.bytes, target.length); }/*from w ww .jav a 2s .c om*/ FST.Arc<BytesRef> arc = arcs[0]; assert arc == currentFrame.arc; for (int idx = 0; idx <= target.length; idx++) { while (true) { final int savNextEnt = currentFrame.nextEnt; final int savePos = currentFrame.suffixesReader.getPosition(); final int saveStartBytePos = currentFrame.startBytePos; final int saveSuffix = currentFrame.suffix; final long saveLastSubFP = currentFrame.lastSubFP; final int saveTermBlockOrd = currentFrame.termState.termBlockOrd; final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm; final boolean isSubBlock = currentFrame.next(); term.length = currentFrame.prefix + currentFrame.suffix; if (term.bytes.length < term.length) { term.bytes = ArrayUtil.grow(term.bytes, term.length); } System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); if (isSubBlock && StringHelper.startsWith(target, term)) { // Recurse currentFrame = pushFrame(getState()); break; } else { final int cmp = term.compareTo(target); if (cmp < 0) { if (currentFrame.nextEnt == currentFrame.entCount) { if (!currentFrame.isLastInFloor) { // Advance to next floor block currentFrame.loadNextFloorBlock(); continue; } else { return; } } continue; } else if (cmp == 0) { if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) { continue; } return; } else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) { // Fallback to prior entry: the semantics of // this method is that the first call to // next() will return the term after the // requested term currentFrame.nextEnt = savNextEnt; currentFrame.lastSubFP = saveLastSubFP; currentFrame.startBytePos = saveStartBytePos; currentFrame.suffix = saveSuffix; currentFrame.suffixesReader.setPosition(savePos); currentFrame.termState.termBlockOrd = saveTermBlockOrd; currentFrame.isAutoPrefixTerm = saveIsAutoPrefixTerm; System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix); term.length = currentFrame.prefix + currentFrame.suffix; // If the last entry was a block we don't // need to bother recursing and pushing to // the last term under it because the first // next() will simply skip the frame anyway return; } } } } assert false; }
From source file:org.apache.solr.codecs.onsql.ONSQLUtil.java
License:Apache License
public static void checkFooter(ChecksumIndexInput input) throws IOException { BytesRefBuilder scratch = new BytesRefBuilder(); String expectedChecksum = String.format(Locale.ROOT, "%020d", input.getChecksum()); ONSQLUtil.readLine(input, scratch);//from w w w . j a v a2 s .c o m if (StringHelper.startsWith(scratch.get(), CHECKSUM) == false) { throw new CorruptIndexException("ONSQL failure: expected checksum line but got " + scratch.get().utf8ToString() + " (resource=" + input + ")"); } String actualChecksum = new BytesRef(scratch.bytes(), CHECKSUM.length, scratch.length() - CHECKSUM.length) .utf8ToString(); if (!expectedChecksum.equals(actualChecksum)) { throw new CorruptIndexException("ONSQL checksum failure: " + actualChecksum + " != " + expectedChecksum + " (resource=" + input + ")"); } if (input.length() != input.getFilePointer()) { throw new CorruptIndexException( "Unexpected stuff at the end of file, please be careful with your text editor! (resource=" + input + ")"); } }
From source file:org.apache.solr.handler.component.TermsComponent.java
License:Apache License
@Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(TermsParams.TERMS, false)) return;/*from w ww . ja v a 2 s . c om*/ String[] fields = params.getParams(TermsParams.TERMS_FIELD); NamedList<Object> termsResult = new SimpleOrderedMap<Object>(); rb.rsp.add("terms", termsResult); if (fields == null || fields.length == 0) return; int limit = params.getInt(TermsParams.TERMS_LIMIT, 10); if (limit < 0) { limit = Integer.MAX_VALUE; } String lowerStr = params.get(TermsParams.TERMS_LOWER); String upperStr = params.get(TermsParams.TERMS_UPPER); boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false); boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true); boolean sort = !TermsParams.TERMS_SORT_INDEX .equals(params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT)); int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1); int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT); if (freqmax < 0) { freqmax = Integer.MAX_VALUE; } String prefix = params.get(TermsParams.TERMS_PREFIX_STR); String regexp = params.get(TermsParams.TERMS_REGEXP_STR); Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null; boolean raw = params.getBool(TermsParams.TERMS_RAW, false); final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader(); Fields lfields = indexReader.fields(); for (String field : fields) { NamedList<Integer> fieldTerms = new NamedList<Integer>(); termsResult.add(field, fieldTerms); Terms terms = lfields == null ? null : lfields.terms(field); if (terms == null) { // no terms for this field continue; } FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field); if (ft == null) ft = new StrField(); // prefix must currently be text BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix); BytesRef upperBytes = null; if (upperStr != null) { upperBytes = new BytesRef(); ft.readableToIndexed(upperStr, upperBytes); } BytesRef lowerBytes; if (lowerStr == null) { // If no lower bound was specified, use the prefix lowerBytes = prefixBytes; } else { lowerBytes = new BytesRef(); if (raw) { // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists // perhaps we detect if the FieldType is non-character and expect hex if so? lowerBytes = new BytesRef(lowerStr); } else { lowerBytes = new BytesRef(); ft.readableToIndexed(lowerStr, lowerBytes); } } TermsEnum termsEnum = terms.iterator(null); BytesRef term = null; if (lowerBytes != null) { if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); //Only advance the enum if we are excluding the lower bound and the lower Term actually matches if (lowerIncl == false && term.equals(lowerBytes)) { term = termsEnum.next(); } } } else { // position termsEnum on first term term = termsEnum.next(); } int i = 0; BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null); CharsRef external = new CharsRef(); while (term != null && (i < limit || sort)) { boolean externalized = false; // did we fill in "external" yet for this term? // stop if the prefix doesn't match if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break; if (pattern != null) { // indexed text or external text? // TODO: support "raw" mode? ft.indexedToReadable(term, external); externalized = true; if (!pattern.matcher(external).matches()) { term = termsEnum.next(); continue; } } if (upperBytes != null) { int upperCmp = term.compareTo(upperBytes); // if we are past the upper term, or equal to it (when don't include upper) then stop. if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break; } // This is a good term in the range. Check if mincount/maxcount conditions are satisfied. int docFreq = termsEnum.docFreq(); if (docFreq >= freqmin && docFreq <= freqmax) { // add the term to the list if (sort) { queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), docFreq)); } else { // TODO: handle raw somehow if (!externalized) { ft.indexedToReadable(term, external); } fieldTerms.add(external.toString(), docFreq); i++; } } term = termsEnum.next(); } if (sort) { for (CountPair<BytesRef, Integer> item : queue) { if (i >= limit) break; ft.indexedToReadable(item.key, external); fieldTerms.add(external.toString(), item.val); i++; } } } }
From source file:org.apache.solr.request.NumericFacets.java
License:Apache License
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort) throws IOException { final boolean zeros = mincount <= 0; mincount = Math.max(mincount, 1); final SchemaField sf = searcher.getSchema().getField(fieldName); final FieldType ft = sf.getType(); final NumericType numericType = ft.getNumericType(); if (numericType == null) { throw new IllegalStateException(); }//from w w w. ja v a2 s . c o m final List<AtomicReaderContext> leaves = searcher.getIndexReader().leaves(); // 1. accumulate final HashTable hashTable = new HashTable(); final Iterator<AtomicReaderContext> ctxIt = leaves.iterator(); AtomicReaderContext ctx = null; FieldCache.Longs longs = null; Bits docsWithField = null; int missingCount = 0; for (DocIterator docsIt = docs.iterator(); docsIt.hasNext();) { final int doc = docsIt.nextDoc(); if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) { do { ctx = ctxIt.next(); } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()); assert doc >= ctx.docBase; switch (numericType) { case LONG: longs = FieldCache.DEFAULT.getLongs(ctx.reader(), fieldName, true); break; case INT: final FieldCache.Ints ints = FieldCache.DEFAULT.getInts(ctx.reader(), fieldName, true); longs = new FieldCache.Longs() { @Override public long get(int docID) { return ints.get(docID); } }; break; case FLOAT: final FieldCache.Floats floats = FieldCache.DEFAULT.getFloats(ctx.reader(), fieldName, true); longs = new FieldCache.Longs() { @Override public long get(int docID) { return NumericUtils.floatToSortableInt(floats.get(docID)); } }; break; case DOUBLE: final FieldCache.Doubles doubles = FieldCache.DEFAULT.getDoubles(ctx.reader(), fieldName, true); longs = new FieldCache.Longs() { @Override public long get(int docID) { return NumericUtils.doubleToSortableLong(doubles.get(docID)); } }; break; default: throw new AssertionError(); } docsWithField = FieldCache.DEFAULT.getDocsWithField(ctx.reader(), fieldName); } long v = longs.get(doc - ctx.docBase); if (v != 0 || docsWithField.get(doc - ctx.docBase)) { hashTable.add(doc, v, 1); } else { ++missingCount; } } // 2. select top-k facet values final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size); final PriorityQueue<Entry> pq; if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) { pq = new PriorityQueue<Entry>(pqSize) { @Override protected boolean lessThan(Entry a, Entry b) { if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) { return true; } else { return false; } } }; } else { pq = new PriorityQueue<Entry>(pqSize) { @Override protected boolean lessThan(Entry a, Entry b) { return a.bits > b.bits; } }; } Entry e = null; for (int i = 0; i < hashTable.bits.length; ++i) { if (hashTable.counts[i] >= mincount) { if (e == null) { e = new Entry(); } e.bits = hashTable.bits[i]; e.count = hashTable.counts[i]; e.docID = hashTable.docIDs[i]; e = pq.insertWithOverflow(e); } } // 4. build the NamedList final ValueSource vs = ft.getValueSource(sf, null); final NamedList<Integer> result = new NamedList<Integer>(); // This stuff is complicated because if facet.mincount=0, the counts needs // to be merged with terms from the terms dict if (!zeros || FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) { // Only keep items we're interested in final Deque<Entry> counts = new ArrayDeque<Entry>(); while (pq.size() > offset) { counts.addFirst(pq.pop()); } // Entries from the PQ first, then using the terms dictionary for (Entry entry : counts) { final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves); final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx)); result.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count); } if (zeros && (limit < 0 || result.size() < limit)) { // need to merge with the term dict if (!sf.indexed()) { throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field " + sf.getName() + " which is not indexed"); } // Add zeros until there are limit results final Set<String> alreadySeen = new HashSet<String>(); while (pq.size() > 0) { Entry entry = pq.pop(); final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves); final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx)); alreadySeen.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase)); } for (int i = 0; i < result.size(); ++i) { alreadySeen.add(result.getName(i)); } final Terms terms = searcher.getAtomicReader().terms(fieldName); if (terms != null) { final String prefixStr = TrieField.getMainValuePrefix(ft); final BytesRef prefix; if (prefixStr != null) { prefix = new BytesRef(prefixStr); } else { prefix = new BytesRef(); } final TermsEnum termsEnum = terms.iterator(null); BytesRef term; switch (termsEnum.seekCeil(prefix)) { case FOUND: case NOT_FOUND: term = termsEnum.term(); break; case END: term = null; break; default: throw new AssertionError(); } final CharsRef spare = new CharsRef(); for (int skipped = hashTable.size; skipped < offset && term != null && StringHelper.startsWith(term, prefix);) { ft.indexedToReadable(term, spare); final String termStr = spare.toString(); if (!alreadySeen.contains(termStr)) { ++skipped; } term = termsEnum.next(); } for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) { ft.indexedToReadable(term, spare); final String termStr = spare.toString(); if (!alreadySeen.contains(termStr)) { result.add(termStr, 0); } } } } } else { // sort=index, mincount=0 and we have less than limit items // => Merge the PQ and the terms dictionary on the fly if (!sf.indexed()) { throw new IllegalStateException("Cannot use " + FacetParams.FACET_SORT + "=" + FacetParams.FACET_SORT_INDEX + " on a field which is not indexed"); } final Map<String, Integer> counts = new HashMap<String, Integer>(); while (pq.size() > 0) { final Entry entry = pq.pop(); final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves); final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx)); counts.put(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count); } final Terms terms = searcher.getAtomicReader().terms(fieldName); if (terms != null) { final String prefixStr = TrieField.getMainValuePrefix(ft); final BytesRef prefix; if (prefixStr != null) { prefix = new BytesRef(prefixStr); } else { prefix = new BytesRef(); } final TermsEnum termsEnum = terms.iterator(null); BytesRef term; switch (termsEnum.seekCeil(prefix)) { case FOUND: case NOT_FOUND: term = termsEnum.term(); break; case END: term = null; break; default: throw new AssertionError(); } final CharsRef spare = new CharsRef(); for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) { term = termsEnum.next(); } for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) { ft.indexedToReadable(term, spare); final String termStr = spare.toString(); Integer count = counts.get(termStr); if (count == null) { count = 0; } result.add(termStr, count); } } } if (missing) { result.add(null, missingCount); } return result; }
From source file:org.apache.solr.request.SimpleFacets.java
License:Apache License
/** * Returns a list of terms in the specified field along with the * corresponding count of documents in the set that match that constraint. * This method uses the FilterCache to get the intersection count between <code>docs</code> * and the DocSet for each term in the filter. * * @see FacetParams#FACET_LIMIT// w w w. j a v a 2s . c o m * @see FacetParams#FACET_ZEROS * @see FacetParams#FACET_MISSING */ public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix) throws IOException { /* :TODO: potential optimization... * cache the Terms with the highest docFreq and try them first * don't enum if we get our max from them */ // Minimum term docFreq in order to use the filterCache for that term. int minDfFilterCache = params.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0); // make sure we have a set that is fast for random access, if we will use it for that DocSet fastForRandomSet = docs; if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) { SortedIntDocSet sset = (SortedIntDocSet) docs; fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size()); } IndexSchema schema = searcher.getSchema(); AtomicReader r = searcher.getAtomicReader(); FieldType ft = schema.getFieldType(field); boolean sortByCount = sort.equals("count") || sort.equals("true"); final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1; final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null; final NamedList<Integer> res = new NamedList<Integer>(); int min = mincount - 1; // the smallest value in the top 'N' values int off = offset; int lim = limit >= 0 ? limit : Integer.MAX_VALUE; BytesRef startTermBytes = null; if (prefix != null) { String indexedPrefix = ft.toInternal(prefix); startTermBytes = new BytesRef(indexedPrefix); } Fields fields = r.fields(); Terms terms = fields == null ? null : fields.terms(field); TermsEnum termsEnum = null; SolrIndexSearcher.DocsEnumState deState = null; BytesRef term = null; if (terms != null) { termsEnum = terms.iterator(null); // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for // facet.offset when sorting by index order. if (startTermBytes != null) { if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) { termsEnum = null; } else { term = termsEnum.term(); } } else { // position termsEnum on first term term = termsEnum.next(); } } DocsEnum docsEnum = null; CharsRef charsRef = new CharsRef(10); if (docs.size() >= mincount) { while (term != null) { if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes)) break; int df = termsEnum.docFreq(); // If we are sorting, we can use df>min (rather than >=) since we // are going in index order. For certain term distributions this can // make a large difference (for example, many terms with df=1). if (df > 0 && df > min) { int c; if (df >= minDfFilterCache) { // use the filter cache if (deState == null) { deState = new SolrIndexSearcher.DocsEnumState(); deState.fieldName = field; deState.liveDocs = r.getLiveDocs(); deState.termsEnum = termsEnum; deState.docsEnum = docsEnum; } c = searcher.numDocs(docs, deState); docsEnum = deState.docsEnum; } else { // iterate over TermDocs to calculate the intersection // TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this? // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl) // TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet? docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE); c = 0; if (docsEnum instanceof MultiDocsEnum) { MultiDocsEnum.EnumWithSlice[] subs = ((MultiDocsEnum) docsEnum).getSubs(); int numSubs = ((MultiDocsEnum) docsEnum).getNumSubs(); for (int subindex = 0; subindex < numSubs; subindex++) { MultiDocsEnum.EnumWithSlice sub = subs[subindex]; if (sub.docsEnum == null) continue; int base = sub.slice.start; int docid; while ((docid = sub.docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) c++; } } } else { int docid; while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) c++; } } } if (sortByCount) { if (c > min) { BytesRef termCopy = BytesRef.deepCopyOf(term); queue.add(new CountPair<BytesRef, Integer>(termCopy, c)); if (queue.size() >= maxsize) min = queue.last().val; } } else { if (c >= mincount && --off < 0) { if (--lim < 0) break; ft.indexedToReadable(term, charsRef); res.add(charsRef.toString(), c); } } } term = termsEnum.next(); } } if (sortByCount) { for (CountPair<BytesRef, Integer> p : queue) { if (--off >= 0) continue; if (--lim < 0) break; ft.indexedToReadable(p.key, charsRef); res.add(charsRef.toString(), p.val); } } if (missing) { res.add(null, getFieldMissingCount(searcher, docs, field)); } return res; }
From source file:org.apache.solr.search.facet.FacetFieldProcessorByEnumTermsStream.java
License:Apache License
private SimpleOrderedMap<Object> _nextBucket() throws IOException { DocSet termSet = null;//from w w w .j a v a 2 s .c om try { while (term != null) { if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes)) { break; } int df = termsEnum.docFreq(); if (df < effectiveMincount) { term = termsEnum.next(); continue; } if (termSet != null) { // termSet.decref(); // OFF-HEAP termSet = null; } int c = 0; if (hasSubFacets || df >= minDfFilterCache) { // use the filter cache if (deState == null) { deState = new SolrIndexSearcher.DocsEnumState(); deState.fieldName = sf.getName(); deState.liveDocs = fcontext.searcher.getSlowAtomicReader().getLiveDocs(); deState.termsEnum = termsEnum; deState.postingsEnum = postingsEnum; deState.minSetSizeCached = minDfFilterCache; } if (hasSubFacets || !countOnly) { DocSet termsAll = fcontext.searcher.getDocSet(deState); termSet = docs.intersection(termsAll); // termsAll.decref(); // OFF-HEAP c = termSet.size(); } else { c = fcontext.searcher.numDocs(docs, deState); } postingsEnum = deState.postingsEnum; resetStats(); if (!countOnly) { collect(termSet, 0); } } else { // We don't need the docset here (meaning no sub-facets). // if countOnly, then we are calculating some other stats... resetStats(); // lazy convert to fastForRandomSet if (fastForRandomSet == null) { fastForRandomSet = docs; if (docs instanceof SortedIntDocSet) { // OFF-HEAP todo: also check for native version SortedIntDocSet sset = (SortedIntDocSet) docs; fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size()); } } // iterate over TermDocs to calculate the intersection postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); if (postingsEnum instanceof MultiPostingsEnum) { MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs(); int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs(); for (int subindex = 0; subindex < numSubs; subindex++) { MultiPostingsEnum.EnumWithSlice sub = subs[subindex]; if (sub.postingsEnum == null) continue; int base = sub.slice.start; int docid; if (countOnly) { while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) c++; } } else { setNextReader(leaves[sub.slice.readerIndex]); while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid + base)) { c++; collect(docid, 0); } } } } } else { int docid; if (countOnly) { while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) c++; } } else { setNextReader(leaves[0]); while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (fastForRandomSet.exists(docid)) { c++; collect(docid, 0); } } } } } if (c < effectiveMincount) { term = termsEnum.next(); continue; } // handle offset and limit if (bucketsToSkip > 0) { bucketsToSkip--; term = termsEnum.next(); continue; } if (freq.limit >= 0 && ++bucketsReturned > freq.limit) { return null; } // set count in case other stats depend on it countAcc.incrementCount(0, c); // OK, we have a good bucket to return... first get bucket value before moving to next term Object bucketVal = sf.getType().toObject(sf, term); TermQuery bucketQuery = hasSubFacets ? new TermQuery(new Term(freq.field, term)) : null; term = termsEnum.next(); SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>(); bucket.add("val", bucketVal); addStats(bucket, 0); if (hasSubFacets) { processSubs(bucket, bucketQuery, termSet); } // TODO... termSet needs to stick around for streaming sub-facets? return bucket; } } finally { if (termSet != null) { // termSet.decref(); // OFF-HEAP termSet = null; } } // end of the iteration return null; }
From source file:org.apache.solr.uninverting.DocTermOrds.java
License:Apache License
/** Call this only once (if you subclass!) */ protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException { final FieldInfo info = reader.getFieldInfos().fieldInfo(field); if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) { throw new IllegalStateException( "Type mismatch: " + field + " was indexed as " + info.getDocValuesType()); }/* ww w.ja va 2s. c o m*/ //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix); final long startTime = System.nanoTime(); prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix); final int maxDoc = reader.maxDoc(); final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number final int[] lastTerm = new int[maxDoc]; // last term we saw for this document final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) final Terms terms = reader.terms(field); if (terms == null) { // No terms return; } final TermsEnum te = terms.iterator(); final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef(); //System.out.println("seekStart=" + seekStart.utf8ToString()); if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) { // No terms match return; } // For our "term index wrapper" final List<BytesRef> indexedTerms = new ArrayList<>(); final PagedBytes indexedTermsBytes = new PagedBytes(15); // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. byte[] tempArr = new byte[12]; // // enumerate all terms, and build an intermediate form of the un-inverted field. // // During this intermediate form, every document has a (potential) byte[] // and the int[maxDoc()] array either contains the termNumber list directly // or the *end* offset of the termNumber list in its byte array (for faster // appending and faster creation of the final form). // // idea... if things are too large while building, we could do a range of docs // at a time (but it would be a fair amount slower to build) // could also do ranges in parallel to take advantage of multiple CPUs // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) // values. This requires going over the field first to find the most // frequent terms ahead of time. int termNum = 0; postingsEnum = null; // Loop begins with te positioned to first term (we call // seek above): for (;;) { final BytesRef t = te.term(); if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) { break; } //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); visitTerm(te, termNum); if ((termNum & indexIntervalMask) == 0) { // Index this term sizeOfIndexedStrings += t.length; BytesRef indexedTerm = new BytesRef(); indexedTermsBytes.copy(t, indexedTerm); // TODO: really should 1) strip off useless suffix, // and 2) use FST not array/PagedBytes indexedTerms.add(indexedTerm); } final int df = te.docFreq(); if (df <= maxTermDocFreq) { postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE); // dF, but takes deletions into account int actualDF = 0; for (;;) { int doc = postingsEnum.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } //System.out.println(" chunk=" + chunk + " docs"); actualDF++; termInstances++; //System.out.println(" docID=" + doc); // add TNUM_OFFSET to the term number to make room for special reserved values: // 0 (end term) and 1 (index into byte array follows) int delta = termNum - lastTerm[doc] + TNUM_OFFSET; lastTerm[doc] = termNum; int val = index[doc]; if ((val & 0xff) == 1) { // index into byte array (actually the end of // the doc-specific byte[] when building) int pos = val >>> 8; int ilen = vIntSize(delta); byte[] arr = bytes[doc]; int newend = pos + ilen; if (newend > arr.length) { // We avoid a doubling strategy to lower memory usage. // this faceting method isn't for docs with many terms. // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary. // TODO: figure out what array lengths we can round up to w/o actually using more memory // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? // It should be safe to round up to the nearest 32 bits in any case. int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment byte[] newarr = new byte[newLen]; System.arraycopy(arr, 0, newarr, 0, pos); arr = newarr; bytes[doc] = newarr; } pos = writeInt(delta, arr, pos); index[doc] = (pos << 8) | 1; // update pointer to end index in byte[] } else { // OK, this int has data in it... find the end (a zero starting byte - not // part of another number, hence not following a byte with the high bit set). int ipos; if (val == 0) { ipos = 0; } else if ((val & 0x0000ff80) == 0) { ipos = 1; } else if ((val & 0x00ff8000) == 0) { ipos = 2; } else if ((val & 0xff800000) == 0) { ipos = 3; } else { ipos = 4; } //System.out.println(" ipos=" + ipos); int endPos = writeInt(delta, tempArr, ipos); //System.out.println(" endpos=" + endPos); if (endPos <= 4) { //System.out.println(" fits!"); // value will fit in the integer... move bytes back for (int j = ipos; j < endPos; j++) { val |= (tempArr[j] & 0xff) << (j << 3); } index[doc] = val; } else { // value won't fit... move integer into byte[] for (int j = 0; j < ipos; j++) { tempArr[j] = (byte) val; val >>>= 8; } // point at the end index in the byte[] index[doc] = (endPos << 8) | 1; bytes[doc] = tempArr; tempArr = new byte[12]; } } } setActualDocFreq(termNum, actualDF); } termNum++; if (te.next() == null) { break; } } numTermsInField = termNum; long midPoint = System.nanoTime(); if (termInstances == 0) { // we didn't invert anything // lower memory consumption. tnums = null; } else { this.index = index; // // transform intermediate form into the final form, building a single byte[] // at a time, and releasing the intermediate byte[]s as we go to avoid // increasing the memory footprint. // for (int pass = 0; pass < 256; pass++) { byte[] target = tnums[pass]; int pos = 0; // end in target; if (target != null) { pos = target.length; } else { target = new byte[4096]; } // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx // where pp is the pass (which array we are building), and xx is all values. // each pass shares the same byte[] for termNumber lists. for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) { int lim = Math.min(docbase + (1 << 16), maxDoc); for (int doc = docbase; doc < lim; doc++) { //System.out.println(" pass=" + pass + " process docID=" + doc); int val = index[doc]; if ((val & 0xff) == 1) { int len = val >>> 8; //System.out.println(" ptr pos=" + pos); index[doc] = (pos << 8) | 1; // change index to point to start of array if ((pos & 0xff000000) != 0) { // we only have 24 bits for the array index throw new IllegalStateException( "Too many values for UnInvertedField faceting on field " + field); } byte[] arr = bytes[doc]; /* for(byte b : arr) { //System.out.println(" b=" + Integer.toHexString((int) b)); } */ bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM if (target.length <= pos + len) { int newlen = target.length; /*** we don't have to worry about the array getting too large * since the "pos" param will overflow first (only 24 bits available) if ((newlen<<1) <= 0) { // overflow... newlen = Integer.MAX_VALUE; if (newlen <= pos + len) { throw new SolrException(400,"Too many terms to uninvert field!"); } } else { while (newlen <= pos + len) newlen<<=1; // doubling strategy } ****/ while (newlen <= pos + len) newlen <<= 1; // doubling strategy byte[] newtarget = new byte[newlen]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } System.arraycopy(arr, 0, target, pos, len); pos += len + 1; // skip single byte at end and leave it 0 for terminator } } } // shrink array if (pos < target.length) { byte[] newtarget = new byte[pos]; System.arraycopy(target, 0, newtarget, 0, pos); target = newtarget; } tnums[pass] = target; if ((pass << 16) > maxDoc) break; } } indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]); long endTime = System.nanoTime(); total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS); phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS); }
From source file:org.apache.solr.uninverting.TestDocTermOrds.java
License:Apache License
public void testRandomWithPrefix() throws Exception { Directory dir = newDirectory();/* ww w. j av a 2s. c o m*/ final Set<String> prefixes = new HashSet<>(); final int numPrefix = TestUtil.nextInt(random(), 2, 7); if (VERBOSE) { System.out.println("TEST: use " + numPrefix + " prefixes"); } while (prefixes.size() < numPrefix) { prefixes.add(TestUtil.randomRealisticUnicodeString(random())); //prefixes.add(_TestUtil.randomSimpleString(random)); } final String[] prefixesArray = prefixes.toArray(new String[prefixes.size()]); final int NUM_TERMS = atLeast(20); final Set<BytesRef> terms = new HashSet<>(); while (terms.size() < NUM_TERMS) { final String s = prefixesArray[random().nextInt(prefixesArray.length)] + TestUtil.randomRealisticUnicodeString(random()); //final String s = prefixesArray[random.nextInt(prefixesArray.length)] + _TestUtil.randomSimpleString(random); if (s.length() > 0) { terms.add(new BytesRef(s)); } } final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]); Arrays.sort(termsArray); final int NUM_DOCS = atLeast(100); IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); // Sometimes swap in codec that impls ord(): if (random().nextInt(10) == 7) { Codec codec = TestUtil.alwaysPostingsFormat(TestUtil.getPostingsFormatWithOrds(random())); conf.setCodec(codec); } final RandomIndexWriter w = new RandomIndexWriter(random(), dir, conf); final int[][] idToOrds = new int[NUM_DOCS][]; final Set<Integer> ordsForDocSet = new HashSet<>(); for (int id = 0; id < NUM_DOCS; id++) { Document doc = new Document(); doc.add(new LegacyIntField("id", id, Field.Store.YES)); final int termCount = TestUtil.nextInt(random(), 0, 20 * RANDOM_MULTIPLIER); while (ordsForDocSet.size() < termCount) { ordsForDocSet.add(random().nextInt(termsArray.length)); } final int[] ordsForDoc = new int[termCount]; int upto = 0; if (VERBOSE) { System.out.println("TEST: doc id=" + id); } for (int ord : ordsForDocSet) { ordsForDoc[upto++] = ord; Field field = newStringField("field", termsArray[ord].utf8ToString(), Field.Store.NO); if (VERBOSE) { System.out.println(" f=" + termsArray[ord].utf8ToString()); } doc.add(field); } ordsForDocSet.clear(); Arrays.sort(ordsForDoc); idToOrds[id] = ordsForDoc; w.addDocument(doc); } final DirectoryReader r = w.getReader(); w.close(); if (VERBOSE) { System.out.println("TEST: reader=" + r); } LeafReader slowR = SlowCompositeReaderWrapper.wrap(r); TestUtil.checkReader(slowR); for (String prefix : prefixesArray) { final BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix); final int[][] idToOrdsPrefix = new int[NUM_DOCS][]; for (int id = 0; id < NUM_DOCS; id++) { final int[] docOrds = idToOrds[id]; final List<Integer> newOrds = new ArrayList<>(); for (int ord : idToOrds[id]) { if (StringHelper.startsWith(termsArray[ord], prefixRef)) { newOrds.add(ord); } } final int[] newOrdsArray = new int[newOrds.size()]; int upto = 0; for (int ord : newOrds) { newOrdsArray[upto++] = ord; } idToOrdsPrefix[id] = newOrdsArray; } for (LeafReaderContext ctx : r.leaves()) { if (VERBOSE) { System.out.println("\nTEST: sub=" + ctx.reader()); } verify(ctx.reader(), idToOrdsPrefix, termsArray, prefixRef); } // Also test top-level reader: its enum does not support // ord, so this forces the OrdWrapper to run: if (VERBOSE) { System.out.println("TEST: top reader"); } verify(slowR, idToOrdsPrefix, termsArray, prefixRef); } FieldCache.DEFAULT.purgeByCacheKey(slowR.getCoreCacheKey()); r.close(); dir.close(); }
From source file:org.apache.solr.uninverting.TestDocTermOrds.java
License:Apache License
private void verify(LeafReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) throws Exception { final DocTermOrds dto = new DocTermOrds(r, r.getLiveDocs(), "field", prefixRef, Integer.MAX_VALUE, TestUtil.nextInt(random(), 2, 10)); final NumericDocValues docIDToID = FieldCache.DEFAULT.getNumerics(r, "id", FieldCache.LEGACY_INT_PARSER); /*/*from w ww. j a v a2 s. c o m*/ for(int docID=0;docID<subR.maxDoc();docID++) { System.out.println(" docID=" + docID + " id=" + docIDToID[docID]); } */ if (VERBOSE) { System.out.println("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString())); System.out.println("TEST: all TERMS:"); TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(); int ord = 0; while (allTE.next() != null) { System.out.println(" ord=" + (ord++) + " term=" + allTE.term().utf8ToString()); } } //final TermsEnum te = subR.fields().terms("field").iterator(); final TermsEnum te = dto.getOrdTermsEnum(r); if (dto.numTerms() == 0) { if (prefixRef == null) { assertNull(MultiFields.getTerms(r, "field")); } else { Terms terms = MultiFields.getTerms(r, "field"); if (terms != null) { TermsEnum termsEnum = terms.iterator(); TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef); if (result != TermsEnum.SeekStatus.END) { assertFalse( "term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), StringHelper.startsWith(termsEnum.term(), prefixRef)); } else { // ok } } else { // ok } } return; } if (VERBOSE) { System.out.println("TEST: TERMS:"); te.seekExact(0); while (true) { System.out.println(" ord=" + te.ord() + " term=" + te.term().utf8ToString()); if (te.next() == null) { break; } } } SortedSetDocValues iter = dto.iterator(r); for (int docID = 0; docID < r.maxDoc(); docID++) { assertEquals(docID, docIDToID.nextDoc()); if (docID > iter.docID()) { iter.nextDoc(); } if (docID < iter.docID()) { int[] answers = idToOrds[(int) docIDToID.longValue()]; assertEquals(0, answers.length); continue; } if (VERBOSE) { System.out.println( "TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.longValue() + ")"); } final int[] answers = idToOrds[(int) docIDToID.longValue()]; int upto = 0; long ord; while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { te.seekExact(ord); final BytesRef expected = termsArray[answers[upto++]]; if (VERBOSE) { System.out.println(" exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString()); } assertEquals("expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord=" + ord, expected, te.term()); } assertEquals(answers.length, upto); } }
From source file:org.buzzinate.lezhi.query.LezhiTermsEnum.java
License:Apache License
protected AcceptStatus accept(BytesRef term) throws IOException { System.out.println(term.utf8ToString() + ", docfreq=" + docFreq()); if (StringHelper.startsWith(term, prefixRef)) { return AcceptStatus.YES; } else {//from w w w.java 2 s . com return AcceptStatus.END; } }