Example usage for org.apache.lucene.util StringHelper startsWith

List of usage examples for org.apache.lucene.util StringHelper startsWith

Introduction

On this page you can find example usages of org.apache.lucene.util.StringHelper.startsWith.

Prototype

public static boolean startsWith(BytesRef ref, BytesRef prefix) 

Source Link

Document

Returns true iff the ref starts with the given prefix.

Usage

From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java

License:Apache License

/**
 * Positions this enum relative to {@code target} so that, per the inline note below, the first
 * subsequent call to {@code next()} returns the term after the requested one.
 *
 * <p>Entries of the current frame are scanned one by one. When an entry is a sub-block whose
 * term is a prefix of {@code target}, a new frame is pushed and scanning continues inside it.
 * Otherwise the entry's term is compared with {@code target} to decide whether to keep
 * scanning, stop on it, or rewind to the entry before it.
 *
 * @param target the term to position against
 * @throws IOException declared because frame operations such as {@code next()} and
 *         {@code loadNextFloorBlock()} are invoked; NOTE(review): presumably raised on index
 *         read failures - confirm against the frame implementation
 */
private void seekToStartTerm(BytesRef target) throws IOException {
    assert currentFrame.ord == 0;
    // Make sure the scratch term buffer can hold at least target.length bytes up front.
    if (term.length < target.length) {
        term.bytes = ArrayUtil.grow(term.bytes, target.length);
    }
    FST.Arc<BytesRef> arc = arcs[0];
    assert arc == currentFrame.arc;

    // idx is not read inside the loop body: each outer iteration corresponds to one frame
    // push (the only way the inner loop is left via break), bounded by target.length + 1.
    for (int idx = 0; idx <= target.length; idx++) {

        while (true) {
            // Snapshot the frame's cursor state so it can be rewound if the entry read by
            // next() turns out to sort after the target.
            final int savNextEnt = currentFrame.nextEnt;
            final int savePos = currentFrame.suffixesReader.getPosition();
            final int saveStartBytePos = currentFrame.startBytePos;
            final int saveSuffix = currentFrame.suffix;
            final long saveLastSubFP = currentFrame.lastSubFP;
            final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;
            final boolean saveIsAutoPrefixTerm = currentFrame.isAutoPrefixTerm;

            final boolean isSubBlock = currentFrame.next();

            // Rebuild the current term as <frame prefix> + <entry suffix>, growing the buffer
            // if needed.
            term.length = currentFrame.prefix + currentFrame.suffix;
            if (term.bytes.length < term.length) {
                term.bytes = ArrayUtil.grow(term.bytes, term.length);
            }
            System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes,
                    currentFrame.prefix, currentFrame.suffix);

            if (isSubBlock && StringHelper.startsWith(target, term)) {
                // Recurse
                currentFrame = pushFrame(getState());
                break;
            } else {
                final int cmp = term.compareTo(target);
                if (cmp < 0) {
                    // Entry sorts before the target: keep scanning, moving on to the next
                    // floor block when this one is exhausted, or give up if there is none.
                    if (currentFrame.nextEnt == currentFrame.entCount) {
                        if (!currentFrame.isLastInFloor) {
                            // Advance to next floor block
                            currentFrame.loadNextFloorBlock();
                            continue;
                        } else {
                            return;
                        }
                    }
                    continue;
                } else if (cmp == 0) {
                    // Exact match; skip it only if it is an auto-prefix term and those are
                    // not allowed.
                    if (allowAutoPrefixTerms == false && currentFrame.isAutoPrefixTerm) {
                        continue;
                    }
                    return;
                } else if (allowAutoPrefixTerms || currentFrame.isAutoPrefixTerm == false) {
                    // Fallback to prior entry: the semantics of
                    // this method is that the first call to
                    // next() will return the term after the
                    // requested term
                    currentFrame.nextEnt = savNextEnt;
                    currentFrame.lastSubFP = saveLastSubFP;
                    currentFrame.startBytePos = saveStartBytePos;
                    currentFrame.suffix = saveSuffix;
                    currentFrame.suffixesReader.setPosition(savePos);
                    currentFrame.termState.termBlockOrd = saveTermBlockOrd;
                    currentFrame.isAutoPrefixTerm = saveIsAutoPrefixTerm;
                    System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes,
                            currentFrame.prefix, currentFrame.suffix);
                    term.length = currentFrame.prefix + currentFrame.suffix;
                    // If the last entry was a block we don't
                    // need to bother recursing and pushing to
                    // the last term under it because the first
                    // next() will simply skip the frame anyway
                    return;
                }
                // Remaining case: the entry sorts after the target but is a disallowed
                // auto-prefix term, so the loop simply moves on to the next entry.
            }
        }
    }

    // Only reachable if the outer loop runs out of iterations without returning.
    assert false;
}

From source file:org.apache.solr.codecs.onsql.ONSQLUtil.java

License:Apache License

/**
 * Verifies the checksum footer line of an ONSQL file.
 *
 * <p>The checksum accumulated by {@code input} so far is formatted as a zero-padded 20-digit
 * decimal string, then the next line is read and must start with {@code CHECKSUM} followed by
 * exactly that string. Nothing may follow the footer line.
 *
 * @param input the checksumming input positioned at the start of the footer line
 * @throws CorruptIndexException if the footer line is missing, the checksum differs, or data
 *         remains after the footer
 * @throws IOException declared by the underlying read
 */
public static void checkFooter(ChecksumIndexInput input) throws IOException {
    // The expected value is taken before the footer line itself is consumed.
    final String expected = String.format(Locale.ROOT, "%020d", input.getChecksum());

    final BytesRefBuilder line = new BytesRefBuilder();
    ONSQLUtil.readLine(input, line);

    final boolean hasChecksumHeader = StringHelper.startsWith(line.get(), CHECKSUM);
    if (!hasChecksumHeader) {
        final String found = line.get().utf8ToString();
        throw new CorruptIndexException("ONSQL failure: expected checksum line but got "
                + found + " (resource=" + input + ")");
    }

    // Everything after the CHECKSUM header on the line is the recorded checksum text.
    final int headerLength = CHECKSUM.length;
    final BytesRef recorded = new BytesRef(line.bytes(), headerLength, line.length() - headerLength);
    final String actual = recorded.utf8ToString();
    if (!expected.equals(actual)) {
        throw new CorruptIndexException("ONSQL checksum failure: " + actual + " != " + expected
                + " (resource=" + input + ")");
    }

    final boolean atEndOfFile = input.length() == input.getFilePointer();
    if (!atEndOfFile) {
        throw new CorruptIndexException(
                "Unexpected stuff at the end of file, please be careful with your text editor! (resource="
                        + input + ")");
    }
}

From source file:org.apache.solr.handler.component.TermsComponent.java

License:Apache License

/**
 * Builds the "terms" section of the response: for every requested field, the indexed terms
 * (optionally restricted by lower/upper bound, prefix, regular expression and document
 * frequency range) together with their document frequencies.
 *
 * <p>Does nothing unless the {@code TermsParams.TERMS} flag is set. An empty "terms" entry is
 * still added to the response when no fields are requested. Results are ordered by count
 * unless the sort parameter equals {@code TermsParams.TERMS_SORT_INDEX}, in which case they
 * are emitted in index order.
 *
 * @param rb the response builder carrying the request and the response to fill
 * @throws IOException if reading terms from the index fails
 */
@Override
public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(TermsParams.TERMS, false)) {
        return;
    }

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    NamedList<Object> termsResult = new SimpleOrderedMap<Object>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) {
        return;
    }

    // A negative limit means "no limit".
    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
        limit = Integer.MAX_VALUE;
    }

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    // true => order by count (the default); false => index order
    boolean sort = !TermsParams.TERMS_SORT_INDEX
            .equals(params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
        freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    // prefix must currently be text; it does not depend on the field, so it is built once
    // and only ever read afterwards
    final BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

    for (String field : fields) {
        NamedList<Integer> fieldTerms = new NamedList<Integer>();
        termsResult.add(field, fieldTerms);

        Terms terms = lfields == null ? null : lfields.terms(field);
        if (terms == null) {
            // no terms for this field
            continue;
        }

        // In raw mode, or when the schema does not know the field, fall back to StrField.
        FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
        if (ft == null) {
            ft = new StrField();
        }

        BytesRef upperBytes = null;
        if (upperStr != null) {
            upperBytes = new BytesRef();
            ft.readableToIndexed(upperStr, upperBytes);
        }

        final BytesRef lowerBytes;
        if (lowerStr == null) {
            // If no lower bound was specified, use the prefix
            lowerBytes = prefixBytes;
        } else if (raw) {
            // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
            // perhaps we detect if the FieldType is non-character and expect hex if so?
            lowerBytes = new BytesRef(lowerStr);
        } else {
            lowerBytes = new BytesRef();
            ft.readableToIndexed(lowerStr, lowerBytes);
        }

        TermsEnum termsEnum = terms.iterator(null);
        BytesRef term = null;

        if (lowerBytes != null) {
            if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
                // Nothing at or after the lower bound; term stays null so the scan is skipped.
                termsEnum = null;
            } else {
                term = termsEnum.term();
                //Only advance the enum if we are excluding the lower bound and the lower Term actually matches
                if (lowerIncl == false && term.equals(lowerBytes)) {
                    term = termsEnum.next();
                }
            }
        } else {
            // position termsEnum on first term
            term = termsEnum.next();
        }

        int i = 0;
        // When sorting by count, every matching term has to be visited; the bounded set
        // is created with capacity 'limit'.
        BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort
                ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit)
                : null);
        CharsRef external = new CharsRef();
        while (term != null && (i < limit || sort)) {
            boolean externalized = false; // did we fill in "external" yet for this term?

            // stop if the prefix doesn't match
            if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) {
                break;
            }

            if (pattern != null) {
                // indexed text or external text?
                // TODO: support "raw" mode?
                ft.indexedToReadable(term, external);
                externalized = true;
                if (!pattern.matcher(external).matches()) {
                    term = termsEnum.next();
                    continue;
                }
            }

            if (upperBytes != null) {
                int upperCmp = term.compareTo(upperBytes);
                // if we are past the upper term, or equal to it (when don't include upper) then stop.
                if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) {
                    break;
                }
            }

            // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
            int docFreq = termsEnum.docFreq();
            if (docFreq >= freqmin && docFreq <= freqmax) {
                // add the term to the list
                if (sort) {
                    // The enum may reuse its BytesRef, so keep a private copy.
                    queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), docFreq));
                } else {

                    // TODO: handle raw somehow
                    if (!externalized) {
                        ft.indexedToReadable(term, external);
                    }
                    fieldTerms.add(external.toString(), docFreq);
                    i++;
                }
            }

            term = termsEnum.next();
        }

        if (sort) {
            // Emit the collected terms in the set's iteration order, up to 'limit' entries.
            for (CountPair<BytesRef, Integer> item : queue) {
                if (i >= limit) {
                    break;
                }
                ft.indexedToReadable(item.key, external);
                fieldTerms.add(external.toString(), item.val);
                i++;
            }
        }
    }
}

From source file:org.apache.solr.request.NumericFacets.java

License:Apache License

/**
 * Computes facet counts for a numeric field over the given document set.
 *
 * <p>The work is done in three steps: (1) read each document's value from the field cache
 * and accumulate per-value counts in a hash table, (2) select the top entries with a priority
 * queue ordered by count or by value, (3) build the result list, merging in zero-count terms
 * from the terms dictionary when {@code mincount <= 0}.
 *
 * @param searcher  searcher providing the schema, the index leaves and the terms dictionary
 * @param docs      documents to count over
 * @param fieldName name of the numeric field to facet on
 * @param offset    number of leading facet entries to skip
 * @param limit     maximum number of entries to return; a negative value means no limit
 * @param mincount  minimum count for an entry to be returned; values {@code <= 0} also request
 *                  zero-count terms
 * @param missing   whether to append an entry with a {@code null} name holding the number of
 *                  documents without a value
 * @param sort      sort mode; {@code FacetParams.FACET_SORT_COUNT} and
 *                  {@code FacetParams.FACET_SORT_COUNT_LEGACY} select count order, anything
 *                  else is handled as value/index order
 * @return facet values (as readable strings) mapped to their counts
 * @throws IllegalStateException if the field type is not numeric, or if zero counts / index
 *         order over the terms dictionary are needed on a field that is not indexed
 * @throws IOException if reading from the index fails
 */
public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName,
        int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
    // Remember whether zero-count entries were requested, then clamp mincount to at least 1
    // for the counting phase.
    final boolean zeros = mincount <= 0;
    mincount = Math.max(mincount, 1);
    final SchemaField sf = searcher.getSchema().getField(fieldName);
    final FieldType ft = sf.getType();
    final NumericType numericType = ft.getNumericType();
    if (numericType == null) {
        throw new IllegalStateException();
    }
    final List<AtomicReaderContext> leaves = searcher.getIndexReader().leaves();

    // 1. accumulate
    final HashTable hashTable = new HashTable();
    final Iterator<AtomicReaderContext> ctxIt = leaves.iterator();
    AtomicReaderContext ctx = null;
    FieldCache.Longs longs = null;
    Bits docsWithField = null;
    int missingCount = 0;
    for (DocIterator docsIt = docs.iterator(); docsIt.hasNext();) {
        final int doc = docsIt.nextDoc();
        // Advance to the leaf containing this doc. Leaves are only ever moved forward, so
        // this presumably relies on docs being iterated in increasing order - TODO confirm.
        if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
            do {
                ctx = ctxIt.next();
            } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
            assert doc >= ctx.docBase;
            // Load the leaf's values and expose them uniformly as longs.
            switch (numericType) {
            case LONG:
                longs = FieldCache.DEFAULT.getLongs(ctx.reader(), fieldName, true);
                break;
            case INT:
                final FieldCache.Ints ints = FieldCache.DEFAULT.getInts(ctx.reader(), fieldName, true);
                longs = new FieldCache.Longs() {
                    @Override
                    public long get(int docID) {
                        return ints.get(docID);
                    }
                };
                break;
            case FLOAT:
                final FieldCache.Floats floats = FieldCache.DEFAULT.getFloats(ctx.reader(), fieldName, true);
                longs = new FieldCache.Longs() {
                    @Override
                    public long get(int docID) {
                        // floats are converted to their sortable int representation
                        return NumericUtils.floatToSortableInt(floats.get(docID));
                    }
                };
                break;
            case DOUBLE:
                final FieldCache.Doubles doubles = FieldCache.DEFAULT.getDoubles(ctx.reader(), fieldName, true);
                longs = new FieldCache.Longs() {
                    @Override
                    public long get(int docID) {
                        // doubles are converted to their sortable long representation
                        return NumericUtils.doubleToSortableLong(doubles.get(docID));
                    }
                };
                break;
            default:
                throw new AssertionError();
            }
            docsWithField = FieldCache.DEFAULT.getDocsWithField(ctx.reader(), fieldName);
        }
        long v = longs.get(doc - ctx.docBase);
        // docsWithField is consulted only when the value is 0; presumably 0 is what the
        // cache reports for documents without a value - TODO confirm.
        if (v != 0 || docsWithField.get(doc - ctx.docBase)) {
            hashTable.add(doc, v, 1);
        } else {
            ++missingCount;
        }
    }

    // 2. select top-k facet values
    final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size);
    final PriorityQueue<Entry> pq;
    if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
        // Count order: an entry is "less" when its count is lower, ties broken by larger bits.
        pq = new PriorityQueue<Entry>(pqSize) {
            @Override
            protected boolean lessThan(Entry a, Entry b) {
                if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) {
                    return true;
                } else {
                    return false;
                }
            }
        };
    } else {
        // Value order: an entry is "less" when its bits are larger.
        pq = new PriorityQueue<Entry>(pqSize) {
            @Override
            protected boolean lessThan(Entry a, Entry b) {
                return a.bits > b.bits;
            }
        };
    }
    // Whatever insertWithOverflow hands back is reused as the next scratch entry; a new
    // Entry is allocated only when it handed back null.
    Entry e = null;
    for (int i = 0; i < hashTable.bits.length; ++i) {
        if (hashTable.counts[i] >= mincount) {
            if (e == null) {
                e = new Entry();
            }
            e.bits = hashTable.bits[i];
            e.count = hashTable.counts[i];
            e.docID = hashTable.docIDs[i];
            e = pq.insertWithOverflow(e);
        }
    }

    // 3. build the NamedList
    final ValueSource vs = ft.getValueSource(sf, null);
    final NamedList<Integer> result = new NamedList<Integer>();

    // This stuff is complicated because if facet.mincount=0, the counts needs
    // to be merged with terms from the terms dict
    if (!zeros || FacetParams.FACET_SORT_COUNT.equals(sort)
            || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
        // Only keep items we're interested in
        // (pop until just 'offset' entries are left in the queue; addFirst reverses the pop order)
        final Deque<Entry> counts = new ArrayDeque<Entry>();
        while (pq.size() > offset) {
            counts.addFirst(pq.pop());
        }

        // Entries from the PQ first, then using the terms dictionary
        for (Entry entry : counts) {
            // The stored doc id is used to look up the readable string form of the value.
            final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
            final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
            result.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
        }

        if (zeros && (limit < 0 || result.size() < limit)) { // need to merge with the term dict
            if (!sf.indexed()) {
                throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field "
                        + sf.getName() + " which is not indexed");
            }
            // Add zeros until there are limit results
            // Values that already have a count (entries still in the queue, i.e. skipped by
            // offset, plus those already emitted) must not be added again with a zero count.
            final Set<String> alreadySeen = new HashSet<String>();
            while (pq.size() > 0) {
                Entry entry = pq.pop();
                final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
                final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
                alreadySeen.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase));
            }
            for (int i = 0; i < result.size(); ++i) {
                alreadySeen.add(result.getName(i));
            }
            final Terms terms = searcher.getAtomicReader().terms(fieldName);
            if (terms != null) {
                // Restrict the scan to terms starting with the field type's main-value prefix
                // (an empty prefix matches every term).
                final String prefixStr = TrieField.getMainValuePrefix(ft);
                final BytesRef prefix;
                if (prefixStr != null) {
                    prefix = new BytesRef(prefixStr);
                } else {
                    prefix = new BytesRef();
                }
                final TermsEnum termsEnum = terms.iterator(null);
                BytesRef term;
                switch (termsEnum.seekCeil(prefix)) {
                case FOUND:
                case NOT_FOUND:
                    term = termsEnum.term();
                    break;
                case END:
                    term = null;
                    break;
                default:
                    throw new AssertionError();
                }
                final CharsRef spare = new CharsRef();
                // Skip unseen terms until 'offset' is reached; the counter starts at the number
                // of values that were counted.
                for (int skipped = hashTable.size; skipped < offset && term != null
                        && StringHelper.startsWith(term, prefix);) {
                    ft.indexedToReadable(term, spare);
                    final String termStr = spare.toString();
                    if (!alreadySeen.contains(termStr)) {
                        ++skipped;
                    }
                    term = termsEnum.next();
                }
                // Append remaining unseen terms with a count of 0 until the limit is reached.
                for (; term != null && StringHelper.startsWith(term, prefix)
                        && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
                    ft.indexedToReadable(term, spare);
                    final String termStr = spare.toString();
                    if (!alreadySeen.contains(termStr)) {
                        result.add(termStr, 0);
                    }
                }
            }
        }
    } else {
        // sort=index, mincount=0 and we have less than limit items
        // => Merge the PQ and the terms dictionary on the fly
        if (!sf.indexed()) {
            throw new IllegalStateException("Cannot use " + FacetParams.FACET_SORT + "="
                    + FacetParams.FACET_SORT_INDEX + " on a field which is not indexed");
        }
        // Readable value -> count for everything that made it into the queue.
        final Map<String, Integer> counts = new HashMap<String, Integer>();
        while (pq.size() > 0) {
            final Entry entry = pq.pop();
            final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
            final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
            counts.put(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
        }
        final Terms terms = searcher.getAtomicReader().terms(fieldName);
        if (terms != null) {
            final String prefixStr = TrieField.getMainValuePrefix(ft);
            final BytesRef prefix;
            if (prefixStr != null) {
                prefix = new BytesRef(prefixStr);
            } else {
                prefix = new BytesRef();
            }
            final TermsEnum termsEnum = terms.iterator(null);
            BytesRef term;
            switch (termsEnum.seekCeil(prefix)) {
            case FOUND:
            case NOT_FOUND:
                term = termsEnum.term();
                break;
            case END:
                term = null;
                break;
            default:
                throw new AssertionError();
            }
            final CharsRef spare = new CharsRef();
            // Skip the first 'offset' terms of the dictionary.
            for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) {
                term = termsEnum.next();
            }
            // Walk the terms in dictionary order, reporting the accumulated count or 0.
            for (; term != null && StringHelper.startsWith(term, prefix)
                    && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
                ft.indexedToReadable(term, spare);
                final String termStr = spare.toString();
                Integer count = counts.get(termStr);
                if (count == null) {
                    count = 0;
                }
                result.add(termStr, count);
            }
        }
    }

    if (missing) {
        // The entry for documents without a value is added under a null name.
        result.add(null, missingCount);
    }
    return result;
}

From source file:org.apache.solr.request.SimpleFacets.java

License:Apache License

/**
 * Returns a list of terms in the specified field along with the
 * corresponding count of documents in the set that match that constraint.
 *
 * <p>The terms of the field are enumerated in index order. For each term whose document
 * frequency is at least the {@code FacetParams.FACET_ENUM_CACHE_MINDF} setting, the count is
 * obtained through {@code searcher.numDocs(docs, deState)} (the filter-cache path); for rarer
 * terms the postings are iterated and checked against {@code docs} directly.
 *
 * @param searcher searcher providing the schema and the index reader
 * @param docs     the base document set to intersect each term with
 * @param field    name of the field whose terms are enumerated
 * @param offset   number of leading result entries to skip
 * @param limit    maximum number of entries to return; a negative value means no limit
 * @param mincount minimum count for a term to be included
 * @param missing  whether to append an entry with a {@code null} name holding the result of
 *                 {@code getFieldMissingCount}
 * @param sort     {@code "count"} or {@code "true"} to order by count; anything else keeps
 *                 index order
 * @param prefix   if non-null, only terms starting with this prefix (after conversion with
 *                 {@code FieldType.toInternal}) are considered
 * @return term (readable form) to count pairs
 * @throws IOException if reading from the index fails
 *
 * @see FacetParams#FACET_LIMIT
 * @see FacetParams#FACET_ZEROS
 * @see FacetParams#FACET_MISSING
 */
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field,
        int offset, int limit, int mincount, boolean missing, String sort, String prefix) throws IOException {

    /* :TODO: potential optimization...
    * cache the Terms with the highest docFreq and try them first
    * don't enum if we get our max from them
    */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = params.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
        SortedIntDocSet sset = (SortedIntDocSet) docs;
        fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }

    IndexSchema schema = searcher.getSchema();
    AtomicReader r = searcher.getAtomicReader();
    FieldType ft = schema.getFieldType(field);

    boolean sortByCount = sort.equals("count") || sort.equals("true");
    // When sorting by count, the queue only has to hold the entries up to offset + limit.
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount
            ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize)
            : null;
    final NamedList<Integer> res = new NamedList<Integer>();

    int min = mincount - 1; // the smallest value in the top 'N' values    
    int off = offset; // remaining entries to skip
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE; // remaining entries to emit

    BytesRef startTermBytes = null;
    if (prefix != null) {
        String indexedPrefix = ft.toInternal(prefix);
        startTermBytes = new BytesRef(indexedPrefix);
    }

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
        termsEnum = terms.iterator(null);

        // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
        // facet.offset when sorting by index order.

        if (startTermBytes != null) {
            if (termsEnum.seekCeil(startTermBytes) == TermsEnum.SeekStatus.END) {
                // No term at or after the prefix; term stays null so the loop below is skipped.
                termsEnum = null;
            } else {
                term = termsEnum.term();
            }
        } else {
            // position termsEnum on first term
            term = termsEnum.next();
        }
    }

    DocsEnum docsEnum = null;
    CharsRef charsRef = new CharsRef(10);

    // If the base set has fewer documents than mincount, no term can reach mincount.
    if (docs.size() >= mincount) {
        while (term != null) {

            // Terms are visited in order, so the first non-matching term ends the prefix range.
            if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes))
                break;

            int df = termsEnum.docFreq();

            // If we are sorting, we can use df>min (rather than >=) since we
            // are going in index order.  For certain term distributions this can
            // make a large difference (for example, many terms with df=1).
            if (df > 0 && df > min) {
                int c;

                if (df >= minDfFilterCache) {
                    // use the filter cache

                    // The shared enum state is created lazily on first use.
                    if (deState == null) {
                        deState = new SolrIndexSearcher.DocsEnumState();
                        deState.fieldName = field;
                        deState.liveDocs = r.getLiveDocs();
                        deState.termsEnum = termsEnum;
                        deState.docsEnum = docsEnum;
                    }

                    c = searcher.numDocs(docs, deState);

                    // Pick up the enum left in the state so later terms can pass it on for reuse.
                    docsEnum = deState.docsEnum;
                } else {
                    // iterate over TermDocs to calculate the intersection

                    // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it matter for this?
                    // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
                    // TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
                    docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
                    c = 0;

                    if (docsEnum instanceof MultiDocsEnum) {
                        // Walk each sub-enum directly, adding its slice start to the doc ids
                        // before testing them against the base set.
                        MultiDocsEnum.EnumWithSlice[] subs = ((MultiDocsEnum) docsEnum).getSubs();
                        int numSubs = ((MultiDocsEnum) docsEnum).getNumSubs();
                        for (int subindex = 0; subindex < numSubs; subindex++) {
                            MultiDocsEnum.EnumWithSlice sub = subs[subindex];
                            if (sub.docsEnum == null)
                                continue;
                            int base = sub.slice.start;
                            int docid;
                            while ((docid = sub.docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid + base))
                                    c++;
                            }
                        }
                    } else {
                        int docid;
                        while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            if (fastForRandomSet.exists(docid))
                                c++;
                        }
                    }

                }

                if (sortByCount) {
                    if (c > min) {
                        // Copy the term bytes; the enum hands out a reference that changes
                        // when it advances (presumably reused internally - confirm).
                        BytesRef termCopy = BytesRef.deepCopyOf(term);
                        queue.add(new CountPair<BytesRef, Integer>(termCopy, c));
                        // Once the queue is full, raise the threshold to the count of its
                        // last element so terms that cannot enter the queue are skipped.
                        if (queue.size() >= maxsize)
                            min = queue.last().val;
                    }
                } else {
                    // Index order: skip the first 'offset' qualifying terms, then emit up to
                    // 'limit' terms and stop.
                    if (c >= mincount && --off < 0) {
                        if (--lim < 0)
                            break;
                        ft.indexedToReadable(term, charsRef);
                        res.add(charsRef.toString(), c);
                    }
                }
            }

            term = termsEnum.next();
        }
    }

    if (sortByCount) {
        // Emit the queued entries in the queue's iteration order, applying offset and limit.
        for (CountPair<BytesRef, Integer> p : queue) {
            if (--off >= 0)
                continue;
            if (--lim < 0)
                break;
            ft.indexedToReadable(p.key, charsRef);
            res.add(charsRef.toString(), p.val);
        }
    }

    if (missing) {
        // The missing-value count is added under a null name.
        res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    return res;
}

From source file:org.apache.solr.search.facet.FacetFieldProcessorByEnumTermsStream.java

License:Apache License

/**
 * Advances the terms enumeration to the next term that qualifies as a facet bucket and
 * returns that bucket, or {@code null} once iteration is finished.
 *
 * <p>A term is skipped when its document frequency, or its count within {@code docs}, is
 * below {@code effectiveMincount}, or while {@code bucketsToSkip} is still positive.
 * Iteration ends when {@code term} becomes {@code null}, when a term does not start with
 * {@code startTermBytes}, or when more than {@code freq.limit} buckets would be returned.
 *
 * @return a map holding the bucket value under {@code "val"} plus whatever
 *         {@code addStats} and {@code processSubs} add to it, or {@code null} at the end
 * @throws IOException propagated from the terms/postings enumeration and searcher calls
 */
private SimpleOrderedMap<Object> _nextBucket() throws IOException {
    DocSet termSet = null;

    try {
        while (term != null) {

            // Stop at the first term that does not start with the requested prefix.
            if (startTermBytes != null && !StringHelper.startsWith(term, startTermBytes)) {
                break;
            }

            // Cheap pre-filter: skip the term before doing any intersection work.
            int df = termsEnum.docFreq();
            if (df < effectiveMincount) {
                term = termsEnum.next();
                continue;
            }

            // Drop the doc set left over from a previous loop iteration.
            if (termSet != null) {
                // termSet.decref(); // OFF-HEAP
                termSet = null;
            }

            // Number of documents in "docs" that contain the current term.
            int c = 0;

            if (hasSubFacets || df >= minDfFilterCache) {
                // use the filter cache

                // The shared enumeration state is created lazily, on first use.
                if (deState == null) {
                    deState = new SolrIndexSearcher.DocsEnumState();
                    deState.fieldName = sf.getName();
                    deState.liveDocs = fcontext.searcher.getSlowAtomicReader().getLiveDocs();
                    deState.termsEnum = termsEnum;
                    deState.postingsEnum = postingsEnum;
                    deState.minSetSizeCached = minDfFilterCache;
                }

                if (hasSubFacets || !countOnly) {
                    // The actual doc set is needed (for sub-facets and/or for collect below).
                    DocSet termsAll = fcontext.searcher.getDocSet(deState);
                    termSet = docs.intersection(termsAll);
                    // termsAll.decref(); // OFF-HEAP
                    c = termSet.size();
                } else {
                    // Only the count is needed.
                    c = fcontext.searcher.numDocs(docs, deState);
                }
                // Read the postings enum back from deState
                // (presumably the searcher call may have replaced it - TODO confirm).
                postingsEnum = deState.postingsEnum;

                resetStats();

                // !countOnly implies the branch above that assigned termSet was taken.
                if (!countOnly) {
                    collect(termSet, 0);
                }

            } else {
                // We don't need the docset here (meaning no sub-facets).
                // if countOnly, then we are calculating some other stats...
                resetStats();

                // lazy convert to fastForRandomSet
                if (fastForRandomSet == null) {
                    fastForRandomSet = docs;
                    if (docs instanceof SortedIntDocSet) { // OFF-HEAP todo: also check for native version
                        SortedIntDocSet sset = (SortedIntDocSet) docs;
                        fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
                    }
                }
                // iterate over TermDocs to calculate the intersection
                postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);

                if (postingsEnum instanceof MultiPostingsEnum) {
                    // Walk each sub-enum separately; its doc ids are offset by the slice
                    // start before being looked up in fastForRandomSet.
                    MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
                    int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                    for (int subindex = 0; subindex < numSubs; subindex++) {
                        MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                        if (sub.postingsEnum == null)
                            continue;
                        int base = sub.slice.start;
                        int docid;

                        if (countOnly) {
                            while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid + base))
                                    c++;
                            }
                        } else {
                            // collect() receives the un-offset doc id, after switching to
                            // the leaf that this slice belongs to.
                            setNextReader(leaves[sub.slice.readerIndex]);
                            while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                                if (fastForRandomSet.exists(docid + base)) {
                                    c++;
                                    collect(docid, 0);
                                }
                            }
                        }

                    }
                } else {
                    // Single postings enum: doc ids are used as-is.
                    int docid;
                    if (countOnly) {
                        while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            if (fastForRandomSet.exists(docid))
                                c++;
                        }
                    } else {
                        setNextReader(leaves[0]);
                        while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                            if (fastForRandomSet.exists(docid)) {
                                c++;
                                collect(docid, 0);
                            }
                        }
                    }
                }

            }

            // The real count may still be below the minimum even though df was not.
            if (c < effectiveMincount) {
                term = termsEnum.next();
                continue;
            }

            // handle offset and limit
            if (bucketsToSkip > 0) {
                bucketsToSkip--;
                term = termsEnum.next();
                continue;
            }

            // A negative freq.limit disables the limit check.
            if (freq.limit >= 0 && ++bucketsReturned > freq.limit) {
                return null;
            }

            // set count in case other stats depend on it
            // (0 is the second argument of every collect/addStats call here as well;
            // presumably the accumulator slot - confirm against those methods)
            countAcc.incrementCount(0, c);

            // OK, we have a good bucket to return... first get bucket value before moving to next term
            Object bucketVal = sf.getType().toObject(sf, term);
            TermQuery bucketQuery = hasSubFacets ? new TermQuery(new Term(freq.field, term)) : null;
            term = termsEnum.next();

            SimpleOrderedMap<Object> bucket = new SimpleOrderedMap<>();
            bucket.add("val", bucketVal);
            addStats(bucket, 0);
            if (hasSubFacets) {
                processSubs(bucket, bucketQuery, termSet);
            }

            // TODO... termSet needs to stick around for streaming sub-facets?

            return bucket;

        }

    } finally {
        // Only clears the local reference; the release call is commented out.
        if (termSet != null) {
            // termSet.decref();  // OFF-HEAP
            termSet = null;
        }
    }

    // end of the iteration
    return null;
}

From source file:org.apache.solr.uninverting.DocTermOrds.java

License:Apache License

/**
 * Un-inverts {@code field} of the given reader into per-document lists of term numbers.
 * Call this only once (if you subclass!).
 *
 * <p>Phase one walks the terms (restricted to those starting with {@code termPrefix} when it
 * is non-null) and, for every term whose docFreq does not exceed {@code maxTermDocFreq},
 * appends the delta-encoded term number to each posting document's list. A short list is
 * packed directly into that document's {@code index} int; a longer one moves into a
 * per-document {@code byte[]} and the int becomes {@code (endOffset << 8) | 1}. Phase two
 * copies those per-document arrays into the shared {@code tnums} arrays and rewrites the
 * pointers to the start offsets.
 *
 * <p>If the field has no terms, or no term at or after the seek start exists, the method
 * returns early without allocating any per-document structures.
 *
 * @param reader     reader whose terms and postings for {@code field} are walked
 * @param liveDocs   not read by this method body
 * @param termPrefix only terms starting with this prefix are visited; {@code null} visits all
 * @throws IOException           propagated from the terms/postings enumeration
 * @throws IllegalStateException if {@code checkForDocValues} is set and the field has doc
 *                               values, or if a shared array offset needs more than 24 bits
 */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException(
                "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final Terms terms = reader.terms(field);
    if (terms == null) {
        // No terms
        return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
        // No terms match
        return;
    }

    // Allocate the maxDoc-sized working arrays only now that both early exits are behind us,
    // so a field without (matching) terms does not pay for them.
    final int maxDoc = reader.maxDoc();
    final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (;;) {
        final BytesRef t = te.term();
        if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
            break;
        }

        visitTerm(te, termNum);

        if ((termNum & indexIntervalMask) == 0) {
            // Index this term
            sizeOfIndexedStrings += t.length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.add(indexedTerm);
        }

        final int df = te.docFreq();
        if (df <= maxTermDocFreq) {

            postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

            // number of postings actually visited for this term
            int actualDF = 0;

            for (;;) {
                int doc = postingsEnum.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }

                actualDF++;
                termInstances++;

                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1) {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = val >>> 8;
                    int ilen = vIntSize(delta);
                    byte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.length) {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
                        byte[] newarr = new byte[newLen];
                        System.arraycopy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = writeInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                } else {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0) {
                        ipos = 0;
                    } else if ((val & 0x0000ff80) == 0) {
                        ipos = 1;
                    } else if ((val & 0x00ff8000) == 0) {
                        ipos = 2;
                    } else if ((val & 0xff800000) == 0) {
                        ipos = 3;
                    } else {
                        ipos = 4;
                    }

                    // Encode into the scratch array first so we can see whether the
                    // extended list still fits into the four bytes of the int.
                    int endPos = writeInt(delta, tempArr, ipos);
                    if (endPos <= 4) {
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++) {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    } else {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++) {
                            tempArr[j] = (byte) val;
                            val >>>= 8;
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        // the scratch array becomes this doc's list; start a fresh scratch array
                        bytes[doc] = tempArr;
                        tempArr = new byte[12];
                    }
                }
            }
            setActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.next() == null) {
            break;
        }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
        // we didn't invert anything
        // lower memory consumption.
        tnums = null;
    } else {

        this.index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //

        for (int pass = 0; pass < 256; pass++) {
            byte[] target = tnums[pass];
            int pos = 0; // end in target;
            if (target != null) {
                pos = target.length;
            } else {
                target = new byte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
                int lim = Math.min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++) {
                    int val = index[doc];
                    if ((val & 0xff) == 1) {
                        int len = val >>> 8;
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0) {
                            // we only have 24 bits for the array index
                            throw new IllegalStateException(
                                    "Too many values for UnInvertedField faceting on field " + field);
                        }
                        byte[] arr = bytes[doc];
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.length <= pos + len) {
                            // No overflow guard on newlen: the 24-bit check on pos above
                            // triggers long before the array length could overflow.
                            int newlen = target.length;
                            while (newlen <= pos + len)
                                newlen <<= 1; // doubling strategy
                            byte[] newtarget = new byte[newlen];
                            System.arraycopy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        System.arraycopy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.length) {
                byte[] newtarget = new byte[pos];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            tnums[pass] = target;

            if ((pass << 16) > maxDoc)
                break;
        }

    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}

From source file:org.apache.solr.uninverting.TestDocTermOrds.java

License:Apache License

/**
 * Indexes random documents whose terms each begin with one of a few random prefixes, then,
 * for every prefix, checks the prefix-restricted un-inversion against the expected ords on
 * each leaf reader and on the top-level (slow composite) reader.
 *
 * @throws Exception if indexing, reading or verification fails
 */
public void testRandomWithPrefix() throws Exception {
    Directory dir = newDirectory();

    final Set<String> prefixes = new HashSet<>();
    final int numPrefix = TestUtil.nextInt(random(), 2, 7);
    if (VERBOSE) {
        System.out.println("TEST: use " + numPrefix + " prefixes");
    }
    while (prefixes.size() < numPrefix) {
        prefixes.add(TestUtil.randomRealisticUnicodeString(random()));
    }
    final String[] prefixesArray = prefixes.toArray(new String[prefixes.size()]);

    // Build the term dictionary: random prefix + random suffix, non-empty, de-duplicated.
    final int NUM_TERMS = atLeast(20);
    final Set<BytesRef> terms = new HashSet<>();
    while (terms.size() < NUM_TERMS) {
        final String s = prefixesArray[random().nextInt(prefixesArray.length)]
                + TestUtil.randomRealisticUnicodeString(random());
        if (s.length() > 0) {
            terms.add(new BytesRef(s));
        }
    }
    // Sorted so that an array position doubles as the term's expected ord.
    final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
    Arrays.sort(termsArray);

    final int NUM_DOCS = atLeast(100);

    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));

    // Sometimes swap in codec that impls ord():
    if (random().nextInt(10) == 7) {
        Codec codec = TestUtil.alwaysPostingsFormat(TestUtil.getPostingsFormatWithOrds(random()));
        conf.setCodec(codec);
    }

    final RandomIndexWriter w = new RandomIndexWriter(random(), dir, conf);

    // idToOrds[id] holds the sorted ords of the terms that document "id" was indexed with.
    final int[][] idToOrds = new int[NUM_DOCS][];
    final Set<Integer> ordsForDocSet = new HashSet<>();

    for (int id = 0; id < NUM_DOCS; id++) {
        Document doc = new Document();

        doc.add(new LegacyIntField("id", id, Field.Store.YES));

        final int termCount = TestUtil.nextInt(random(), 0, 20 * RANDOM_MULTIPLIER);
        while (ordsForDocSet.size() < termCount) {
            ordsForDocSet.add(random().nextInt(termsArray.length));
        }
        final int[] ordsForDoc = new int[termCount];
        int upto = 0;
        if (VERBOSE) {
            System.out.println("TEST: doc id=" + id);
        }
        for (int ord : ordsForDocSet) {
            ordsForDoc[upto++] = ord;
            Field field = newStringField("field", termsArray[ord].utf8ToString(), Field.Store.NO);
            if (VERBOSE) {
                System.out.println("  f=" + termsArray[ord].utf8ToString());
            }
            doc.add(field);
        }
        ordsForDocSet.clear();
        Arrays.sort(ordsForDoc);
        idToOrds[id] = ordsForDoc;
        w.addDocument(doc);
    }

    final DirectoryReader r = w.getReader();
    w.close();

    if (VERBOSE) {
        System.out.println("TEST: reader=" + r);
    }

    LeafReader slowR = SlowCompositeReaderWrapper.wrap(r);
    TestUtil.checkReader(slowR);
    for (String prefix : prefixesArray) {

        final BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix);

        // Expected ords per document once only terms starting with this prefix are kept.
        final int[][] idToOrdsPrefix = new int[NUM_DOCS][];
        for (int id = 0; id < NUM_DOCS; id++) {
            final int[] docOrds = idToOrds[id];
            final List<Integer> newOrds = new ArrayList<>();
            for (int ord : docOrds) {
                if (StringHelper.startsWith(termsArray[ord], prefixRef)) {
                    newOrds.add(ord);
                }
            }
            final int[] newOrdsArray = new int[newOrds.size()];
            int upto = 0;
            for (int ord : newOrds) {
                newOrdsArray[upto++] = ord;
            }
            idToOrdsPrefix[id] = newOrdsArray;
        }

        for (LeafReaderContext ctx : r.leaves()) {
            if (VERBOSE) {
                System.out.println("\nTEST: sub=" + ctx.reader());
            }
            verify(ctx.reader(), idToOrdsPrefix, termsArray, prefixRef);
        }

        // Also test top-level reader: its enum does not support
        // ord, so this forces the OrdWrapper to run:
        if (VERBOSE) {
            System.out.println("TEST: top reader");
        }
        verify(slowR, idToOrdsPrefix, termsArray, prefixRef);
    }

    FieldCache.DEFAULT.purgeByCacheKey(slowR.getCoreCacheKey());

    r.close();
    dir.close();
}

From source file:org.apache.solr.uninverting.TestDocTermOrds.java

License:Apache License

private void verify(LeafReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
        throws Exception {

    final DocTermOrds dto = new DocTermOrds(r, r.getLiveDocs(), "field", prefixRef, Integer.MAX_VALUE,
            TestUtil.nextInt(random(), 2, 10));

    final NumericDocValues docIDToID = FieldCache.DEFAULT.getNumerics(r, "id", FieldCache.LEGACY_INT_PARSER);
    /*/*from  w ww.  j  a  v  a2  s.  c  o m*/
      for(int docID=0;docID<subR.maxDoc();docID++) {
      System.out.println("  docID=" + docID + " id=" + docIDToID[docID]);
      }
    */

    if (VERBOSE) {
        System.out.println("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString()));
        System.out.println("TEST: all TERMS:");
        TermsEnum allTE = MultiFields.getTerms(r, "field").iterator();
        int ord = 0;
        while (allTE.next() != null) {
            System.out.println("  ord=" + (ord++) + " term=" + allTE.term().utf8ToString());
        }
    }

    //final TermsEnum te = subR.fields().terms("field").iterator();
    final TermsEnum te = dto.getOrdTermsEnum(r);
    if (dto.numTerms() == 0) {
        if (prefixRef == null) {
            assertNull(MultiFields.getTerms(r, "field"));
        } else {
            Terms terms = MultiFields.getTerms(r, "field");
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator();
                TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef);
                if (result != TermsEnum.SeekStatus.END) {
                    assertFalse(
                            "term=" + termsEnum.term().utf8ToString() + " matches prefix="
                                    + prefixRef.utf8ToString(),
                            StringHelper.startsWith(termsEnum.term(), prefixRef));
                } else {
                    // ok
                }
            } else {
                // ok
            }
        }
        return;
    }

    if (VERBOSE) {
        System.out.println("TEST: TERMS:");
        te.seekExact(0);
        while (true) {
            System.out.println("  ord=" + te.ord() + " term=" + te.term().utf8ToString());
            if (te.next() == null) {
                break;
            }
        }
    }

    SortedSetDocValues iter = dto.iterator(r);
    for (int docID = 0; docID < r.maxDoc(); docID++) {
        assertEquals(docID, docIDToID.nextDoc());
        if (docID > iter.docID()) {
            iter.nextDoc();
        }
        if (docID < iter.docID()) {
            int[] answers = idToOrds[(int) docIDToID.longValue()];
            assertEquals(0, answers.length);
            continue;
        }

        if (VERBOSE) {
            System.out.println(
                    "TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.longValue() + ")");
        }
        final int[] answers = idToOrds[(int) docIDToID.longValue()];
        int upto = 0;
        long ord;
        while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            te.seekExact(ord);
            final BytesRef expected = termsArray[answers[upto++]];
            if (VERBOSE) {
                System.out.println("  exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString());
            }
            assertEquals("expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord="
                    + ord, expected, te.term());
        }
        assertEquals(answers.length, upto);
    }
}

From source file:org.buzzinate.lezhi.query.LezhiTermsEnum.java

License:Apache License

/**
 * Accepts a term that starts with {@code prefixRef}; any other term ends the enumeration.
 *
 * @param term the candidate term
 * @return {@code AcceptStatus.YES} when {@code term} starts with {@code prefixRef},
 *         otherwise {@code AcceptStatus.END}
 * @throws IOException declared by the signature; not thrown by this body
 */
protected AcceptStatus accept(BytesRef term) throws IOException {
    // No per-term logging here: this runs for every enumerated term.
    return StringHelper.startsWith(term, prefixRef) ? AcceptStatus.YES : AcceptStatus.END;
}