Example usage for org.apache.lucene.index TermsEnum termState

List of usage examples for org.apache.lucene.index TermsEnum termState

Introduction

On this page you can find example usage of org.apache.lucene.index TermsEnum termState.

Prototype

public abstract TermState termState() throws IOException;

Source Link

Document

Expert: Returns the TermsEnum's internal state to position the TermsEnum without re-seeking the term dictionary.

Usage

From source file:com.sindicetech.siren.search.node.TopNodeTermsRewrite.java

License:Open Source License

@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    // Gathers the highest-boost terms matched by the multi-term query into a
    // bounded priority queue (size limited by maxSize), then adds each surviving
    // term as a clause of the rewritten top-level query.
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {
        // Feedback channel that lets the TermsEnum (e.g. FuzzyTermsEnum) skip
        // terms that cannot beat the weakest entry in the queue.
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // Terms already queued, so stats from later segments are merged into the
        // same ScoreTerm instead of creating duplicates.
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        private ScoreTerm st; // spare entry, reused to limit per-term allocation

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            // assert-only: reset the in-order tracker for the new segment
            assert this.compareToLastTerm(null);

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert: last term collected in the current segment
        private BytesRef lastTerm;

        // Assert helper: checks that terms arrive in strictly increasing order
        // within a segment. Always returns true so it can sit inside an assert.
        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();

            // make sure within a single seg we always collect
            // terms in order
            assert this.compareToLastTerm(bytes);

            // ignore uncompetitive hits: once the queue is full, drop any term that
            // scores below the weakest queued term (ties broken by term order)
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    // the evicted entry becomes the reusable spare for the next term
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }

            return true;
        }
    });

    final Q q = this.getTopLevelQuery(query);
    // sort by term so clauses are added to the query in a deterministic order
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);

    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs "
                + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}

From source file:de.unihildesheim.iw.lucene.query.RelaxableCommonTermsQuery.java

License:Open Source License

/**
 * New instance using settings from the supplied {@link Builder} instance.
 *
 * @param builder {@link Builder} Instance builder
 * @throws IOException Thrown on low-level i/o-errors
 */
@SuppressWarnings({ "ObjectAllocationInLoop", "ObjectEquality" })
RelaxableCommonTermsQuery(@NotNull final Builder builder) throws IOException {
    // get all query terms (analyzed, stop-word filtered)
    assert builder.queryStr != null;
    assert builder.analyzer != null;
    this.queryTerms = QueryUtils.tokenizeQueryString(builder.queryStr, builder.analyzer);

    // list of unique terms contained in the query (stopped, analyzed)
    final String[] uniqueQueryTerms = this.queryTerms.stream().distinct().toArray(String[]::new);
    final int uniqueTermsCount = uniqueQueryTerms.length;

    // heavily based on code from org.apache.lucene.queries.CommonTermsQuery
    assert builder.reader != null;
    final List<LeafReaderContext> leaves = builder.reader.leaves();
    final int maxDoc = builder.reader.maxDoc();
    TermsEnum termsEnum = null; // reused across segments/terms to avoid re-allocation
    final List<Query> subQueries = new ArrayList<>(10);

    assert builder.fields != null;
    // Build one (low-frequency, high-frequency) clause pair per searched field.
    for (final String field : builder.fields) {
        final TermContext[] tcArray = new TermContext[uniqueTermsCount];
        final BooleanQuery lowFreq = new BooleanQuery();
        final BooleanQuery highFreq = new BooleanQuery();

        // collect term statistics
        for (int i = 0; i < uniqueTermsCount; i++) {
            final Term term = new Term(field, uniqueQueryTerms[i]);
            // accumulate this term's per-segment stats into tcArray[i]
            for (final LeafReaderContext context : leaves) {
                final TermContext termContext = tcArray[i];
                final Fields fields = context.reader().fields();
                final Terms terms = fields.terms(field);
                if (terms != null) {
                    // only, if field exists
                    termsEnum = terms.iterator(termsEnum);
                    if (termsEnum != TermsEnum.EMPTY) {
                        if (termsEnum.seekExact(term.bytes())) {
                            if (termContext == null) {
                                tcArray[i] = new TermContext(builder.reader.getContext(), termsEnum.termState(),
                                        context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                            } else {
                                termContext.register(termsEnum.termState(), context.ord, termsEnum.docFreq(),
                                        termsEnum.totalTermFreq());
                            }
                        }
                    }
                }
            }

            // build query: an unseen term goes to the low-frequency side; otherwise
            // route by docFreq against maxTermFrequency (treated as an absolute
            // count when >= 1, else as a fraction of maxDoc)
            if (tcArray[i] == null) {
                lowFreq.add(new TermQuery(term), builder.lowFreqOccur);
            } else {
                if ((builder.maxTermFrequency >= 1f && (float) tcArray[i].docFreq() > builder.maxTermFrequency)
                        || (tcArray[i].docFreq() > (int) Math
                                .ceil((double) (builder.maxTermFrequency * (float) maxDoc)))) {
                    highFreq.add(new TermQuery(term, tcArray[i]), builder.highFreqOccur);
                } else {
                    lowFreq.add(new TermQuery(term, tcArray[i]), builder.lowFreqOccur);
                }
            }

            // NOTE(review): this runs on every term iteration although only the
            // values after the last iteration matter — looks hoistable below the
            // loop; confirm before changing.
            final int numLowFreqClauses = lowFreq.clauses().size();
            final int numHighFreqClauses = highFreq.clauses().size();
            if (builder.lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
                lowFreq.setMinimumNumberShouldMatch(numLowFreqClauses);
            }
            if (builder.highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) {
                highFreq.setMinimumNumberShouldMatch(numHighFreqClauses);
            }
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("qLF={}", lowFreq);
            LOG.debug("qHF={}", highFreq);
        }

        // combine: use whichever side is non-empty on its own, else require the
        // low-frequency side and let the high-frequency side optionally match
        if (lowFreq.clauses().isEmpty()) {
            subQueries.add(highFreq);
        } else if (highFreq.clauses().isEmpty()) {
            subQueries.add(lowFreq);
        } else {
            final BooleanQuery query = new BooleanQuery(true); // final query
            query.add(highFreq, Occur.SHOULD);
            query.add(lowFreq, Occur.MUST);
            subQueries.add(query);
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("qList={}", subQueries);
    }

    // single field: use its query directly; multiple fields: disjunction-max
    this.query = subQueries.size() == 1 ? subQueries.get(0) : new DisjunctionMaxQuery(subQueries, 0.1f);

    if (LOG.isDebugEnabled()) {
        LOG.debug("RCTQ {} uQt={}", this.query, uniqueQueryTerms);
    }
}

From source file:nl.inl.blacklab.search.lucene.BLSpanTermQuery.java

License:Apache License

/**
 * Overridden from SpanTermQuery to return a BLSpans instead.
 */
@Override
public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term, TermContext> termContexts)
        throws IOException {
    // Resolve the per-segment TermState. Prefer the pre-computed TermContext;
    // when it is missing (e.g. the NOT side of a span-not query, which is not
    // included by extractTerms()), seek the term directly in this segment.
    TermState state = null;
    final TermContext termContext = termContexts.get(term);
    if (termContext != null) {
        state = termContext.get(context.ord);
    } else {
        final Fields segmentFields = context.reader().fields();
        if (segmentFields != null) {
            final Terms fieldTerms = segmentFields.terms(term.field());
            if (fieldTerms != null) {
                final TermsEnum seekEnum = fieldTerms.iterator(null);
                if (seekEnum.seekExact(term.bytes(), true)) {
                    state = seekEnum.termState();
                }
            }
        }
    }

    // term is not present in that reader
    if (state == null) {
        return TermSpans.EMPTY_TERM_SPANS;
    }

    // Re-position a fresh enum cheaply via the saved state, then pull the
    // positional postings (payloads requested).
    final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(null);
    termsEnum.seekExact(term.bytes(), state);

    final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null,
            DocsAndPositionsEnum.FLAG_PAYLOADS);

    if (postings == null) {
        // term does exist, but has no positions
        throw new IllegalStateException("field \"" + term.field()
                + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
    }
    return new TermSpans(postings, term);
}

From source file:org.sindice.siren.search.node.TopNodeTermsRewrite.java

License:Apache License

@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    // Gathers the highest-boost terms matched by the multi-term query into a
    // bounded priority queue (size limited by maxSize), then adds each surviving
    // term as a clause of the rewritten top-level query.
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {
        // Feedback channel that lets the TermsEnum (e.g. FuzzyTermsEnum) skip
        // terms that cannot beat the weakest entry in the queue.
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // Terms already queued, so stats from later segments are merged into the
        // same ScoreTerm instead of creating duplicates.
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        private ScoreTerm st; // spare entry, reused to limit per-term allocation

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            // assert-only: reset the in-order tracker for the new segment
            assert this.compareToLastTerm(null);

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert: last term collected in the current segment
        private BytesRef lastTerm;

        // Assert helper: checks that terms arrive in strictly increasing order
        // within a segment. Always returns true so it can sit inside an assert.
        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();

            // make sure within a single seg we always collect
            // terms in order
            assert this.compareToLastTerm(bytes);

            // ignore uncompetitive hits: once the queue is full, drop any term that
            // scores below the weakest queued term (ties broken by term order)
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue
                if (stQueue.size() > maxSize) {
                    // the evicted entry becomes the reusable spare for the next term
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }

            return true;
        }
    });

    final Q q = this.getTopLevelQuery();
    // sort by term so clauses are added to the query in a deterministic order
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp);

    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs "
                + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}

From source file:project.lucene.RelativeTermWeightQuery.java

License:Apache License

public void collectTermContext(IndexReader reader, List<AtomicReaderContext> leaves, TermContext[] contextArray,
        Term[] queryTerms) throws IOException {
    // For every query term, accumulate per-segment term statistics into the
    // matching slot of contextArray (creating the TermContext on first sight).
    TermsEnum reuse = null;
    for (final AtomicReaderContext leaf : leaves) {
        final Fields leafFields = leaf.reader().fields();
        if (leafFields == null) {
            // this segment has no fields at all
            continue;
        }
        for (int idx = 0; idx < queryTerms.length; idx++) {
            final Term term = queryTerms[idx];
            final Terms terms = leafFields.terms(term.field());
            if (terms == null) {
                // field does not exist in this segment
                continue;
            }
            reuse = terms.iterator(reuse);
            assert reuse != null;

            // skip empty enums and terms absent from this segment
            if (reuse == TermsEnum.EMPTY || !reuse.seekExact(term.bytes())) {
                continue;
            }
            if (contextArray[idx] == null) {
                contextArray[idx] = new TermContext(reader.getContext(), reuse.termState(), leaf.ord,
                        reuse.docFreq(), reuse.totalTermFreq());
            } else {
                contextArray[idx].register(reuse.termState(), leaf.ord, reuse.docFreq(),
                        reuse.totalTermFreq());
            }
        }
    }
}