Usage examples for the org.apache.lucene.index TermsEnum#termState() method:
public abstract TermState termState() throws IOException;
From source file: com.sindicetech.siren.search.node.TopNodeTermsRewrite.java
License: Open Source License
/**
 * Rewrites a {@link MultiNodeTermQuery} into a top-level query containing at most
 * {@code min(size, getMaxSize())} of the highest-boosted terms.
 *
 * <p>Terms are gathered across all segment {@link TermsEnum}s via
 * {@code collectTerms}; a bounded {@link PriorityQueue} keeps only the
 * most competitive terms, and each kept term accumulates per-segment
 * statistics in its {@code TermContext} so the final clauses carry exact
 * docFreq/totalTermFreq.
 *
 * @param reader the top-level index reader
 * @param query  the multi-term query to rewrite
 * @return the rewritten top-level query with one clause per surviving term
 * @throws IOException on low-level index errors
 */
@Override public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final int maxSize = Math.min(size, this.getMaxSize());
    // Bounded min-queue: the head is the least competitive term collected so far.
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // Terms seen in ANY segment so far; lets later segments update the same entry.
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        // Reusable "next candidate" entry; swapped out once it enters the queue.
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            assert this.compareToLastTerm(null); // reset per-segment order check

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert: tracks the previously collected term of the current segment
        private BytesRef lastTerm;

        // Assert-only helper: verifies terms arrive in strictly increasing order
        // within a segment. Always returns true so it can live inside an assert.
        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();
            // make sure within a single seg we always collect
            // terms in order
            assert this.compareToLastTerm(bytes);

            // ignore uncompetitive hits: when the queue is full, a term must beat
            // the current head on boost (ties broken by term order) to be kept
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue; the evicted ScoreTerm is
                // recycled as the next candidate entry
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }

            return true;
        }
    });

    final Q q = this.getTopLevelQuery(query);
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    // Emit clauses in term order (not queue order) for deterministic queries.
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);

    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}
From source file: de.unihildesheim.iw.lucene.query.RelaxableCommonTermsQuery.java
License: Open Source License
/** * New instance using settings from the supplied {@link Builder} instance. * * @param builder {@link Builder} Instance builder * @throws IOException Thrown on low-level i/o-errors *//*from ww w .ja v a2 s. c om*/ @SuppressWarnings({ "ObjectAllocationInLoop", "ObjectEquality" }) RelaxableCommonTermsQuery(@NotNull final Builder builder) throws IOException { // get all query terms assert builder.queryStr != null; assert builder.analyzer != null; this.queryTerms = QueryUtils.tokenizeQueryString(builder.queryStr, builder.analyzer); // list of unique terms contained in the query (stopped, analyzed) final String[] uniqueQueryTerms = this.queryTerms.stream().distinct().toArray(String[]::new); final int uniqueTermsCount = uniqueQueryTerms.length; // heavily based on code from org.apache.lucene.queries.CommonTermsQuery assert builder.reader != null; final List<LeafReaderContext> leaves = builder.reader.leaves(); final int maxDoc = builder.reader.maxDoc(); TermsEnum termsEnum = null; final List<Query> subQueries = new ArrayList<>(10); assert builder.fields != null; for (final String field : builder.fields) { final TermContext[] tcArray = new TermContext[uniqueTermsCount]; final BooleanQuery lowFreq = new BooleanQuery(); final BooleanQuery highFreq = new BooleanQuery(); // collect term statistics for (int i = 0; i < uniqueTermsCount; i++) { final Term term = new Term(field, uniqueQueryTerms[i]); for (final LeafReaderContext context : leaves) { final TermContext termContext = tcArray[i]; final Fields fields = context.reader().fields(); final Terms terms = fields.terms(field); if (terms != null) { // only, if field exists termsEnum = terms.iterator(termsEnum); if (termsEnum != TermsEnum.EMPTY) { if (termsEnum.seekExact(term.bytes())) { if (termContext == null) { tcArray[i] = new TermContext(builder.reader.getContext(), termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else { termContext.register(termsEnum.termState(), context.ord, 
termsEnum.docFreq(), termsEnum.totalTermFreq()); } } } } } // build query if (tcArray[i] == null) { lowFreq.add(new TermQuery(term), builder.lowFreqOccur); } else { if ((builder.maxTermFrequency >= 1f && (float) tcArray[i].docFreq() > builder.maxTermFrequency) || (tcArray[i].docFreq() > (int) Math .ceil((double) (builder.maxTermFrequency * (float) maxDoc)))) { highFreq.add(new TermQuery(term, tcArray[i]), builder.highFreqOccur); } else { lowFreq.add(new TermQuery(term, tcArray[i]), builder.lowFreqOccur); } } final int numLowFreqClauses = lowFreq.clauses().size(); final int numHighFreqClauses = highFreq.clauses().size(); if (builder.lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) { lowFreq.setMinimumNumberShouldMatch(numLowFreqClauses); } if (builder.highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) { highFreq.setMinimumNumberShouldMatch(numHighFreqClauses); } } if (LOG.isDebugEnabled()) { LOG.debug("qLF={}", lowFreq); LOG.debug("qHF={}", highFreq); } if (lowFreq.clauses().isEmpty()) { subQueries.add(highFreq); } else if (highFreq.clauses().isEmpty()) { subQueries.add(lowFreq); } else { final BooleanQuery query = new BooleanQuery(true); // final query query.add(highFreq, Occur.SHOULD); query.add(lowFreq, Occur.MUST); subQueries.add(query); } } if (LOG.isDebugEnabled()) { LOG.debug("qList={}", subQueries); } this.query = subQueries.size() == 1 ? subQueries.get(0) : new DisjunctionMaxQuery(subQueries, 0.1f); if (LOG.isDebugEnabled()) { LOG.debug("RCTQ {} uQt={}", this.query, uniqueQueryTerms); } }
From source file: nl.inl.blacklab.search.lucene.BLSpanTermQuery.java
License: Apache License
/** * Overridden frmo SpanTermQuery to return a BLSpans instead. *//*from w w w . j av a2 s . c o m*/ @Override public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term, TermContext> termContexts) throws IOException { TermContext termContext = termContexts.get(term); final TermState state; if (termContext == null) { // this happens with span-not query, as it doesn't include the NOT // side in extractTerms() // so we seek to the term now in this segment..., this sucks because // its ugly mostly! final Fields fields = context.reader().fields(); if (fields != null) { final Terms terms = fields.terms(term.field()); if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); if (termsEnum.seekExact(term.bytes(), true)) { state = termsEnum.termState(); } else { state = null; } } else { state = null; } } else { state = null; } } else { state = termContext.get(context.ord); } if (state == null) { // term is not present in that reader return TermSpans.EMPTY_TERM_SPANS; } final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(null); termsEnum.seekExact(term.bytes(), state); final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null, DocsAndPositionsEnum.FLAG_PAYLOADS); if (postings != null) { return new TermSpans(postings, term); } // term does exist, but has no positions throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")"); }
From source file: org.sindice.siren.search.node.TopNodeTermsRewrite.java
License: Apache License
/**
 * Rewrites a {@link MultiNodeTermQuery} into a top-level query containing at most
 * {@code min(size, getMaxSize())} of the highest-boosted terms.
 *
 * <p>Terms are gathered across all segment {@link TermsEnum}s via
 * {@code collectTerms}; a bounded {@link PriorityQueue} keeps only the
 * most competitive terms, and each kept term accumulates per-segment
 * statistics in its {@code TermContext} so the final clauses carry exact
 * docFreq/totalTermFreq.
 *
 * @param reader the top-level index reader
 * @param query  the multi-term query to rewrite
 * @return the rewritten top-level query with one clause per surviving term
 * @throws IOException on low-level index errors
 */
@Override public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final int maxSize = Math.min(size, this.getMaxSize());
    // Bounded min-queue: the head is the least competitive term collected so far.
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {
        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // Terms seen in ANY segment so far; lets later segments update the same entry.
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        // Reusable "next candidate" entry; swapped out once it enters the queue.
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            assert this.compareToLastTerm(null); // reset per-segment order check

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert: tracks the previously collected term of the current segment
        private BytesRef lastTerm;

        // Assert-only helper: verifies terms arrive in strictly increasing order
        // within a segment. Always returns true so it can live inside an assert.
        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();
            // make sure within a single seg we always collect
            // terms in order
            assert this.compareToLastTerm(bytes);

            // ignore uncompetitive hits: when the queue is full, a term must beat
            // the current head on boost (ties broken by term order) to be kept
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);
                // possibly drop entries from queue; the evicted ScoreTerm is
                // recycled as the next candidate entry
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }

            return true;
        }
    });

    final Q q = this.getTopLevelQuery();
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    // Emit clauses in term order (not queue order) for deterministic queries.
    ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp);

    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}
From source file: project.lucene.RelativeTermWeightQuery.java
License: Apache License
public void collectTermContext(IndexReader reader, List<AtomicReaderContext> leaves, TermContext[] contextArray, Term[] queryTerms) throws IOException { TermsEnum termsEnum = null; for (AtomicReaderContext context : leaves) { final Fields fields = context.reader().fields(); if (fields == null) { // reader has no fields continue; }// w w w .j a v a 2 s . c o m for (int i = 0; i < queryTerms.length; i++) { Term term = queryTerms[i]; TermContext termContext = contextArray[i]; final Terms terms = fields.terms(term.field()); if (terms == null) { // field does not exist continue; } termsEnum = terms.iterator(termsEnum); assert termsEnum != null; if (termsEnum == TermsEnum.EMPTY) continue; if (termsEnum.seekExact(term.bytes())) { if (termContext == null) { contextArray[i] = new TermContext(reader.getContext(), termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else { termContext.register(termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } } } } }