/**
 * Copyright (c) 2016, SIREn Solutions. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package solutions.siren.join.index.query;

import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.util.*;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import solutions.siren.join.action.terms.collector.*;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Set;

/**
 * Specialization for a disjunction over many terms, encoded in a byte array, which scans the dictionary
 * using a {@link TermsEnum} to collect document ids.
 * It behaves like a {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
 * {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
 */
public class TermsEnumTermsQuery extends Query implements Accountable {

  private static final long BASE_RAM_BYTES_USED =
      RamUsageEstimator.shallowSizeOfInstance(TermsEnumTermsQuery.class);

  /**
   * Reference to the encoded list of terms for late decoding.
   */
  private byte[] encodedTerms;

  /**
   * The set of terms after decoding.
   */
  private BytesRefTermsSet termsSet;

  /**
   * The field to enumerate.
   */
  protected String field;

  /**
   * The cache key for this query.
   */
  protected final long cacheKey;

  private static final ESLogger logger = Loggers.getLogger(TermsEnumTermsQuery.class);

  /**
   * Creates a new {@link TermsEnumTermsQuery} from the given field data.
   */
  public TermsEnumTermsQuery(final byte[] encodedTerms, final String field, final long cacheKey) {
    this.encodedTerms = encodedTerms;
    this.cacheKey = cacheKey;
    this.field = field;
  }

  @Override
  public long ramBytesUsed() {
    BytesRefTermsSet termsSet = this.getTermsSet();
    return BASE_RAM_BYTES_USED + termsSet.size() * 8;
  }

  @Override
  public String toString(String defaultField) {
    BytesRefTermsSet termsSet = this.getTermsSet();
    final StringBuilder sb = new StringBuilder("TermsEnumTermsQuery:");
    return sb.append(defaultField).append(":")
             // Do not serialise the full array, but instead the number of elements - see issue #168
             .append("[size=" + termsSet.size() + "]").toString();
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (cacheKey != ((TermsEnumTermsQuery) obj).cacheKey) {
      // relies on the cache key instead of the encodedTerms for equality
      return false;
    }
    if (!field.equals(((TermsEnumTermsQuery) obj).field)) {
      return false;
    }
    return true;
  }

  @Override
  public int hashCode() {
    int hashcode = super.hashCode();
    hashcode = 31 * hashcode + ((int) cacheKey); // relies on the cache key instead of the encodedTerms for hashcode
    hashcode = 31 * hashcode + field.hashCode();
    return hashcode;
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.emptyList();
  }
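  // Illustrative usage (a sketch, not part of the original source). It assumes the
  // terms have already been serialized to a byte[] by SIREn's TermsSet machinery and
  // that a cache key has been derived from them; the field name "foreign_key" and
  // the variables below are hypothetical:
  //
  //   byte[] encodedTerms = ...;   // serialized form of a BytesRefTermsSet
  //   long cacheKey = ...;         // e.g. a hash of the encoded bytes
  //   Query query = new TermsEnumTermsQuery(encodedTerms, "foreign_key", cacheKey);
  //   TopDocs hits = searcher.search(query, 10);   // every hit gets a constant score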
  /**
   * Returns the set of terms.
   * <p>
   * This method will perform a late decoding of the encoded terms, and will release the
   * byte array. This method needs to be synchronized as each segment thread will call it concurrently.
   */
  protected synchronized BytesRefTermsSet getTermsSet() {
    if (encodedTerms != null) { // late decoding of the encoded terms
      long start = System.nanoTime();
      termsSet = (BytesRefTermsSet) TermsSet.readFrom(new BytesRef(encodedTerms));
      logger.debug("{}: Deserialized {} terms - took {} ms",
          new Object[] { Thread.currentThread().getName(), termsSet.size(), (System.nanoTime() - start) / 1000000 });
      encodedTerms = null; // release reference to the byte array to be able to reclaim memory
    }
    return termsSet;
  }

  public DocIdSet getDocIdSet(LeafReaderContext context) throws IOException {
    final Terms terms = context.reader().terms(field);

    // make sure the field exists
    if (terms == null) return null;

    final BytesRefTermsSet termsSet = this.getTermsSet();

    // make sure there are terms to filter on
    if (termsSet == null || termsSet.isEmpty()) return null;

    SeekingTermSetTermsEnum termsEnum = new SeekingTermSetTermsEnum(terms.iterator(), termsSet);
    DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      builder.add(docs);
    }
    return builder.build();
  }

  @Override
  public Weight createWeight(final IndexSearcher searcher, final boolean needsScores) throws IOException {
    return new ConstantScoreWeight(new CacheKeyFieldDataTermsQuery(cacheKey)) {

      @Override
      public void extractTerms(Set<Term> terms) {
        // no-op
        // This query is for abuse cases when the number of terms is too high to
        // run efficiently as a BooleanQuery. So likewise we hide its terms in
        // order to protect highlighters.
      }

      private Scorer scorer(DocIdSet set) throws IOException {
        if (set == null) {
          return null;
        }
        final DocIdSetIterator disi = set.iterator();
        if (disi == null) {
          return null;
        }
        return new ConstantScoreScorer(this, score(), disi);
      }

      @Override
      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
        final Scorer scorer = scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
        if (scorer == null) {
          return null;
        }
        return new DefaultBulkScorer(scorer);
      }

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        return scorer(TermsEnumTermsQuery.this.getDocIdSet(context));
      }

    };
  }
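  // For intuition (a sketch, not part of the original source): as the class javadoc
  // states, this query matches the same documents as a constant-score disjunction of
  // SHOULD clauses. For a small term set, the equivalent query could be built as
  // follows (Lucene 5.x API; decodedTerms is a hypothetical iterable of BytesRef):
  //
  //   BooleanQuery.Builder builder = new BooleanQuery.Builder();
  //   for (BytesRef term : decodedTerms) {
  //     builder.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD);
  //   }
  //   Query equivalent = new ConstantScoreQuery(builder.build());
  //
  // The specialization in getDocIdSet() avoids materialising one TermQuery per term
  // and instead performs a single ordered scan of the field's term dictionary.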
  /**
   * <p>
   * This query will be returned by the {@link ConstantScoreWeight} instead of the {@link TermsEnumTermsQuery}
   * and used by the {@link LRUQueryCache.CachingWrapperWeight} to cache the query.
   * This is necessary in order to avoid caching the byte array and long hash set, which is not memory friendly
   * and not very efficient.
   * </p>
   * <p>
   * Extends {@link MultiTermQuery} in order to be detected as a "costly" query by
   * {@link UsageTrackingQueryCachingPolicy} and trigger early caching.
   * </p>
   */
  private static class CacheKeyFieldDataTermsQuery extends MultiTermQuery {

    private final long cacheKey;

    public CacheKeyFieldDataTermsQuery(long cacheKey) {
      super("");
      this.cacheKey = cacheKey;
    }

    @Override
    public String toString(String field) {
      final StringBuilder sb = new StringBuilder("CacheKeyFieldDataTermsQuery:");
      return sb.append(field).append(":").append("[cacheKey=" + cacheKey + "]").toString();
    }

    @Override
    public boolean equals(Object o) {
      if (!(o instanceof CacheKeyFieldDataTermsQuery)) return false;
      CacheKeyFieldDataTermsQuery other = (CacheKeyFieldDataTermsQuery) o;
      return super.equals(o) && this.cacheKey == other.cacheKey;
    }

    @Override
    protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
      return null;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + ((int) cacheKey);
      return result;
    }

  }

  static class SeekingTermSetTermsEnum extends FilteredTermsEnum {

    private final BytesRefHash terms;
    private final int[] ords;
    private final int lastElement;
    private final BytesRef lastTerm;
    private final BytesRef spare = new BytesRef();

    private BytesRef seekTerm;
    private int upto = 0;

    SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefTermsSet termsSet) {
      super(tenum);
      this.terms = termsSet.getBytesRefHash();
      this.ords = this.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
      lastElement = terms.size() - 1;
      lastTerm = terms.get(ords[lastElement], new BytesRef());
      seekTerm = terms.get(ords[upto], spare);
    }

    @Override
    protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
      BytesRef temp = seekTerm;
      seekTerm = null;
      return temp;
    }

    @Override
    protected AcceptStatus accept(BytesRef term) throws IOException {
      if (term.compareTo(lastTerm) > 0) {
        return AcceptStatus.END;
      }

      BytesRef currentTerm = terms.get(ords[upto], spare);
      if (term.compareTo(currentTerm) == 0) {
        if (upto == lastElement) {
          return AcceptStatus.YES;
        } else {
          seekTerm = terms.get(ords[++upto], spare);
          return AcceptStatus.YES_AND_SEEK;
        }
      } else {
        if (upto == lastElement) {
          return AcceptStatus.NO;
        } else {
          // Our current term doesn't match the given term.
          int cmp;
          do {
            // We may be behind the given term by more than one step. Keep incrementing until we're the same or higher.
            if (upto == lastElement) {
              return AcceptStatus.NO;
            }
            // typically the terms dict is a superset of the query's terms, so it's unusual that we have to skip many
            // of our terms; hence we don't do a binary search here
            seekTerm = terms.get(ords[++upto], spare);
          } while ((cmp = seekTerm.compareTo(term)) < 0);
          if (cmp == 0) {
            if (upto == lastElement) {
              return AcceptStatus.YES;
            }
            seekTerm = terms.get(ords[++upto], spare);
            return AcceptStatus.YES_AND_SEEK;
          } else {
            return AcceptStatus.NO_AND_SEEK;
          }
        }
      }
    }

  }

}
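// Worked example of the SeekingTermSetTermsEnum leapfrog above (illustrative only).
// Suppose the sorted query terms are { "b", "d", "f" } and the segment's term
// dictionary for the field contains { "a", "b", "c", "d", "e", "f" }:
//
//   1. nextSeekTerm(null) returns "b", so the wrapped enum seeks directly to "b";
//      accept("b") matches the current query term and returns YES_AND_SEEK,
//      advancing the next seek target to "d".
//   2. The enum seeks to "d"; accept("d") returns YES_AND_SEEK with "f" as the
//      next target.
//   3. The enum seeks to "f"; accept("f") matches the last query term and returns YES.
//   4. Any dictionary term greater than "f" would return END and stop the scan.
//
// Terms "a", "c" and "e" are never visited: each YES_AND_SEEK makes the underlying
// TermsEnum jump straight to the next candidate instead of stepping term by term.
// If a query term is absent from the dictionary (e.g. a seek to "d" lands on "e"),
// accept() skips ahead through the query terms and returns NO_AND_SEEK.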