List of usage examples for org.apache.lucene.util BytesRef deepCopyOf
public static BytesRef deepCopyOf(BytesRef other)
Parameter: other – the BytesRef to make a deep copy of; its bytes are copied into a freshly allocated array.
The returned BytesRef will have a length of other.length and an offset of zero.
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
private void assertTermsSeeking(Terms leftTerms, Terms rightTerms) throws Exception { TermsEnum leftEnum = null;// w ww.ja va 2 s .co m TermsEnum rightEnum = null; // just an upper bound int numTests = atLeast(20); Random random = random(); // collect this number of terms from the left side HashSet<BytesRef> tests = new HashSet<>(); int numPasses = 0; while (numPasses < 10 && tests.size() < numTests) { leftEnum = leftTerms.iterator(); BytesRef term = null; while ((term = leftEnum.next()) != null) { int code = random.nextInt(10); if (code == 0) { // the term tests.add(BytesRef.deepCopyOf(term)); } else if (code == 1) { // truncated subsequence of term term = BytesRef.deepCopyOf(term); if (term.length > 0) { // truncate it term.length = random.nextInt(term.length); } } else if (code == 2) { // term, but ensure a non-zero offset byte newbytes[] = new byte[term.length + 5]; System.arraycopy(term.bytes, term.offset, newbytes, 5, term.length); tests.add(new BytesRef(newbytes, 5, term.length)); } } numPasses++; } ArrayList<BytesRef> shuffledTests = new ArrayList<>(tests); Collections.shuffle(shuffledTests, random); for (BytesRef b : shuffledTests) { leftEnum = leftTerms.iterator(); rightEnum = rightTerms.iterator(); assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); assertEquals(leftEnum.seekExact(b), rightEnum.seekExact(b)); SeekStatus leftStatus; SeekStatus rightStatus; leftStatus = leftEnum.seekCeil(b); rightStatus = rightEnum.seekCeil(b); assertEquals(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(leftEnum.term(), rightEnum.term()); } leftStatus = leftEnum.seekCeil(b); rightStatus = rightEnum.seekCeil(b); assertEquals(leftStatus, rightStatus); if (leftStatus != SeekStatus.END) { assertEquals(leftEnum.term(), rightEnum.term()); } } }
From source file:com.shaie.utils.IndexUtils.java
License:Apache License
/** Prints the terms indexed under the given fields with full postings information. */ public static void printFieldTermsWithInfo(LeafReader reader, String... fields) throws IOException { for (final String field : fields) { System.out.println(format("Terms for field [%s], with positional info:", field)); final TermsEnum te = reader.terms(field).iterator(); BytesRef scratch;//from w w w.j av a 2 s. co m PostingsEnum postings = null; while ((scratch = te.next()) != null) { System.out.println(format(" %s", scratch.utf8ToString())); postings = te.postings(postings, PostingsEnum.ALL); for (postings.nextDoc(); postings.docID() != DocIdSetIterator.NO_MORE_DOCS; postings.nextDoc()) { final Map<Integer, BytesRef> positions = Maps.newTreeMap(); boolean addedPayload = false; for (int i = 0; i < postings.freq(); i++) { final int pos = postings.nextPosition(); final BytesRef payload = postings.getPayload(); if (payload != null) { positions.put(pos, BytesRef.deepCopyOf(payload)); addedPayload = true; } else { positions.put(pos, null); } } if (addedPayload) { System.out.println( format(" doc=%d, freq=%d", postings.docID(), postings.freq(), positions)); for (final Entry<Integer, BytesRef> e : positions.entrySet()) { System.out.println(format(" pos=%d, payload=%s", e.getKey(), e.getValue())); } } else { System.out.println(format(" doc=%d, freq=%d, pos=%s", postings.docID(), postings.freq(), positions.keySet())); } } } } }
From source file:com.sindicetech.siren.search.node.TestNodeNumericRangeQuery32.java
License:Open Source License
private int countTerms(final MultiNodeTermQuery q) throws Exception { final Terms terms = MultiFields.getTerms(index.reader, q.getField()); if (terms == null) return 0; final TermsEnum termEnum = q.getTermsEnum(terms); assertNotNull(termEnum);//from w ww . j a v a2 s . c om int count = 0; BytesRef cur, last = null; while ((cur = termEnum.next()) != null) { count++; if (last != null) { assertTrue(last.compareTo(cur) < 0); } last = BytesRef.deepCopyOf(cur); } // LUCENE-3314: the results after next() already returned null are undefined, // assertNull(termEnum.next()); return count; }
From source file:com.sindicetech.siren.search.node.TopNodeTermsRewrite.java
License:Open Source License
/**
 * Rewrites the MultiNodeTermQuery by collecting at most {@code maxSize} of the
 * highest-boosted terms into a priority queue, then adding each surviving term
 * as a clause of the top-level query (sorted by term for deterministic output).
 *
 * NOTE(review): the ScoreTerm priority queue appears to be ordered so that the
 * weakest entry is at the head (peek/poll drop uncompetitive terms) — confirm
 * against ScoreTerm's compareTo.
 */
@Override
public Q rewrite(final IndexReader reader, final MultiNodeTermQuery query) throws IOException {
    final int maxSize = Math.min(size, this.getMaxSize());
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
    this.collectTerms(reader, query, new TermCollector() {

        private final MaxNonCompetitiveBoostAttribute maxBoostAtt = attributes
                .addAttribute(MaxNonCompetitiveBoostAttribute.class);

        // terms already seen in this rewrite, keyed by their (cloned) bytes
        private final Map<BytesRef, ScoreTerm> visitedTerms = new HashMap<BytesRef, ScoreTerm>();

        private TermsEnum termsEnum;
        private Comparator<BytesRef> termComp;
        private BoostAttribute boostAtt;
        // spare ScoreTerm reused for the next insertion into the queue
        private ScoreTerm st;

        @Override
        public void setNextEnum(final TermsEnum termsEnum) throws IOException {
            this.termsEnum = termsEnum;
            this.termComp = termsEnum.getComparator();

            // new segment: reset the in-order assertion state
            assert this.compareToLastTerm(null);

            // lazy init the initial ScoreTerm because comparator is not known on ctor:
            if (st == null)
                st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
            boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
        }

        // for assert only: last term seen within the current segment
        private BytesRef lastTerm;

        // assert helper: verifies terms arrive in strictly increasing order
        // within a segment; always returns true so it can sit inside an assert
        private boolean compareToLastTerm(final BytesRef t) throws IOException {
            if (lastTerm == null && t != null) {
                lastTerm = BytesRef.deepCopyOf(t);
            } else if (t == null) {
                lastTerm = null;
            } else {
                assert termsEnum.getComparator().compare(lastTerm, t) < 0 : "lastTerm=" + lastTerm + " t=" + t;
                lastTerm.copyBytes(t);
            }
            return true;
        }

        @Override
        public boolean collect(final BytesRef bytes) throws IOException {
            final float boost = boostAtt.getBoost();

            // make sure within a single seg we always collect terms in order
            assert this.compareToLastTerm(bytes);

            // ignore uncompetitive hits: queue is full and this term cannot beat
            // the weakest queued entry (ties broken by term order)
            if (stQueue.size() == maxSize) {
                final ScoreTerm t = stQueue.peek();
                if (boost < t.boost)
                    return true;
                if (boost == t.boost && termComp.compare(bytes, t.bytes) > 0)
                    return true;
            }
            ScoreTerm t = visitedTerms.get(bytes);
            final TermState state = termsEnum.termState();
            assert state != null;
            if (t != null) {
                // if the term is already in the PQ, only update docFreq of term in PQ
                assert t.boost == boost : "boost should be equal in all segment TermsEnums";
                t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
            } else {
                // add new entry in PQ, we must clone the term, else it may get overwritten!
                st.bytes.copyBytes(bytes);
                st.boost = boost;
                visitedTerms.put(st.bytes, st);
                assert st.termState.docFreq() == 0;
                st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
                stQueue.offer(st);

                // possibly drop entries from queue; the evicted ScoreTerm is
                // recycled as the next spare (its termState is cleared)
                if (stQueue.size() > maxSize) {
                    st = stQueue.poll();
                    visitedTerms.remove(st.bytes);
                    st.termState.clear(); // reset the termstate!
                } else {
                    st = new ScoreTerm(termComp, new TermContext(topReaderContext));
                }
                assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";

                // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                if (stQueue.size() == maxSize) {
                    t = stQueue.peek();
                    maxBoostAtt.setMaxNonCompetitiveBoost(t.boost);
                    maxBoostAtt.setCompetitiveTerm(t.bytes);
                }
            }
            return true;
        }
    });

    final Q q = this.getTopLevelQuery(query);
    final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
    // sort by term (not by score) so the rewritten query is deterministic
    ArrayUtil.timSort(scoreTerms, scoreTermSortByTermComp);
    for (final ScoreTerm st : scoreTerms) {
        final Term term = new Term(query.field, st.bytes);
        assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs "
                + st.termState.docFreq() + " term=" + term;
        this.addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
    }
    return q;
}
From source file:com.stratio.cassandra.index.query.Condition.java
License:Apache License
/**
 * Runs {@code value} through the column mapper's analyzer and returns the single
 * resulting term as a UTF-8 string, or {@code null} if analysis yields no token.
 *
 * @param field        field name passed to the analyzer
 * @param value        raw text to analyze
 * @param columnMapper mapper supplying the analyzer to use
 * @return the analyzed term, or {@code null} when nothing was produced
 * @throws IllegalArgumentException if the analyzer emits more than one token
 * @throws RuntimeException wrapping any I/O failure of the token stream
 */
protected String analyze(String field, String value, ColumnMapper<?> columnMapper) {
    TokenStream stream = null;
    try {
        Analyzer analyzer = columnMapper.analyzer();
        stream = analyzer.tokenStream(field, value);
        stream.reset();
        TermToBytesRefAttribute termAttribute = stream.getAttribute(TermToBytesRefAttribute.class);
        BytesRef termBytes = termAttribute.getBytesRef();
        if (!stream.incrementToken()) {
            // analyzer produced no tokens at all
            return null;
        }
        termAttribute.fillBytesRef();
        if (stream.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        stream.end();
        // deep copy before the stream is closed: the attribute may reuse its buffer
        return BytesRef.deepCopyOf(termBytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
From source file:com.tcdi.zombodb.query.VisibilityQueryHelper.java
License:Apache License
static List<BytesRef> findUpdatedCtids(IndexSearcher searcher) throws IOException { final List<BytesRef> updatedCtids = new ArrayList<>(); ////from w w w . j a v a2s . co m // search the "state" type and collect a distinct set of all the _ctids // these represent the records in the index that have been updated // used below to determine visibility // // We use XConstantScoreQuery here so that we exclude deleted docs // searcher.search( new XConstantScoreQuery( SearchContext.current().filterCache().cache(new TermFilter(new Term("_type", "state")))), new ZomboDBTermsCollector("_ctid") { SortedDocValues ctids; @Override public void collect(int doc) throws IOException { updatedCtids.add(BytesRef.deepCopyOf(ctids.get(doc))); } @Override public void setNextReader(AtomicReaderContext context) throws IOException { ctids = FieldCache.DEFAULT.getTermsIndex(context.reader(), "_ctid"); } }); Collections.sort(updatedCtids); return updatedCtids; }
From source file:com.tcdi.zombodb.query.VisibilityQueryHelper.java
License:Apache License
/**
 * Determines, for each leaf reader (keyed by reader ord), which documents are
 * NOT visible to the current transaction and returns them as FixedBitSets.
 *
 * For every updated ctid the row versions are sorted newest-first; the first
 * version that is committed and within the transaction's snapshot is treated
 * as visible, all other versions of that row are marked invisible.
 *
 * NOTE(review): MVCC-style semantics inferred from the xid/xmin/xmax naming —
 * confirm against the PostgreSQL-side contract.  {@code xmin} is accepted but
 * not referenced in this method.
 */
static Map<Integer, FixedBitSet> determineVisibility(final Query query, final String field, final long myXid,
        final long xmin, final long xmax, final Set<Long> activeXids, IndexSearcher searcher,
        List<BytesRef> updatedCtids) throws IOException {
    final Map<Integer, FixedBitSet> visibilityBitSets = new HashMap<>();
    if (updatedCtids.size() == 0)
        return visibilityBitSets;

    //
    // build a map of {@link VisibilityInfo} objects by each _prev_ctid
    //
    // We use XConstantScoreQuery here so that we exclude deleted docs
    //
    final Map<BytesRef, List<VisibilityInfo>> map = new HashMap<>();
    searcher.search(
            new XConstantScoreQuery(
                    SearchContext.current().filterCache().cache(new TermsFilter(field, updatedCtids))),
            new ZomboDBTermsCollector(field) {
                private SortedDocValues prevCtids;
                private SortedNumericDocValues xids;
                private SortedNumericDocValues sequence;
                private int ord;
                private int maxdoc;

                @Override
                public void collect(int doc) throws IOException {
                    xids.setDocument(doc);
                    sequence.setDocument(doc);
                    long xid = xids.valueAt(0);
                    long seq = sequence.valueAt(0);
                    BytesRef prevCtid = prevCtids.get(doc);

                    List<VisibilityInfo> matchingDocs = map.get(prevCtid);
                    if (matchingDocs == null)
                        // deep copy only when used as a map key; doc values may
                        // reuse the BytesRef they return
                        map.put(BytesRef.deepCopyOf(prevCtid), matchingDocs = new ArrayList<>());
                    matchingDocs.add(new VisibilityInfo(ord, maxdoc, doc, xid, seq));
                }

                @Override
                public void setNextReader(AtomicReaderContext context) throws IOException {
                    prevCtids = FieldCache.DEFAULT.getTermsIndex(context.reader(), field);
                    xids = context.reader().getSortedNumericDocValues("_xid");
                    sequence = context.reader().getSortedNumericDocValues("_zdb_seq");
                    ord = context.ord;
                    maxdoc = context.reader().maxDoc();
                }
            });

    if (map.isEmpty())
        return visibilityBitSets;

    //
    // pick out the first VisibilityInfo for each document that is visible & committed
    // and build a FixedBitSet for each reader 'ord' that contains visible
    // documents. A map of these (key'd on reader ord) is what we return.
    //
    BytesRefBuilder bytesRefBuilder = new BytesRefBuilder() {
        /* overloaded to avoid making a copy of the byte array */
        @Override
        public BytesRef toBytesRef() {
            return new BytesRef(this.bytes(), 0, this.length());
        }
    };

    Terms committedXidsTerms = MultiFields.getFields(searcher.getIndexReader()).terms("_zdb_committed_xid");
    TermsEnum committedXidsEnum = committedXidsTerms == null ? null : committedXidsTerms.iterator(null);

    for (List<VisibilityInfo> visibility : map.values()) {
        // newest version first: by xid descending, then sequence descending
        CollectionUtil.introSort(visibility, new Comparator<VisibilityInfo>() {
            @Override
            public int compare(VisibilityInfo o1, VisibilityInfo o2) {
                int cmp = Long.compare(o2.xid, o1.xid);
                return cmp == 0 ? Long.compare(o2.sequence, o1.sequence) : cmp;
            }
        });

        boolean foundVisible = false;
        for (VisibilityInfo mapping : visibility) {
            if (foundVisible || mapping.xid > xmax || activeXids.contains(mapping.xid)
                    || (mapping.xid != myXid && !isCommitted(committedXidsEnum, mapping.xid, bytesRefBuilder))) {
                // document is not visible to us
                FixedBitSet visibilityBitset = visibilityBitSets.get(mapping.readerOrd);
                if (visibilityBitset == null)
                    visibilityBitSets.put(mapping.readerOrd,
                            visibilityBitset = new FixedBitSet(mapping.maxdoc));
                visibilityBitset.set(mapping.docid);
            } else {
                foundVisible = true;
            }
        }
    }

    return visibilityBitSets;
}
From source file:com.tuplejump.stargate.lucene.query.Condition.java
License:Apache License
protected String analyze(String field, String value, Analyzer analyzer) { TokenStream source = null;// w w w. j a v a 2 s . c o m try { source = analyzer.tokenStream(field, value); source.reset(); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); BytesRef bytes = termAtt.getBytesRef(); if (!source.incrementToken()) { return null; } termAtt.fillBytesRef(); if (source.incrementToken()) { throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value); } source.end(); return BytesRef.deepCopyOf(bytes).utf8ToString(); } catch (IOException e) { throw new RuntimeException("Error analyzing multiTerm term: " + value, e); } finally { IOUtils.closeWhileHandlingException(source); } }
From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java
License:Open Source License
@SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS") @Override/*from w w w . j a v a2 s . c o m*/ public long getTermFrequency(@NotNull final BytesRef term) { // try get a cached value first @Nullable Long tf = this.cache_tf.get(term); if (tf == null) { tf = 0L; for (final LeafReaderContext lrc : this.index.reader.leaves()) { final LeafReader r = lrc.reader(); long fieldTf = 0L; if (r.numDocs() > 0) { try { for (final String s : r.fields()) { @Nullable final Terms terms = r.terms(s); if (terms != null) { final TermsEnum termsEnum = terms.iterator(null); if (termsEnum.seekExact(term)) { fieldTf += termsEnum.totalTermFreq(); } } } } catch (final IOException e) { throw new UncheckedIOException(e); } } tf += fieldTf; } this.cache_tf.put(BytesRef.deepCopyOf(term), tf); } return tf; }
From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java
License:Open Source License
/**
 * Returns the document frequency of {@code term}.  For each leaf reader the
 * highest per-field docFreq is taken, and those per-leaf maxima are summed
 * (matching the original stream pipeline: per-leaf max().orElse(0), then
 * sum()).  Results are memoized in {@code cache_df} with a deep-copied key.
 *
 * @param term term to look up (not null)
 * @return document frequency as described above
 */
@SuppressFBWarnings("EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS")
@Override
public int getDocumentFrequency(@NotNull final BytesRef term) {
    final Integer cached = this.cache_df.get(term);
    if (cached != null) {
        return cached;
    }

    int df = 0;
    for (final LeafReaderContext leafContext : this.index.reader.leaves()) {
        final LeafReader leafReader = leafContext.reader();
        if (leafReader.numDocs() <= 0) {
            continue; // empty leaf contributes nothing
        }
        int maxFieldDf = 0;
        try {
            for (final String fieldName : leafReader.fields()) {
                @Nullable
                final Terms terms = leafReader.terms(fieldName);
                if (terms == null) {
                    continue;
                }
                final TermsEnum termsEnum = terms.iterator(null);
                final int fieldDf = termsEnum.seekExact(term) ? termsEnum.docFreq() : 0;
                // max across fields within this leaf, not a sum
                if (fieldDf > maxFieldDf) {
                    maxFieldDf = fieldDf;
                }
            }
        } catch (final IOException e) {
            throw new UncheckedIOException(e);
        }
        df += maxFieldDf;
    }

    this.cache_df.put(BytesRef.deepCopyOf(term), df);
    return df;
}