List of usage examples for org.apache.lucene.index.TermsEnum#ord()
/**
 * Returns the ordinal of the term this enum is currently positioned on.
 *
 * <p>The callers collected in this file treat the value as the zero-based position of the
 * current term in the enum's iteration order (the first term returned by {@code next()} has
 * ordinal 0, the next one 1, and so on), and they use it as the argument of
 * {@code seekExact(long)} to return to the same term.
 *
 * <p>NOTE(review): at least one caller catches {@link UnsupportedOperationException} from this
 * method and then stops asking for ordinals, so implementations presumably may decline to
 * support ordinals - confirm against the implementing class.
 *
 * @return the ordinal of the current term
 * @throws IOException declared by the signature; the conditions are implementation specific
 */
public abstract long ord() throws IOException;
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
/**
 * Indexes random line-file documents through a codec whose postings format for the
 * {@code "body"} field is wrapped by a stats-gathering {@link FieldsConsumer}, then checks
 * that the statistics gathered while writing agree with what the final reader reports.
 *
 * <p>The wrapping consumer first delegates the write to the real postings format and then
 * walks the supplied {@link Fields} again (once via {@code next()}, once via
 * {@code seekExact}, plus a few random {@code seekCeil} calls), accumulating per-term
 * {@code docFreq}/{@code totalTermFreq} either on the first or on the second walk, chosen at
 * random per write. After indexing, the test verifies the summed statistics, the per-term
 * statistics, the term count and (when supported) that {@code ord()} matches the iteration
 * position.
 *
 * <p>The order of {@code random()} calls is part of the test's reproducibility, so statements
 * must not be reordered.
 *
 * @throws Exception if indexing, reading or any assertion fails
 */
@Override
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

  // Running totals over all flushed terms; compared against the reader's sums at the end.
  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();

  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()

  iwc.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {

      // Unwrap per-field formats (both the stock and the Rocana variant) so that we
      // delegate to the concrete postings format for this field.
      PostingsFormat p = getCodec().postingsFormat();
      if (p instanceof PerFieldPostingsFormat) {
        p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      if (p instanceof RocanaPerFieldPostingsFormat) {
        p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      final PostingsFormat defaultPostingsFormat = p;

      // Captured so the consumer can assert that flushes happen on this thread.
      final Thread mainThread = Thread.currentThread();

      if (field.equals("body")) {

        // A PF that counts up some stats and then in
        // the end we verify the stats match what the
        // final IndexReader says, just to exercise the
        // new freedom of iterating the postings more
        // than once at flush/merge:

        return new PostingsFormat(defaultPostingsFormat.getName()) {

          @Override
          public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

            final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

            return new FieldsConsumer() {
              @Override
              public void write(Fields fields) throws IOException {
                // Let the real format write first; everything below is an extra read pass.
                fieldsConsumer.write(fields);

                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                // We only use one thread for flushing
                // in this test:
                assert isMerge || Thread.currentThread() == mainThread;

                // We iterate the provided TermsEnum
                // twice, so we exercise this new freedom
                // with the inverted API; if
                // addOnSecondPass is true, we add up
                // term stats on the 2nd iteration:
                boolean addOnSecondPass = random().nextBoolean();

                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                // Gather our own stats:
                Terms terms = fields.terms("body");
                assert terms != null;

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docs = null;
                while (termsEnum.next() != null) {
                  BytesRef term = termsEnum.term();
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    // Frequencies only; the previous enum is offered for reuse.
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    // Read a random number of positions, between 1 and freq().
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }

                  String termString = term.utf8ToString();

                  // During merge we should only see terms
                  // we had already seen during a
                  // previous flush:
                  assertTrue(isMerge == false || termFreqs.containsKey(termString));

                  if (isMerge == false) {
                    if (addOnSecondPass == false) {
                      TermFreqs tf = termFreqs.get(termString);
                      if (tf == null) {
                        tf = new TermFreqs();
                        termFreqs.put(termString, tf);
                      }
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    } else if (termFreqs.containsKey(termString) == false) {
                      // Add placeholder (2nd pass will
                      // set its counts):
                      termFreqs.put(termString, new TermFreqs());
                    }
                  }
                }

                // Also test seeking the TermsEnum:
                for (String term : termFreqs.keySet()) {
                  if (termsEnum.seekExact(new BytesRef(term))) {
                    // TODO: also sometimes ask for payloads/offsets?
                    boolean noPositions = random().nextBoolean();
                    if (noPositions) {
                      docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                    } else {
                      docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                    }

                    int docFreq = 0;
                    long totalTermFreq = 0;
                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                      docFreq++;
                      totalTermFreq += docs.freq();
                      int limit = TestUtil.nextInt(random(), 1, docs.freq());
                      if (!noPositions) {
                        for (int i = 0; i < limit; i++) {
                          docs.nextPosition();
                        }
                      }
                    }

                    // Second-pass accounting: only for flushes, and only when the first
                    // pass merely added placeholders.
                    if (isMerge == false && addOnSecondPass) {
                      TermFreqs tf = termFreqs.get(term);
                      assert tf != null;
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    }

                    //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                    // This segment's stats can never exceed the accumulated totals.
                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                  }
                }

                // Also test seekCeil
                for (int iter = 0; iter < 10; iter++) {
                  BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                  SeekStatus status = termsEnum.seekCeil(term);
                  if (status == SeekStatus.NOT_FOUND) {
                    // The enum must now sit on a term strictly greater than the target.
                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                  }
                }
              }

              @Override
              public void close() throws IOException {
                fieldsConsumer.close();
              }
            };
          }

          @Override
          public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
            return defaultPostingsFormat.fieldsProducer(state);
          }
        };
      } else {
        return defaultPostingsFormat;
      }
    }
  });

  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

  // Index documents until roughly atLeast(100) KB (as measured by RamUsageTester) are added.
  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    w.addDocument(doc);
    bytesIndexed += RamUsageTester.sizeOf(doc);
  }

  IndexReader r = w.getReader();
  w.close();

  // The sums gathered while writing must match what the reader reports.
  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        // Ordinals are optional: stop asking after the first refusal.
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);

  r.close();
  dir.close();
}
From source file:org.apache.solr.request.TestFaceting.java
License:Apache License
void doTermEnum(int size) throws Exception { //System.out.println("doTermEnum size=" + size); close();/* w ww . j a v a 2 s. c o m*/ createIndex(size); req = lrf.makeRequest("q", "*:*"); UnInvertedField uif = new UnInvertedField(proto.field(), req.getSearcher()); assertEquals(size, uif.getNumTerms()); TermsEnum te = uif.getOrdTermsEnum(req.getSearcher().getAtomicReader()); assertEquals(size == 0, te == null); Random r = new Random(size); // test seeking by term string for (int i = 0; i < size * 2 + 10; i++) { int rnum = r.nextInt(size + 2); String s = t(rnum); //System.out.println("s=" + s); final BytesRef br; if (te == null) { br = null; } else { TermsEnum.SeekStatus status = te.seekCeil(new BytesRef(s)); if (status == TermsEnum.SeekStatus.END) { br = null; } else { br = te.term(); } } assertEquals(br != null, rnum < size); if (rnum < size) { assertEquals(rnum, (int) te.ord()); assertEquals(s, te.term().utf8ToString()); } } // test seeking before term if (size > 0) { assertEquals(size > 0, te.seekCeil(new BytesRef("000")) != TermsEnum.SeekStatus.END); assertEquals(0, te.ord()); assertEquals(t(0), te.term().utf8ToString()); } if (size > 0) { // test seeking by term number for (int i = 0; i < size * 2 + 10; i++) { int rnum = r.nextInt(size); String s = t(rnum); te.seekExact((long) rnum); BytesRef br = te.term(); assertNotNull(br); assertEquals(rnum, (int) te.ord()); assertEquals(s, te.term().utf8ToString()); } } }
From source file:org.apache.solr.request.UnInvertedField.java
License:Apache License
/**
 * Computes facet counts for this un-inverted field over a base set of documents.
 *
 * <p>Counting only happens when the base set holds at least {@code mincount} documents. Terms
 * held in {@code bigTerms} are counted by set intersection through the searcher; all other
 * terms are counted by decoding each document's delta-encoded term-number list from
 * {@code index} / {@code tnums}. When the base set covers more than half of {@code maxDoc}
 * (and no prefix restricts the term range and the set is a {@link BitDocSet}) the complement
 * is counted instead and subtracted from {@code maxTermCounts}.
 *
 * @param searcher searcher supplying the schema, reader and doc-set intersection counts
 * @param baseDocs documents to count over
 * @param offset   number of leading result entries to skip
 * @param limit    maximum number of entries to return; a negative value means no limit
 * @param mincount minimum count an entry needs to be returned (unboxed on use, so it must
 *                 not be {@code null})
 * @param missing  when true, a final entry with a {@code null} name carrying
 *                 {@code SimpleFacets.getFieldMissingCount(...)} is appended
 * @param sort     {@code FacetParams.FACET_SORT_COUNT} (or its legacy value) for
 *                 count order; anything else yields index order (must not be {@code null})
 * @param prefix   optional term prefix restricting the counted term range; ignored when
 *                 {@code null} or empty
 * @return the facet entries in the requested order
 * @throws IOException if reading terms from the index fails
 */
public NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit,
    Integer mincount, boolean missing, String sort, String prefix) throws IOException {
  use.incrementAndGet();

  FieldType ft = searcher.getSchema().getFieldType(field);

  NamedList<Integer> res = new NamedList<Integer>(); // order is important

  DocSet docs = baseDocs;
  int baseSize = docs.size();
  int maxDoc = searcher.maxDoc();

  //System.out.println("GET COUNTS field=" + field + " baseSize=" + baseSize + " minCount=" + mincount + " maxDoc=" + maxDoc + " numTermsInField=" + numTermsInField);
  if (baseSize >= mincount) {

    final int[] index = this.index;
    // tricky: we add one more element than we need because we will reuse this array later
    // for ordering term ords before converting to term labels.
    final int[] counts = new int[numTermsInField + 1];

    //
    // If there is a prefix, find its start and end term numbers
    //
    int startTerm = 0;
    int endTerm = numTermsInField; // one past the end

    TermsEnum te = getOrdTermsEnum(searcher.getAtomicReader());
    if (te != null && prefix != null && prefix.length() > 0) {
      final BytesRef prefixBr = new BytesRef(prefix);
      if (te.seekCeil(prefixBr) == TermsEnum.SeekStatus.END) {
        startTerm = numTermsInField;
      } else {
        startTerm = (int) te.ord();
      }
      // Appending BIG_TERM and seeking again finds the first term after the prefix range.
      prefixBr.append(UnicodeUtil.BIG_TERM);
      if (te.seekCeil(prefixBr) == TermsEnum.SeekStatus.END) {
        endTerm = numTermsInField;
      } else {
        endTerm = (int) te.ord();
      }
    }

    /***********
    // Alternative 2: get the docSet of the prefix (could take a while) and
    // then do the intersection with the baseDocSet first.
    if (prefix != null && prefix.length() > 0) {
      docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
      // The issue with this method are problems of returning 0 counts for terms w/o
      // the prefix.  We can't just filter out those terms later because it may
      // mean that we didn't collect enough terms in the queue (in the sorted case).
    }
    ***********/

    // Count the complement when the base set is more than half of all documents; the real
    // count is then recovered as maxTermCounts[i] - counts[i].
    boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 && startTerm == 0
        && endTerm == numTermsInField && docs instanceof BitDocSet;

    if (doNegative) {
      OpenBitSet bs = (OpenBitSet) ((BitDocSet) docs).getBits().clone();
      bs.flip(0, maxDoc);
      // TODO: when iterator across negative elements is available, use that
      // instead of creating a new bitset and inverting.
      docs = new BitDocSet(bs, maxDoc - baseSize);
      // simply negating will mean that we have deleted docs in the set.
      // that should be OK, as their entries in our table should be empty.
      //System.out.println("  NEG");
    }

    // For the biggest terms, do straight set intersections
    for (TopTerm tt : bigTerms.values()) {
      //System.out.println("  do big termNum=" + tt.termNum + " term=" + tt.term.utf8ToString());
      // TODO: counts could be deferred if sorted==false
      if (tt.termNum >= startTerm && tt.termNum < endTerm) {
        counts[tt.termNum] = searcher.numDocs(new TermQuery(new Term(field, tt.term)), docs);
        //System.out.println("    count=" + counts[tt.termNum]);
      } else {
        //System.out.println("SKIP term=" + tt.termNum);
      }
    }

    // TODO: we could short-circuit counting altogether for sorted faceting
    // where we already have enough terms from the bigTerms

    // TODO: we could shrink the size of the collection array, and
    // additionally break when the termNumber got above endTerm, but
    // it would require two extra conditionals in the inner loop (although
    // they would be predictable for the non-prefix case).
    // Perhaps a different copy of the code would be warranted.

    if (termInstances > 0) {
      DocIterator iter = docs.iterator();
      while (iter.hasNext()) {
        int doc = iter.nextDoc();
        //System.out.println("iter doc=" + doc);
        int code = index[doc];

        if ((code & 0xff) == 1) {
          // Low byte 1 marks a pointer: the upper 24 bits are an offset into one of the
          // tnums byte arrays, selected by bits 16-23 of the document id.
          //System.out.println("  ptr");
          int pos = code >>> 8;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (;;) {
            // Decode one variable-length delta: 7 payload bits per byte, high bit set
            // means another byte follows.
            int delta = 0;
            for (;;) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            // A zero delta terminates this document's list.
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            //System.out.println("    tnum=" + tnum);
            counts[tnum]++;
          }
        } else {
          // Otherwise the deltas are packed directly into the int, one byte at a time
          // starting from the low byte, using the same 7-bit encoding.
          //System.out.println("  inlined");
          int tnum = 0;
          int delta = 0;
          for (;;) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              //System.out.println("    tnum=" + tnum);
              counts[tnum]++;
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }
    final CharsRef charsRef = new CharsRef();

    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    if (sort.equals(FacetParams.FACET_SORT_COUNT) || sort.equals(FacetParams.FACET_SORT_COUNT_LEGACY)) {
      int maxsize = limit > 0 ? offset + limit : Integer.MAX_VALUE - 1;
      maxsize = Math.min(maxsize, numTermsInField);
      LongPriorityQueue queue = new LongPriorityQueue(Math.min(maxsize, 1000), maxsize, Long.MIN_VALUE);

      int min = mincount - 1; // the smallest value in the top 'N' values
      //System.out.println("START=" + startTerm + " END=" + endTerm);
      for (int i = startTerm; i < endTerm; i++) {
        int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
        if (c > min) {
          // NOTE: we use c>min rather than c>=min as an optimization because we are going in
          // index order, so we already know that the keys are ordered.  This can be very
          // important if a lot of the counts are repeated (like zero counts would be).

          // smaller term numbers sort higher, so subtract the term number instead
          // (count goes in the upper 32 bits, the inverted term number in the lower 32)
          long pair = (((long) c) << 32) + (Integer.MAX_VALUE - i);
          boolean displaced = queue.insert(pair);
          if (displaced) min = (int) (queue.top() >>> 32);
        }
      }

      // now select the right page from the results

      // if we are deep paging, we don't have to order the highest "offset" counts.
      int collectCount = Math.max(0, queue.size() - off);
      assert collectCount <= lim;

      // the start and end indexes of our list "sorted" (starting with the highest value)
      int sortedIdxStart = queue.size() - (collectCount - 1);
      int sortedIdxEnd = queue.size() + 1;
      final long[] sorted = queue.sort(collectCount);

      final int[] indirect = counts; // reuse the counts array for the index into the tnums array
      assert indirect.length >= sortedIdxEnd;

      for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
        // Unpack the (count, inverted term number) pair written above.
        long pair = sorted[i];
        int c = (int) (pair >>> 32);
        int tnum = Integer.MAX_VALUE - (int) pair;

        indirect[i] = i; // store the index for indirect sorting
        sorted[i] = tnum; // reuse the "sorted" array to store the term numbers for indirect sorting

        // add a null label for now... we'll fill it in later.
        res.add(null, c);
      }

      // now sort the indexes by the term numbers
      PrimUtils.sort(sortedIdxStart, sortedIdxEnd, indirect, new PrimUtils.IntComparator() {
        @Override
        public int compare(int a, int b) {
          return (int) sorted[a] - (int) sorted[b];
        }

        @Override
        public boolean lessThan(int a, int b) {
          return sorted[a] < sorted[b];
        }

        @Override
        public boolean equals(int a, int b) {
          return sorted[a] == sorted[b];
        }
      });

      // convert the term numbers to term values and set
      // as the label
      //System.out.println("sortStart=" + sortedIdxStart + " end=" + sortedIdxEnd);
      for (int i = sortedIdxStart; i < sortedIdxEnd; i++) {
        int idx = indirect[i];
        int tnum = (int) sorted[idx];
        final String label = getReadableValue(getTermValue(te, tnum), ft, charsRef);
        //System.out.println("  label=" + label);
        // idx - sortedIdxStart is the position at which this entry's count was added to res.
        res.setName(idx - sortedIdxStart, label);
      }

    } else {
      // add results in index order
      int i = startTerm;
      if (mincount <= 0) {
        // if mincount<=0, then we won't discard any terms and we know exactly
        // where to start.
        i = startTerm + off;
        off = 0;
      }

      for (; i < endTerm; i++) {
        int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
        // Skip entries below mincount, then consume the offset, then honour the limit.
        if (c < mincount || --off >= 0) continue;
        if (--lim < 0) break;

        final String label = getReadableValue(getTermValue(te, i), ft, charsRef);
        res.add(label, c);
      }
    }
  }

  if (missing) {
    // TODO: a faster solution for this?
    res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field));
  }

  //System.out.println("  res=" + res);

  return res;
}
From source file:org.apache.solr.uninverting.TestDocTermOrds.java
License:Apache License
private void verify(LeafReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) throws Exception { final DocTermOrds dto = new DocTermOrds(r, r.getLiveDocs(), "field", prefixRef, Integer.MAX_VALUE, TestUtil.nextInt(random(), 2, 10)); final NumericDocValues docIDToID = FieldCache.DEFAULT.getNumerics(r, "id", FieldCache.LEGACY_INT_PARSER); /*/*from w w w. j a v a 2 s.c o m*/ for(int docID=0;docID<subR.maxDoc();docID++) { System.out.println(" docID=" + docID + " id=" + docIDToID[docID]); } */ if (VERBOSE) { System.out.println("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString())); System.out.println("TEST: all TERMS:"); TermsEnum allTE = MultiFields.getTerms(r, "field").iterator(); int ord = 0; while (allTE.next() != null) { System.out.println(" ord=" + (ord++) + " term=" + allTE.term().utf8ToString()); } } //final TermsEnum te = subR.fields().terms("field").iterator(); final TermsEnum te = dto.getOrdTermsEnum(r); if (dto.numTerms() == 0) { if (prefixRef == null) { assertNull(MultiFields.getTerms(r, "field")); } else { Terms terms = MultiFields.getTerms(r, "field"); if (terms != null) { TermsEnum termsEnum = terms.iterator(); TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef); if (result != TermsEnum.SeekStatus.END) { assertFalse( "term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), StringHelper.startsWith(termsEnum.term(), prefixRef)); } else { // ok } } else { // ok } } return; } if (VERBOSE) { System.out.println("TEST: TERMS:"); te.seekExact(0); while (true) { System.out.println(" ord=" + te.ord() + " term=" + te.term().utf8ToString()); if (te.next() == null) { break; } } } SortedSetDocValues iter = dto.iterator(r); for (int docID = 0; docID < r.maxDoc(); docID++) { assertEquals(docID, docIDToID.nextDoc()); if (docID > iter.docID()) { iter.nextDoc(); } if (docID < iter.docID()) { int[] answers = idToOrds[(int) docIDToID.longValue()]; assertEquals(0, answers.length); continue; } 
if (VERBOSE) { System.out.println( "TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.longValue() + ")"); } final int[] answers = idToOrds[(int) docIDToID.longValue()]; int upto = 0; long ord; while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { te.seekExact(ord); final BytesRef expected = termsArray[answers[upto++]]; if (VERBOSE) { System.out.println(" exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString()); } assertEquals("expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord=" + ord, expected, te.term()); } assertEquals(answers.length, upto); } }
From source file:org.apache.solr.uninverting.TestDocTermOrds.java
License:Apache License
public void testSortedTermsEnum() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); iwconfig.setMergePolicy(newLogMergePolicy()); RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); Document doc = new Document(); doc.add(new StringField("field", "hello", Field.Store.NO)); iwriter.addDocument(doc);// ww w .j a v a 2s. co m doc = new Document(); doc.add(new StringField("field", "world", Field.Store.NO)); // we need a second value for a doc, or we don't actually test DocTermOrds! doc.add(new StringField("field", "hello", Field.Store.NO)); iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("field", "beer", Field.Store.NO)); iwriter.addDocument(doc); iwriter.forceMerge(1); DirectoryReader ireader = iwriter.getReader(); iwriter.close(); LeafReader ar = getOnlyLeafReader(ireader); SortedSetDocValues dv = FieldCache.DEFAULT.getDocTermOrds(ar, "field", null); assertEquals(3, dv.getValueCount()); TermsEnum termsEnum = dv.termsEnum(); // next() assertEquals("beer", termsEnum.next().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals("hello", termsEnum.next().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals("world", termsEnum.next().utf8ToString()); assertEquals(2, termsEnum.ord()); // seekCeil() assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("ha!"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("zzz"))); // seekExact() assertTrue(termsEnum.seekExact(new BytesRef("beer"))); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); assertTrue(termsEnum.seekExact(new 
BytesRef("hello"))); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); assertTrue(termsEnum.seekExact(new BytesRef("world"))); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); assertFalse(termsEnum.seekExact(new BytesRef("bogus"))); // seek(ord) termsEnum.seekExact(0); assertEquals("beer", termsEnum.term().utf8ToString()); assertEquals(0, termsEnum.ord()); termsEnum.seekExact(1); assertEquals("hello", termsEnum.term().utf8ToString()); assertEquals(1, termsEnum.ord()); termsEnum.seekExact(2); assertEquals("world", termsEnum.term().utf8ToString()); assertEquals(2, termsEnum.ord()); // lookupTerm(BytesRef) assertEquals(-1, dv.lookupTerm(new BytesRef("apple"))); assertEquals(0, dv.lookupTerm(new BytesRef("beer"))); assertEquals(-2, dv.lookupTerm(new BytesRef("car"))); assertEquals(1, dv.lookupTerm(new BytesRef("hello"))); assertEquals(-3, dv.lookupTerm(new BytesRef("matter"))); assertEquals(2, dv.lookupTerm(new BytesRef("world"))); assertEquals(-4, dv.lookupTerm(new BytesRef("zany"))); ireader.close(); directory.close(); }
From source file:org.apache.solr.uninverting.TestFieldCacheVsDocValues.java
License:Apache License
private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception { BytesRef ref;//from w w w. ja va 2s . c o m // sequential next() through all terms while ((ref = expected.next()) != null) { assertEquals(ref, actual.next()); assertEquals(expected.ord(), actual.ord()); assertEquals(expected.term(), actual.term()); } assertNull(actual.next()); // sequential seekExact(ord) through all terms for (long i = 0; i < numOrds; i++) { expected.seekExact(i); actual.seekExact(i); assertEquals(expected.ord(), actual.ord()); assertEquals(expected.term(), actual.term()); } // sequential seekExact(BytesRef) through all terms for (long i = 0; i < numOrds; i++) { expected.seekExact(i); assertTrue(actual.seekExact(expected.term())); assertEquals(expected.ord(), actual.ord()); assertEquals(expected.term(), actual.term()); } // sequential seekCeil(BytesRef) through all terms for (long i = 0; i < numOrds; i++) { expected.seekExact(i); assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term())); assertEquals(expected.ord(), actual.ord()); assertEquals(expected.term(), actual.term()); } // random seekExact(ord) for (long i = 0; i < numOrds; i++) { long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1); expected.seekExact(randomOrd); actual.seekExact(randomOrd); assertEquals(expected.ord(), actual.ord()); assertEquals(expected.term(), actual.term()); } // random seekExact(BytesRef) for (long i = 0; i < numOrds; i++) { long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1); expected.seekExact(randomOrd); actual.seekExact(expected.term()); assertEquals(expected.ord(), actual.ord()); assertEquals(expected.term(), actual.term()); } // random seekCeil(BytesRef) for (long i = 0; i < numOrds; i++) { BytesRef target = new BytesRef(TestUtil.randomUnicodeString(random())); SeekStatus expectedStatus = expected.seekCeil(target); assertEquals(expectedStatus, actual.seekCeil(target)); if (expectedStatus != SeekStatus.END) { assertEquals(expected.ord(), 
actual.ord()); assertEquals(expected.term(), actual.term()); } } }
From source file:suonos.lucene.fields.IndexedFieldCountsBuilder.java
License:Apache License
public IndexedFieldCountsBuilder addField(String fieldName, String filter) throws IOException { final IndexedField fld = models.indexedField(fieldName); final Map<String, IndexedFieldTermCount> valuesMap = AntLib.newHashMap(); final TIntIntHashMap ordCounts = new TIntIntHashMap(); if (filter != null) { filter = filter.toLowerCase();//from w w w .j av a 2 s . c om } // Get count of segments. // int sz = ir.leaves().size(); for (int i = 0; i != sz; i++) { // Get the segment reader. // LeafReader lr = ir.leaves().get(i).reader(); // Doc count for field. Eg "album_genres" // lr.getDocCount(fld.getName()); // Get all documents that have the field "album_genres" // Bits docs = lr.getDocsWithField(fld.getName()); ordCounts.clear(); // Enumerate the field terms. // if (fld.isDocValues()) { if (fld.isMultiValue()) { // docvalues & multivalue is a SortedSetDocValues // Per-Document values in a SortedDocValues are // deduplicated, dereferenced, and sorted into a dictionary // of // unique values. A pointer to the dictionary value // (ordinal) can be retrieved for each document. // Ordinals are dense and in increasing sorted order. // SortedSetDocValues set = lr.getSortedSetDocValues(fld.getName()); if (set != null) { // For all documents that have the field "album_genres": // for (int docId = 0; docId != docs.length(); docId++) { if (docs.get(docId)) { // Enumerate the set of [terms] of // "album_genres" for the document represented // by docId. // Each ord represents the term value. // set.setDocument(docId); // For each term bump up the frequency. 
// long ord; while ((ord = set.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { ordCounts.adjustOrPutValue((int) ord, 1, 1); System.out.println("term=" + set.lookupOrd(ord).utf8ToString()); } } } TermsEnum te = set.termsEnum(); BytesRef term; while ((term = te.next()) != null) { int ord = (int) te.ord(); add(fld, valuesMap, filter, term, ordCounts.get(ord)); } } } else { SortedDocValues set = lr.getSortedDocValues(fld.getName()); if (set != null) { // For all documents that have the field "album_genres": // for (int docId = 0; docId != docs.length(); docId++) { if (docs.get(docId)) { // Get the term - Classical, Rock, etc. // BytesRef term = set.get(docId); add(fld, valuesMap, filter, term, 1); } } } } } else { // Normal field, not a doc value. // Terms terms = lr.terms(fld.getName()); TermsEnum te = terms.iterator(); BytesRef term; while ((term = te.next()) != null) { add(fld, valuesMap, filter, term, te.docFreq()); } } /* * SORTED doc[0] = "aardvark" doc[1] = "beaver" doc[2] = "aardvark" * * doc[0] = 0 doc[1] = 1 doc[2] = 0 * * term[0] = "aardvark" term[1] = "beaver" */ // http://127.0.0.1:8080/api/facets?fields=track_title_a // the above should return B:(4) because titles starting with B are // 4! } // Get the array of term counters. // IndexedFieldTermCount[] list = valuesMap.values().toArray(new IndexedFieldTermCount[0]); // Sort by term. // Arrays.sort(list); // add to the map. // this.fieldCounts.put(fld.getName(), list); return this; }