Usage examples for org.apache.lucene.index.TermsEnum#postings(PostingsEnum, int)
/**
 * Obtains a {@link PostingsEnum} from this terms enum.
 *
 * <p>NOTE(review): the declaration alone does not show the contract. The call sites in this
 * listing pass either {@code null} or a previously returned enum as {@code reuse}, and one of
 * the {@code PostingsEnum} constants (NONE, FREQS, POSITIONS, OFFSETS, PAYLOADS, ALL) as
 * {@code flags}; confirm the exact semantics against the Lucene Javadoc.
 *
 * @param reuse a previously returned enum to recycle, or {@code null} (as used by callers here)
 * @param flags which per-document/per-position data the caller wants (as used by callers here)
 * @return the postings enum
 * @throws IOException declared by the signature
 */
public abstract PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException;
From source file:com.github.flaxsearch.resources.PositionsResource.java
License:Apache License
/**
 * Returns the position data of one term within one document.
 *
 * <p>The term's postings are advanced to {@code docId}; when the postings do not land exactly on
 * that document a 404 is raised. Otherwise one {@code PositionData} is built per occurrence
 * reported by {@code freq()}.
 *
 * @param segment optional segment ordinal; only used for lookup and for the error message
 * @param field   field name
 * @param term    term text
 * @param docId   document to inspect
 * @return the document id together with its collected position data
 * @throws WebApplicationException (404) when the term has no posting for {@code docId}
 */
@GET
public DocTermData getDocTermData(@QueryParam("segment") Integer segment,
                                  @PathParam("field") String field,
                                  @PathParam("term") String term,
                                  @PathParam("docId") int docId) throws Exception {
  final TermsEnum termsEnum = readerManager.findTermPostings(segment, field, term);
  final PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);

  final boolean onRequestedDoc = postings.advance(docId) == docId;
  if (!onRequestedDoc) {
    final String segmentSuffix;
    if (segment == null) {
      segmentSuffix = "";
    } else {
      segmentSuffix = " in segment " + segment;
    }
    throw new WebApplicationException(
        String.format(Locale.ROOT, "No document %d%s in index", docId, segmentSuffix),
        Response.Status.NOT_FOUND);
  }

  // One PositionData per occurrence; presumably its constructor consumes the next
  // position from the enum - verify against PositionData.
  final List<PositionData> collected = new ArrayList<>();
  final int occurrences = postings.freq();
  for (int seen = 0; seen < occurrences; seen++) {
    collected.add(new PositionData(postings));
  }
  return new DocTermData(docId, collected);
}
From source file:com.github.flaxsearch.resources.PostingsResource.java
License:Apache License
@GET public TermData getPostings(@QueryParam("segment") Integer segment, @PathParam("field") String field, @PathParam("term") String term, @QueryParam("count") @DefaultValue("2147483647") int count) throws IOException { TermsEnum te = readerManager.findTermPostings(segment, field, term); Bits liveDocs = readerManager.getLiveDocs(segment); PostingsEnum pe = te.postings(null, PostingsEnum.NONE); int docFreq = te.docFreq(); long totalTermFreq = te.totalTermFreq(); int size = (docFreq < count) ? docFreq : count; int[] postings = new int[size]; int docId;//from ww w . j a va2 s . com int i = 0; while ((docId = pe.nextDoc()) != PostingsEnum.NO_MORE_DOCS && i < count) { if (liveDocs != null && liveDocs.get(docId) == false) continue; postings[i] = docId; i++; } return new TermData(term, docFreq, totalTermFreq, postings); }
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
/**
 * Indexes random line-file documents through a codec whose postings format for the field
 * {@code "body"} wraps the default format with a {@code FieldsConsumer} that, after delegating
 * the write, iterates the supplied {@code Fields} again (sequentially, via {@code seekExact}
 * and via {@code seekCeil}) and accumulates its own per-term and summed statistics.
 *
 * <p>After indexing, the accumulated {@code sumDocFreq}/{@code sumTotalTermFreq} and the
 * per-term {@code TermFreqs} are compared against what the final reader reports, and term
 * ordinals are checked when the terms enum supports them.
 */
@Override
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

  // Running totals gathered by the wrapping FieldsConsumer during flushes (not merges).
  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();

  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()

  iwc.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {

      // Unwrap per-field formats so we end up with the concrete format for this field.
      PostingsFormat p = getCodec().postingsFormat();
      if (p instanceof PerFieldPostingsFormat) {
        p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      if (p instanceof RocanaPerFieldPostingsFormat) {
        p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      final PostingsFormat defaultPostingsFormat = p;

      final Thread mainThread = Thread.currentThread();

      if (field.equals("body")) {

        // A PF that counts up some stats and then in
        // the end we verify the stats match what the
        // final IndexReader says, just to exercise the
        // new freedom of iterating the postings more
        // than once at flush/merge:

        return new PostingsFormat(defaultPostingsFormat.getName()) {

          @Override
          public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

            final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

            return new FieldsConsumer() {
              @Override
              public void write(Fields fields) throws IOException {
                // Let the real format write first, then re-iterate the same Fields below.
                fieldsConsumer.write(fields);

                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                // We only use one thread for flushing
                // in this test:
                assert isMerge || Thread.currentThread() == mainThread;

                // We iterate the provided TermsEnum
                // twice, so we exercise this new freedom
                // with the inverted API; if
                // addOnSecondPass is true, we add up
                // term stats on the 2nd iteration:
                boolean addOnSecondPass = random().nextBoolean();

                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                // Gather our own stats:
                Terms terms = fields.terms("body");
                assert terms != null;

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docs = null;
                // First pass: sequential iteration over every term.
                while (termsEnum.next() != null) {
                  BytesRef term = termsEnum.term();
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    // Read a random number (1..freq) of positions when positions were requested.
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }

                  String termString = term.utf8ToString();

                  // During merge we should only see terms
                  // we had already seen during a
                  // previous flush:
                  assertTrue(isMerge == false || termFreqs.containsKey(termString));

                  if (isMerge == false) {
                    if (addOnSecondPass == false) {
                      TermFreqs tf = termFreqs.get(termString);
                      if (tf == null) {
                        tf = new TermFreqs();
                        termFreqs.put(termString, tf);
                      }
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    } else if (termFreqs.containsKey(termString) == false) {
                      // Add placeholder (2nd pass will
                      // set its counts):
                      termFreqs.put(termString, new TermFreqs());
                    }
                  }
                }

                // Also test seeking the TermsEnum:
                for (String term : termFreqs.keySet()) {
                  if (termsEnum.seekExact(new BytesRef(term))) {
                    // TODO: also sometimes ask for payloads/offsets?
                    boolean noPositions = random().nextBoolean();
                    if (noPositions) {
                      docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                    } else {
                      docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                    }

                    int docFreq = 0;
                    long totalTermFreq = 0;
                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                      docFreq++;
                      totalTermFreq += docs.freq();
                      int limit = TestUtil.nextInt(random(), 1, docs.freq());
                      if (!noPositions) {
                        for (int i = 0; i < limit; i++) {
                          docs.nextPosition();
                        }
                      }
                    }

                    if (isMerge == false && addOnSecondPass) {
                      TermFreqs tf = termFreqs.get(term);
                      assert tf != null;
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    }

                    //System.out.println(" term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                  }
                }

                // Also test seekCeil
                for (int iter = 0; iter < 10; iter++) {
                  BytesRef term = new BytesRef(
                      TestUtil.randomRealisticUnicodeString(random()));
                  SeekStatus status = termsEnum.seekCeil(term);
                  if (status == SeekStatus.NOT_FOUND) {
                    // The enum must now be positioned on a term that sorts after the target.
                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                  }
                }
              }

              @Override
              public void close() throws IOException {
                fieldsConsumer.close();
              }
            };
          }

          @Override
          public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
            return defaultPostingsFormat.fieldsProducer(state);
          }
        };
      } else {
        return defaultPostingsFormat;
      }
    }
  });

  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    w.addDocument(doc);
    bytesIndexed += RamUsageTester.sizeOf(doc);
  }

  IndexReader r = w.getReader();
  w.close();

  // Compare the totals gathered at flush time with what the reader reports.
  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        // Stop asking for ordinals once the enum says it does not support them.
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);

  r.close();
  dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/** * checks the terms enum sequentially//from www . j a v a2s . c om * if deep is false, it does a 'shallow' test that doesnt go down to the docsenums */ public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep, boolean hasPositions) throws Exception { BytesRef term; PostingsEnum leftPositions = null; PostingsEnum rightPositions = null; PostingsEnum leftDocs = null; PostingsEnum rightDocs = null; while ((term = leftTermsEnum.next()) != null) { assertEquals(term, rightTermsEnum.next()); assertTermStats(leftTermsEnum, rightTermsEnum); if (deep) { if (hasPositions) { // with payloads + off assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL), rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL)); assertPositionsSkipping(leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL), rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL)); // with payloads only assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS), rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS)); assertPositionsSkipping(leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS), rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS)); // with offsets only assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS), rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS)); assertPositionsSkipping(leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS), rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS)); // with positions only assertDocsAndPositionsEnum( leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS), rightPositions = 
rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS)); assertPositionsSkipping(leftTermsEnum.docFreq(), leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS), rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS)); } // with freqs: assertDocsEnum(leftDocs = leftTermsEnum.postings(leftDocs), rightDocs = rightTermsEnum.postings(rightDocs)); // w/o freqs: assertDocsEnum(leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE), rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE)); // with freqs: assertDocsSkipping(leftTermsEnum.docFreq(), leftDocs = leftTermsEnum.postings(leftDocs), rightDocs = rightTermsEnum.postings(rightDocs)); // w/o freqs: assertDocsSkipping(leftTermsEnum.docFreq(), leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE), rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE)); } } assertNull(rightTermsEnum.next()); }
From source file:com.shaie.utils.IndexUtils.java
License:Apache License
/** Prints the terms indexed under the given fields with full postings information. */ public static void printFieldTermsWithInfo(LeafReader reader, String... fields) throws IOException { for (final String field : fields) { System.out.println(format("Terms for field [%s], with positional info:", field)); final TermsEnum te = reader.terms(field).iterator(); BytesRef scratch;//from w w w.j a v a2 s . co m PostingsEnum postings = null; while ((scratch = te.next()) != null) { System.out.println(format(" %s", scratch.utf8ToString())); postings = te.postings(postings, PostingsEnum.ALL); for (postings.nextDoc(); postings.docID() != DocIdSetIterator.NO_MORE_DOCS; postings.nextDoc()) { final Map<Integer, BytesRef> positions = Maps.newTreeMap(); boolean addedPayload = false; for (int i = 0; i < postings.freq(); i++) { final int pos = postings.nextPosition(); final BytesRef payload = postings.getPayload(); if (payload != null) { positions.put(pos, BytesRef.deepCopyOf(payload)); addedPayload = true; } else { positions.put(pos, null); } } if (addedPayload) { System.out.println( format(" doc=%d, freq=%d", postings.docID(), postings.freq(), positions)); for (final Entry<Integer, BytesRef> e : positions.entrySet()) { System.out.println(format(" pos=%d, payload=%s", e.getKey(), e.getValue())); } } else { System.out.println(format(" doc=%d, freq=%d, pos=%s", postings.docID(), postings.freq(), positions.keySet())); } } } } }
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException { TermsEnum termsEnum = terms.iterator(); if (termsEnum.next() == null) { // Ran off the end of the terms enum without finding any live docs with that field in them. return null; }/*from w w w . j av a 2s. c om*/ PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE); final Bits liveDocs = reader.getLiveDocs(); if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS || (liveDocs != null && liveDocs.get(postingsEnum.docID()))) { return null; } return reader.document(postingsEnum.docID()); }
From source file:org.apache.solr.search.DocSetBuilder.java
License:Apache License
/** Returns the number of terms visited */ public int add(TermsEnum te, int base) throws IOException { PostingsEnum postings = null;//ww w.j a v a 2 s . c o m int termCount = 0; for (;;) { BytesRef term = te.next(); if (term == null) break; termCount++; postings = te.postings(postings, PostingsEnum.NONE); add(postings, base); } return termCount; }
From source file:org.apache.solr.search.DocSetUtil.java
License:Apache License
public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException { DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead int maxDoc = searcher.getIndexReader().maxDoc(); int smallSetSize = smallSetSize(maxDoc); String field = term.field();// w ww . j a v a 2 s . com BytesRef termVal = term.bytes(); int maxCount = 0; int firstReader = -1; List<LeafReaderContext> leaves = reader.leaves(); PostingsEnum[] postList = new PostingsEnum[leaves.size()]; // use array for slightly higher scanning cost, but fewer memory allocations for (LeafReaderContext ctx : leaves) { assert leaves.get(ctx.ord) == ctx; LeafReader r = ctx.reader(); Fields f = r.fields(); Terms t = f.terms(field); if (t == null) continue; // field is missing TermsEnum te = t.iterator(); if (te.seekExact(termVal)) { maxCount += te.docFreq(); postList[ctx.ord] = te.postings(null, PostingsEnum.NONE); if (firstReader < 0) firstReader = ctx.ord; } } DocSet answer = null; if (maxCount == 0) { answer = DocSet.EMPTY; } else if (maxCount <= smallSetSize) { answer = createSmallSet(leaves, postList, maxCount, firstReader); } else { answer = createBigSet(leaves, postList, maxDoc, firstReader); } return DocSetUtil.getDocSet(answer, searcher); }
From source file:org.apache.solr.search.FloatPayloadValueSource.java
License:Apache License
/**
 * Returns per-document float values derived from the payloads of the term
 * {@code indexedBytes} in {@code indexedField}.
 *
 * <p>For a document the term does not occur in, or in which no position carries a payload, the
 * value of {@code defaultValueSource} is returned. Otherwise each payload is decoded with
 * {@code decoder}; when {@code payloadFunction} is {@code null} the first decoded payload is
 * returned, else the payloads are folded through {@code payloadFunction.currentScore} and
 * finished with {@code payloadFunction.docScore}.
 *
 * @param context       passed through to the default value source
 * @param readerContext the leaf whose postings are read
 * @return the function values for this leaf
 * @throws IOException if the postings cannot be opened
 */
@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
  Fields fields = readerContext.reader().fields();
  final Terms terms = fields.terms(indexedField);

  FunctionValues defaultValues = defaultValueSource.getValues(context, readerContext);

  // copied the bulk of this from TFValueSource - TODO: this is a very repeated pattern - base-class this advance logic stuff?
  return new FloatDocValues(this) {
    PostingsEnum docs; // postings of the term, or a dummy enum when the term is absent
    int atDoc; // document the postings are currently positioned on (-1 right after reset)
    int lastDocRequested = -1; // last doc passed to floatVal, used to detect out-of-order access

    // Instance initializer: position the postings when the values object is created.
    {
      reset();
    }

    /** (Re)opens the postings of the term from the start and clears the current position. */
    public void reset() throws IOException {
      // no one should call us for deleted docs?

      if (terms != null) {
        final TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(indexedBytes)) {
          docs = termsEnum.postings(null, PostingsEnum.ALL);
        } else {
          docs = null;
        }
      } else {
        docs = null;
      }

      if (docs == null) {
        // dummy PostingsEnum so floatVal() can work
        // when would this be called? if field/val did not match? this is called for every doc? create once and cache?
        docs = new PostingsEnum() {
          @Override
          public int freq() {
            return 0;
          }

          @Override
          public int nextPosition() throws IOException {
            return -1;
          }

          @Override
          public int startOffset() throws IOException {
            return -1;
          }

          @Override
          public int endOffset() throws IOException {
            return -1;
          }

          @Override
          public BytesRef getPayload() throws IOException {
            return null;
          }

          @Override
          public int docID() {
            return DocIdSetIterator.NO_MORE_DOCS;
          }

          @Override
          public int nextDoc() {
            return DocIdSetIterator.NO_MORE_DOCS;
          }

          @Override
          public int advance(int target) {
            return DocIdSetIterator.NO_MORE_DOCS;
          }

          @Override
          public long cost() {
            return 0;
          }
        };
      }
      atDoc = -1;
    }

    @Override
    public float floatVal(int doc) {
      try {
        if (doc < lastDocRequested) {
          // out-of-order access.... reset
          reset();
        }
        lastDocRequested = doc;

        // Only move forward; if we are already at or past doc there is nothing to advance.
        if (atDoc < doc) {
          atDoc = docs.advance(doc);
        }

        if (atDoc > doc) {
          // term doesn't match this document... either because we hit the
          // end, or because the next doc is after this doc.
          return defaultValues.floatVal(doc);
        }

        // a match!
        int freq = docs.freq();
        int numPayloadsSeen = 0;
        float currentScore = 0;
        for (int i = 0; i < freq; i++) {
          docs.nextPosition();
          BytesRef payload = docs.getPayload();
          if (payload != null) {
            float payloadVal = decoder.decode(atDoc, docs.startOffset(), docs.endOffset(), payload);

            // payloadFunction = null represents "first"
            if (payloadFunction == null)
              return payloadVal;

            currentScore = payloadFunction.currentScore(doc, indexedField, docs.startOffset(),
                docs.endOffset(), numPayloadsSeen, currentScore, payloadVal);
            numPayloadsSeen++;
          }
        }

        // payloadFunction is non-null whenever numPayloadsSeen > 0 (the null case returned above).
        return (numPayloadsSeen > 0)
            ? payloadFunction.docScore(doc, indexedField, numPayloadsSeen, currentScore)
            : defaultValues.floatVal(doc);
      } catch (IOException e) {
        throw new RuntimeException("caught exception in function " + description() + " : doc=" + doc, e);
      }
    }
  };
}
From source file:org.apache.solr.uninverting.DocTermOrds.java
License:Apache License
/**
 * Call this only once (if you subclass!)
 *
 * <p>Un-inverts {@code field}: walks the terms (optionally restricted to those starting with
 * {@code termPrefix}) and records, per document, the delta-encoded list of term numbers.
 * Phase one builds an intermediate form in the local {@code index}/{@code bytes} arrays; phase
 * two copies the per-document byte arrays into the shared {@code tnums} arrays. Every term whose
 * number has no bits in common with {@code indexIntervalMask} is additionally stored in the
 * term index ({@code indexedTermsArray}).
 *
 * <p>Returns early, leaving the fields assigned after that point untouched, when the field has
 * no terms or no term at or after the seek start.
 *
 * @param reader     reader to un-invert
 * @param liveDocs   not referenced in this method body
 * @param termPrefix when non-null, only terms starting with this prefix are visited
 * @throws IllegalStateException if {@code checkForDocValues} is set and the field has doc
 *         values, or if a {@code tnums} array would need more than 24 bits of offset
 * @throws IOException if reading terms or postings fails
 */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException(
        "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  }
  //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
  final long startTime = System.nanoTime();
  // Keep a private copy of the prefix; the caller's BytesRef is not retained.
  prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

  final int maxDoc = reader.maxDoc();
  final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
  final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
  final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

  final Terms terms = reader.terms(field);
  if (terms == null) {
    // No terms
    return;
  }

  final TermsEnum te = terms.iterator();
  final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
  //System.out.println("seekStart=" + seekStart.utf8ToString());
  if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
    // No terms match
    return;
  }

  // For our "term index wrapper"
  final List<BytesRef> indexedTerms = new ArrayList<>();
  final PagedBytes indexedTermsBytes = new PagedBytes(15);

  // we need a minimum of 9 bytes, but round up to 12 since the space would
  // be wasted with most allocators anyway.
  byte[] tempArr = new byte[12];

  //
  // enumerate all terms, and build an intermediate form of the un-inverted field.
  //
  // During this intermediate form, every document has a (potential) byte[]
  // and the int[maxDoc()] array either contains the termNumber list directly
  // or the *end* offset of the termNumber list in its byte array (for faster
  // appending and faster creation of the final form).
  //
  // idea... if things are too large while building, we could do a range of docs
  // at a time (but it would be a fair amount slower to build)
  // could also do ranges in parallel to take advantage of multiple CPUs

  // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
  // values. This requires going over the field first to find the most
  // frequent terms ahead of time.

  int termNum = 0;
  postingsEnum = null;

  // Loop begins with te positioned to first term (we call
  // seek above):
  for (;;) {
    final BytesRef t = te.term();
    if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
      break;
    }
    //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

    visitTerm(te, termNum);

    if ((termNum & indexIntervalMask) == 0) {
      // Index this term
      sizeOfIndexedStrings += t.length;
      BytesRef indexedTerm = new BytesRef();
      indexedTermsBytes.copy(t, indexedTerm);
      // TODO: really should 1) strip off useless suffix,
      // and 2) use FST not array/PagedBytes
      indexedTerms.add(indexedTerm);
    }

    final int df = te.docFreq();
    // Terms whose docFreq exceeds maxTermDocFreq are not un-inverted here.
    if (df <= maxTermDocFreq) {

      postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

      // dF, but takes deletions into account
      int actualDF = 0;

      for (;;) {
        int doc = postingsEnum.nextDoc();
        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        //System.out.println(" chunk=" + chunk + " docs");

        actualDF++;
        termInstances++;

        //System.out.println(" docID=" + doc);
        // add TNUM_OFFSET to the term number to make room for special reserved values:
        // 0 (end term) and 1 (index into byte array follows)
        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
        lastTerm[doc] = termNum;
        int val = index[doc];

        if ((val & 0xff) == 1) {
          // index into byte array (actually the end of
          // the doc-specific byte[] when building)
          int pos = val >>> 8;
          int ilen = vIntSize(delta);
          byte[] arr = bytes[doc];
          int newend = pos + ilen;
          if (newend > arr.length) {
            // We avoid a doubling strategy to lower memory usage.
            // this faceting method isn't for docs with many terms.
            // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
            // TODO: figure out what array lengths we can round up to w/o actually using more memory
            // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
            // It should be safe to round up to the nearest 32 bits in any case.
            int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
            byte[] newarr = new byte[newLen];
            System.arraycopy(arr, 0, newarr, 0, pos);
            arr = newarr;
            bytes[doc] = newarr;
          }
          pos = writeInt(delta, arr, pos);
          index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
        } else {
          // OK, this int has data in it... find the end (a zero starting byte - not
          // part of another number, hence not following a byte with the high bit set).
          int ipos;
          if (val == 0) {
            ipos = 0;
          } else if ((val & 0x0000ff80) == 0) {
            ipos = 1;
          } else if ((val & 0x00ff8000) == 0) {
            ipos = 2;
          } else if ((val & 0xff800000) == 0) {
            ipos = 3;
          } else {
            ipos = 4;
          }

          //System.out.println(" ipos=" + ipos);

          int endPos = writeInt(delta, tempArr, ipos);
          //System.out.println(" endpos=" + endPos);
          if (endPos <= 4) {
            //System.out.println(" fits!");
            // value will fit in the integer... move bytes back
            for (int j = ipos; j < endPos; j++) {
              val |= (tempArr[j] & 0xff) << (j << 3);
            }
            index[doc] = val;
          } else {
            // value won't fit... move integer into byte[]
            for (int j = 0; j < ipos; j++) {
              tempArr[j] = (byte) val;
              val >>>= 8;
            }
            // point at the end index in the byte[]
            index[doc] = (endPos << 8) | 1;
            // tempArr now belongs to this doc; allocate a fresh scratch array.
            bytes[doc] = tempArr;
            tempArr = new byte[12];
          }
        }
      }
      setActualDocFreq(termNum, actualDF);
    }

    termNum++;
    if (te.next() == null) {
      break;
    }
  }

  numTermsInField = termNum;

  long midPoint = System.nanoTime();

  if (termInstances == 0) {
    // we didn't invert anything
    // lower memory consumption.
    tnums = null;
  } else {

    this.index = index;

    //
    // transform intermediate form into the final form, building a single byte[]
    // at a time, and releasing the intermediate byte[]s as we go to avoid
    // increasing the memory footprint.
    //

    for (int pass = 0; pass < 256; pass++) {
      byte[] target = tnums[pass];
      int pos = 0; // end in target;
      if (target != null) {
        pos = target.length;
      } else {
        target = new byte[4096];
      }

      // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
      // where pp is the pass (which array we are building), and xx is all values.
      // each pass shares the same byte[] for termNumber lists.
      for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
        int lim = Math.min(docbase + (1 << 16), maxDoc);
        for (int doc = docbase; doc < lim; doc++) {
          //System.out.println(" pass=" + pass + " process docID=" + doc);
          int val = index[doc];
          if ((val & 0xff) == 1) {
            int len = val >>> 8;
            //System.out.println(" ptr pos=" + pos);
            index[doc] = (pos << 8) | 1; // change index to point to start of array
            if ((pos & 0xff000000) != 0) {
              // we only have 24 bits for the array index
              throw new IllegalStateException(
                  "Too many values for UnInvertedField faceting on field " + field);
            }
            byte[] arr = bytes[doc];
            /*
            for(byte b : arr) {
              //System.out.println(" b=" + Integer.toHexString((int) b));
            }
            */
            bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
            if (target.length <= pos + len) {
              int newlen = target.length;
              /*** we don't have to worry about the array getting too large
               * since the "pos" param will overflow first (only 24 bits available)
              if ((newlen<<1) <= 0) {
                // overflow...
                newlen = Integer.MAX_VALUE;
                if (newlen <= pos + len) {
                  throw new SolrException(400,"Too many terms to uninvert field!");
                }
              } else {
                while (newlen <= pos + len) newlen<<=1; // doubling strategy
              }
              ****/
              while (newlen <= pos + len)
                newlen <<= 1; // doubling strategy
              byte[] newtarget = new byte[newlen];
              System.arraycopy(target, 0, newtarget, 0, pos);
              target = newtarget;
            }
            System.arraycopy(arr, 0, target, pos, len);
            pos += len + 1; // skip single byte at end and leave it 0 for terminator
          }
        }
      }

      // shrink array
      if (pos < target.length) {
        byte[] newtarget = new byte[pos];
        System.arraycopy(target, 0, newtarget, 0, pos);
        target = newtarget;
      }

      tnums[pass] = target;

      // No later pass can start below maxDoc, so stop early.
      if ((pass << 16) > maxDoc)
        break;
    }

  }
  indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

  long endTime = System.nanoTime();

  // Timings in milliseconds: whole call, and phase one (up to midPoint) only.
  total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
  phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}