Usage examples for org.apache.lucene.index.LeafReader#getFieldInfos()
public abstract FieldInfos getFieldInfos();
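Before the per-project examples, here is a minimal, self-contained sketch of the call itself; the index path and the printed output are illustrative assumptions, not taken from any example below. FieldInfos is Iterable&lt;FieldInfo&gt;, so the per-field metadata can be walked directly.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class FieldInfosDump {
  public static void main(String[] args) throws Exception {
    // "/path/to/index" is a placeholder; point it at a real Lucene index.
    try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
         DirectoryReader reader = DirectoryReader.open(dir)) {
      for (LeafReaderContext ctx : reader.leaves()) {
        LeafReader leaf = ctx.reader();
        // getFieldInfos() describes every field present in this segment.
        for (FieldInfo fi : leaf.getFieldInfos()) {
          System.out.println(fi.name
              + " docValues=" + fi.getDocValuesType()
              + " indexOptions=" + fi.getIndexOptions());
        }
      }
    }
  }
}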
From source file:org.apache.solr.uninverting.DocTermOrds.java
License:Apache License
/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException(
        "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  }
  //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
  final long startTime = System.nanoTime();
  prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

  final int maxDoc = reader.maxDoc();
  final int[] index = new int[maxDoc];      // immediate term numbers, or the index into the byte[] representing the last number
  final int[] lastTerm = new int[maxDoc];   // last term we saw for this document
  final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

  final Terms terms = reader.terms(field);
  if (terms == null) {
    // No terms
    return;
  }

  final TermsEnum te = terms.iterator();
  final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
  //System.out.println("seekStart=" + seekStart.utf8ToString());
  if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
    // No terms match
    return;
  }

  // For our "term index wrapper"
  final List<BytesRef> indexedTerms = new ArrayList<>();
  final PagedBytes indexedTermsBytes = new PagedBytes(15);

  // we need a minimum of 9 bytes, but round up to 12 since the space would
  // be wasted with most allocators anyway.
  byte[] tempArr = new byte[12];

  //
  // enumerate all terms, and build an intermediate form of the un-inverted field.
  //
  // During this intermediate form, every document has a (potential) byte[]
  // and the int[maxDoc()] array either contains the termNumber list directly
  // or the *end* offset of the termNumber list in its byte array (for faster
  // appending and faster creation of the final form).
  //
  // idea... if things are too large while building, we could do a range of docs
  // at a time (but it would be a fair amount slower to build)
  // could also do ranges in parallel to take advantage of multiple CPUs

  // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
  // values.  This requires going over the field first to find the most
  // frequent terms ahead of time.

  int termNum = 0;
  postingsEnum = null;

  // Loop begins with te positioned to first term (we call
  // seek above):
  for (;;) {
    final BytesRef t = te.term();
    if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
      break;
    }
    //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
    visitTerm(te, termNum);

    if ((termNum & indexIntervalMask) == 0) {
      // Index this term
      sizeOfIndexedStrings += t.length;
      BytesRef indexedTerm = new BytesRef();
      indexedTermsBytes.copy(t, indexedTerm);
      // TODO: really should 1) strip off useless suffix,
      // and 2) use FST not array/PagedBytes
      indexedTerms.add(indexedTerm);
    }

    final int df = te.docFreq();
    if (df <= maxTermDocFreq) {
      postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

      // dF, but takes deletions into account
      int actualDF = 0;

      for (;;) {
        int doc = postingsEnum.nextDoc();
        if (doc == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        //System.out.println("  chunk=" + chunk + " docs");
        actualDF++;
        termInstances++;

        //System.out.println("    docID=" + doc);
        // add TNUM_OFFSET to the term number to make room for special reserved values:
        // 0 (end term) and 1 (index into byte array follows)
        int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
        lastTerm[doc] = termNum;
        int val = index[doc];

        if ((val & 0xff) == 1) {
          // index into byte array (actually the end of
          // the doc-specific byte[] when building)
          int pos = val >>> 8;
          int ilen = vIntSize(delta);
          byte[] arr = bytes[doc];
          int newend = pos + ilen;
          if (newend > arr.length) {
            // We avoid a doubling strategy to lower memory usage.
            // this faceting method isn't for docs with many terms.
            // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
            // TODO: figure out what array lengths we can round up to w/o actually using more memory
            // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
            // It should be safe to round up to the nearest 32 bits in any case.
            int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
            byte[] newarr = new byte[newLen];
            System.arraycopy(arr, 0, newarr, 0, pos);
            arr = newarr;
            bytes[doc] = newarr;
          }
          pos = writeInt(delta, arr, pos);
          index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
        } else {
          // OK, this int has data in it... find the end (a zero starting byte - not
          // part of another number, hence not following a byte with the high bit set).
          int ipos;
          if (val == 0) {
            ipos = 0;
          } else if ((val & 0x0000ff80) == 0) {
            ipos = 1;
          } else if ((val & 0x00ff8000) == 0) {
            ipos = 2;
          } else if ((val & 0xff800000) == 0) {
            ipos = 3;
          } else {
            ipos = 4;
          }

          //System.out.println("      ipos=" + ipos);

          int endPos = writeInt(delta, tempArr, ipos);
          //System.out.println("      endpos=" + endPos);
          if (endPos <= 4) {
            //System.out.println("      fits!");
            // value will fit in the integer... move bytes back
            for (int j = ipos; j < endPos; j++) {
              val |= (tempArr[j] & 0xff) << (j << 3);
            }
            index[doc] = val;
          } else {
            // value won't fit... move integer into byte[]
            for (int j = 0; j < ipos; j++) {
              tempArr[j] = (byte) val;
              val >>>= 8;
            }
            // point at the end index in the byte[]
            index[doc] = (endPos << 8) | 1;
            bytes[doc] = tempArr;
            tempArr = new byte[12];
          }
        }
      }
      setActualDocFreq(termNum, actualDF);
    }

    termNum++;
    if (te.next() == null) {
      break;
    }
  }

  numTermsInField = termNum;

  long midPoint = System.nanoTime();

  if (termInstances == 0) {
    // we didn't invert anything
    // lower memory consumption.
    tnums = null;
  } else {
    this.index = index;

    //
    // transform intermediate form into the final form, building a single byte[]
    // at a time, and releasing the intermediate byte[]s as we go to avoid
    // increasing the memory footprint.
    //
    for (int pass = 0; pass < 256; pass++) {
      byte[] target = tnums[pass];
      int pos = 0; // end in target;
      if (target != null) {
        pos = target.length;
      } else {
        target = new byte[4096];
      }

      // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
      // where pp is the pass (which array we are building), and xx is all values.
      // each pass shares the same byte[] for termNumber lists.
      for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
        int lim = Math.min(docbase + (1 << 16), maxDoc);
        for (int doc = docbase; doc < lim; doc++) {
          //System.out.println("  pass=" + pass + " process docID=" + doc);
          int val = index[doc];
          if ((val & 0xff) == 1) {
            int len = val >>> 8;
            //System.out.println("    ptr pos=" + pos);
            index[doc] = (pos << 8) | 1; // change index to point to start of array
            if ((pos & 0xff000000) != 0) {
              // we only have 24 bits for the array index
              throw new IllegalStateException(
                  "Too many values for UnInvertedField faceting on field " + field);
            }
            byte[] arr = bytes[doc];
            /*
            for(byte b : arr) {
              //System.out.println("      b=" + Integer.toHexString((int) b));
            }
            */
            bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
            if (target.length <= pos + len) {
              int newlen = target.length;
              /*** we don't have to worry about the array getting too large
               * since the "pos" param will overflow first (only 24 bits available)
              if ((newlen<<1) <= 0) {
                // overflow...
                newlen = Integer.MAX_VALUE;
                if (newlen <= pos + len) {
                  throw new SolrException(400,"Too many terms to uninvert field!");
                }
              } else {
                while (newlen <= pos + len) newlen<<=1;  // doubling strategy
              }
              ****/
              while (newlen <= pos + len) newlen <<= 1; // doubling strategy
              byte[] newtarget = new byte[newlen];
              System.arraycopy(target, 0, newtarget, 0, pos);
              target = newtarget;
            }
            System.arraycopy(arr, 0, target, pos, len);
            pos += len + 1; // skip single byte at end and leave it 0 for terminator
          }
        }
      }

      // shrink array
      if (pos < target.length) {
        byte[] newtarget = new byte[pos];
        System.arraycopy(target, 0, newtarget, 0, pos);
        target = newtarget;
      }

      tnums[pass] = target;

      if ((pass << 16) > maxDoc) break;
    }
  }
  indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

  long endTime = System.nanoTime();

  total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
  phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}
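The delta encoding above relies on two helpers, vIntSize and writeInt, that this excerpt does not show. Below is a hedged sketch consistent with the convention the comments describe (continuation bytes carry the high bit; a byte with the high bit clear ends a value); treat the bodies as an assumption and consult DocTermOrds itself for the authoritative versions.

// Sketch, not from the excerpt: number of bytes a variable-length int
// occupies at 7 payload bits per byte.
private static int vIntSize(int x) {
  if ((x & (0xffffffff << (7 * 1))) == 0) return 1;
  if ((x & (0xffffffff << (7 * 2))) == 0) return 2;
  if ((x & (0xffffffff << (7 * 3))) == 0) return 3;
  if ((x & (0xffffffff << (7 * 4))) == 0) return 4;
  return 5;
}

// Sketch: write x at arr[pos], most significant 7-bit group first. Every byte
// but the last sets the high bit, so a byte with the high bit clear terminates
// a value -- matching the end-scan logic in uninvert() above.
private static int writeInt(int x, byte[] arr, int pos) {
  int a;
  a = (x >>> (7 * 4));
  if (a != 0) arr[pos++] = (byte) (a | 0x80);
  a = (x >>> (7 * 3));
  if (a != 0) arr[pos++] = (byte) (a | 0x80);
  a = (x >>> (7 * 2));
  if (a != 0) arr[pos++] = (byte) (a | 0x80);
  a = (x >>> (7 * 1));
  if (a != 0) arr[pos++] = (byte) (a | 0x80);
  arr[pos++] = (byte) (x & 0x7f);
  return pos;
}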
From source file:org.apache.solr.uninverting.FieldCacheImpl.java
License:Apache License
@Override
public Bits getDocsWithField(LeafReader reader, String field, Parser parser) throws IOException {
  final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
  if (fieldInfo == null) {
    // field does not exist or has no value
    return new Bits.MatchNoBits(reader.maxDoc());
  }

  if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
    // doc values case
  } else if (parser instanceof PointParser) {
    // points case
  } else {
    // postings case
    if (fieldInfo.getIndexOptions() == IndexOptions.NONE) {
      return new Bits.MatchNoBits(reader.maxDoc());
    }
  }
  BitsEntry bitsEntry = (BitsEntry) caches.get(DocsWithFieldCache.class).get(reader, new CacheKey(field, parser));
  return bitsEntry.bits;
}
From source file:org.apache.solr.uninverting.FieldCacheImpl.java
License:Apache License
@Override
public NumericDocValues getNumerics(LeafReader reader, String field, Parser parser) throws IOException {
  if (parser == null) {
    throw new NullPointerException();
  }
  final NumericDocValues valuesIn = reader.getNumericDocValues(field);
  if (valuesIn != null) {
    return valuesIn;
  } else {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
      return DocValues.emptyNumeric();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    if (parser instanceof PointParser) {
      // points case
      // no points in this segment
      if (info.getPointDimensionCount() == 0) {
        return DocValues.emptyNumeric();
      }
      if (info.getPointDimensionCount() != 1) {
        throw new IllegalStateException("Type mismatch: " + field + " was indexed with dimensions="
            + info.getPointDimensionCount());
      }
      PointValues values = reader.getPointValues(field);
      // no actual points for this field (e.g. all points deleted)
      if (values == null || values.size() == 0) {
        return DocValues.emptyNumeric();
      }
      // not single-valued
      if (values.size() != values.getDocCount()) {
        throw new IllegalStateException("Type mismatch: " + field + " was indexed with multiple values, numValues="
            + values.size() + ",numDocs=" + values.getDocCount());
      }
    } else {
      // postings case
      // not indexed
      if (info.getIndexOptions() == IndexOptions.NONE) {
        return DocValues.emptyNumeric();
      }
    }
    return ((LongsFromArray) caches.get(Long.TYPE).get(reader, new CacheKey(field, parser))).iterator();
  }
}
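A hedged usage sketch for this method: it assumes FieldCache.DEFAULT and FieldCache.INT_POINT_PARSER exist in org.apache.solr.uninverting (they do in the Solr line this code comes from, but verify against your version), and that "price" is a hypothetical single-valued int point field.

// Sketch under assumptions noted above; leaf is a LeafReader for one segment.
NumericDocValues prices = FieldCache.DEFAULT.getNumerics(leaf, "price", FieldCache.INT_POINT_PARSER);
int doc;
// NumericDocValues is iterator-style in this Lucene version: advance per doc.
while ((doc = prices.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  System.out.println("doc=" + doc + " price=" + prices.longValue());
}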
From source file:org.apache.solr.uninverting.FieldCacheImpl.java
License:Apache License
public SortedDocValues getTermsIndex(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
  SortedDocValues valuesIn = reader.getSortedDocValues(field);
  if (valuesIn != null) {
    // Not cached here by FieldCacheImpl (cached instead
    // per-thread by SegmentReader):
    return valuesIn;
  } else {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
      return DocValues.emptySorted();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
      // we don't try to build a sorted instance from numeric/binary doc
      // values because dedup can be very costly
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
      return DocValues.emptySorted();
    }
    SortedDocValuesImpl impl = (SortedDocValuesImpl) caches.get(SortedDocValues.class).get(reader,
        new CacheKey(field, acceptableOverheadRatio));
    return impl.iterator();
  }
}
From source file:org.apache.solr.uninverting.FieldCacheImpl.java
License:Apache License
public BinaryDocValues getTerms(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
  BinaryDocValues valuesIn = reader.getBinaryDocValues(field);
  if (valuesIn == null) {
    valuesIn = reader.getSortedDocValues(field);
  }

  if (valuesIn != null) {
    // Not cached here by FieldCacheImpl (cached instead
    // per-thread by SegmentReader):
    return valuesIn;
  }

  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (info == null) {
    return DocValues.emptyBinary();
  } else if (info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException(
        "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  } else if (info.getIndexOptions() == IndexOptions.NONE) {
    return DocValues.emptyBinary();
  }

  BinaryDocValuesImpl impl = (BinaryDocValuesImpl) caches.get(BinaryDocValues.class).get(reader,
      new CacheKey(field, acceptableOverheadRatio));
  return impl.iterator();
}
From source file:org.apache.solr.uninverting.FieldCacheImpl.java
License:Apache License
public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
  // not a general purpose filtering mechanism...
  assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;

  SortedSetDocValues dv = reader.getSortedSetDocValues(field);
  if (dv != null) {
    return dv;
  }

  SortedDocValues sdv = reader.getSortedDocValues(field);
  if (sdv != null) {
    return DocValues.singleton(sdv);
  }

  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (info == null) {
    return DocValues.emptySortedSet();
  } else if (info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException(
        "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  } else if (info.getIndexOptions() == IndexOptions.NONE) {
    return DocValues.emptySortedSet();
  }

  // ok we need to uninvert. check if we can optimize a bit.
  Terms terms = reader.terms(field);
  if (terms == null) {
    return DocValues.emptySortedSet();
  } else {
    // if #postings = #docswithfield we know that the field is "single valued enough".
    // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
    // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
    long numPostings = terms.getSumDocFreq();
    if (numPostings != -1 && numPostings == terms.getDocCount()) {
      return DocValues.singleton(getTermsIndex(reader, field));
    }
  }

  DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
  return dto.iterator(reader);
}
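A hedged usage sketch: uninverting a hypothetical multi-valued string field "tags" into SortedSetDocValues and listing the terms of one document. FieldCache.DEFAULT is assumed as the entry point, and docID 42 is arbitrary.

// Sketch under assumptions noted above; leaf is a LeafReader for one segment.
SortedSetDocValues ords = FieldCache.DEFAULT.getDocTermOrds(leaf, "tags", null);
if (ords.advanceExact(42)) { // position on docID 42 if it has values
  long ord;
  while ((ord = ords.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
    // lookupOrd() maps an ordinal back to the original term bytes.
    System.out.println(ords.lookupOrd(ord).utf8ToString());
  }
}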
From source file:org.apache.solr.uninverting.TestUninvertingReader.java
License:Apache License
public void testFieldInfos() throws IOException {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));

  Document doc = new Document();
  BytesRef idBytes = new BytesRef("id");
  doc.add(new StringField("id", idBytes, Store.YES));
  doc.add(new LegacyIntField("int", 5, Store.YES));
  doc.add(new NumericDocValuesField("dv", 5));
  doc.add(new IntPoint("dint", 5));
  doc.add(new StoredField("stored", 5)); // not indexed
  iw.addDocument(doc);
  iw.forceMerge(1);
  iw.close();

  Map<String, Type> uninvertingMap = new HashMap<>();
  uninvertingMap.put("int", Type.LEGACY_INTEGER);
  uninvertingMap.put("dv", Type.LEGACY_INTEGER);
  uninvertingMap.put("dint", Type.INTEGER_POINT);

  DirectoryReader ir = UninvertingReader.wrap(DirectoryReader.open(dir), uninvertingMap);
  LeafReader leafReader = ir.leaves().get(0).reader();

  FieldInfo intFInfo = leafReader.getFieldInfos().fieldInfo("int");
  assertEquals(DocValuesType.NUMERIC, intFInfo.getDocValuesType());
  assertEquals(0, intFInfo.getPointDimensionCount());
  assertEquals(0, intFInfo.getPointNumBytes());

  FieldInfo dintFInfo = leafReader.getFieldInfos().fieldInfo("dint");
  assertEquals(DocValuesType.NUMERIC, dintFInfo.getDocValuesType());
  assertEquals(1, dintFInfo.getPointDimensionCount());
  assertEquals(4, dintFInfo.getPointNumBytes());

  FieldInfo dvFInfo = leafReader.getFieldInfos().fieldInfo("dv");
  assertEquals(DocValuesType.NUMERIC, dvFInfo.getDocValuesType());

  FieldInfo storedFInfo = leafReader.getFieldInfos().fieldInfo("stored");
  assertEquals(DocValuesType.NONE, storedFInfo.getDocValuesType());

  TestUtil.checkReader(ir);
  ir.close();
  dir.close();
}
From source file:org.apache.solr.uninverting.UninvertingReader.java
License:Apache License
/**
 * Create a new UninvertingReader with the specified mapping
 * <p>
 * Expert: This should almost never be used. Use {@link #wrap(DirectoryReader, Map)}
 * instead.
 *
 * @lucene.internal
 */
public UninvertingReader(LeafReader in, Map<String, Type> mapping) {
  super(in);
  this.mapping = mapping;
  ArrayList<FieldInfo> filteredInfos = new ArrayList<>();
  for (FieldInfo fi : in.getFieldInfos()) {
    DocValuesType type = fi.getDocValuesType();
    if (type == DocValuesType.NONE) {
      Type t = mapping.get(fi.name);
      if (t != null) {
        if (t == Type.INTEGER_POINT || t == Type.LONG_POINT || t == Type.FLOAT_POINT || t == Type.DOUBLE_POINT) {
          // type uses points
          if (fi.getPointDimensionCount() == 0) {
            continue;
          }
        } else {
          // type uses inverted index
          if (fi.getIndexOptions() == IndexOptions.NONE) {
            continue;
          }
        }
        switch (t) {
        case INTEGER_POINT:
        case LONG_POINT:
        case FLOAT_POINT:
        case DOUBLE_POINT:
        case LEGACY_INTEGER:
        case LEGACY_LONG:
        case LEGACY_FLOAT:
        case LEGACY_DOUBLE:
          type = DocValuesType.NUMERIC;
          break;
        case BINARY:
          type = DocValuesType.BINARY;
          break;
        case SORTED:
          type = DocValuesType.SORTED;
          break;
        case SORTED_SET_BINARY:
        case SORTED_SET_INTEGER:
        case SORTED_SET_FLOAT:
        case SORTED_SET_LONG:
        case SORTED_SET_DOUBLE:
          type = DocValuesType.SORTED_SET;
          break;
        default:
          throw new AssertionError();
        }
      }
    }
    filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(),
        fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(),
        fi.getPointDimensionCount(), fi.getPointNumBytes()));
  }
  fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()]));
}
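Note the design choice here: the constructor rewrites each FieldInfo so mapped fields advertise a real DocValuesType, which is why FieldInfo-driven consumers (such as the FieldCacheImpl methods above) treat uninverted fields like native doc values. In practice you wrap a whole DirectoryReader, as the javadoc advises; a minimal hedged sketch with a hypothetical "price" field:

// Sketch: "price" is assumed to be indexed as a LegacyLongField.
Map<String, UninvertingReader.Type> mapping = new HashMap<>();
mapping.put("price", UninvertingReader.Type.LEGACY_LONG);
DirectoryReader wrapped = UninvertingReader.wrap(DirectoryReader.open(dir), mapping);
// Each leaf now reports DocValuesType.NUMERIC for "price" via getFieldInfos().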
From source file:org.apache.tika.eval.tools.SlowCompositeReaderWrapper.java
License:Apache License
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
  ensureOpen();
  OrdinalMap map = null;
  synchronized (cachedOrdMaps) {
    map = cachedOrdMaps.get(field);
    if (map == null) {
      // uncached, or not a multi dv
      SortedDocValues dv = MultiDocValues.getSortedValues(in, field);
      if (dv instanceof MultiSortedDocValues) {
        map = ((MultiSortedDocValues) dv).mapping;
        IndexReader.CacheHelper cacheHelper = getReaderCacheHelper();
        if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
          cachedOrdMaps.put(field, map);
        }
      }
      return dv;
    }
  }
  int size = in.leaves().size();
  final SortedDocValues[] values = new SortedDocValues[size];
  final int[] starts = new int[size + 1];
  long totalCost = 0;
  for (int i = 0; i < size; i++) {
    LeafReaderContext context = in.leaves().get(i);
    final LeafReader reader = context.reader();
    final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
    if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED) {
      return null;
    }
    SortedDocValues v = reader.getSortedDocValues(field);
    if (v == null) {
      v = DocValues.emptySorted();
    }
    totalCost += v.cost();
    values[i] = v;
    starts[i] = context.docBase;
  }
  starts[size] = maxDoc();
  return new MultiSortedDocValues(values, starts, map, totalCost);
}
From source file:org.apache.tika.eval.tools.SlowCompositeReaderWrapper.java
License:Apache License
@Override
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
  ensureOpen();
  OrdinalMap map = null;
  synchronized (cachedOrdMaps) {
    map = cachedOrdMaps.get(field);
    if (map == null) {
      // uncached, or not a multi dv
      SortedSetDocValues dv = MultiDocValues.getSortedSetValues(in, field);
      if (dv instanceof MultiDocValues.MultiSortedSetDocValues) {
        map = ((MultiDocValues.MultiSortedSetDocValues) dv).mapping;
        IndexReader.CacheHelper cacheHelper = getReaderCacheHelper();
        if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
          cachedOrdMaps.put(field, map);
        }
      }
      return dv;
    }
  }
  assert map != null;
  int size = in.leaves().size();
  final SortedSetDocValues[] values = new SortedSetDocValues[size];
  final int[] starts = new int[size + 1];
  long cost = 0;
  for (int i = 0; i < size; i++) {
    LeafReaderContext context = in.leaves().get(i);
    final LeafReader reader = context.reader();
    final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
    if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED_SET) {
      return null;
    }
    SortedSetDocValues v = reader.getSortedSetDocValues(field);
    if (v == null) {
      v = DocValues.emptySortedSet();
    }
    values[i] = v;
    starts[i] = context.docBase;
    cost += v.cost();
  }
  starts[size] = maxDoc();
  return new MultiDocValues.MultiSortedSetDocValues(values, starts, map, cost);
}
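Both methods above belong to a copy of Lucene's SlowCompositeReaderWrapper. As a hedged usage sketch (the static wrap method exists in the Solr and tika-eval copies, but verify the exact class on your classpath), merging an index into one LeafReader view looks like this:

// Sketch: expose all segments as a single LeafReader so getFieldInfos(),
// getSortedDocValues(), etc. can be called once for the whole index instead
// of per leaf. Slow by design; avoid on hot paths.
DirectoryReader reader = DirectoryReader.open(dir);
LeafReader merged = SlowCompositeReaderWrapper.wrap(reader);
FieldInfos infos = merged.getFieldInfos(); // union of all per-segment field infos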