List of usage examples for org.apache.lucene.index.LeafReader#terms
public abstract Terms terms(String field) throws IOException;
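LeafReader.terms(field) returns the Terms vocabulary for a field within a single segment, or null if the segment has no postings for that field. Before the per-project examples below, here is a minimal sketch of the typical call pattern: open a reader, obtain each segment's LeafReader, null-check the Terms, then walk its TermsEnum. The index path, field name "body", and class name are placeholder assumptions, not taken from any example on this page.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class LeafReaderTermsSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder index location and field name; adjust for a real index.
        try (FSDirectory dir = FSDirectory.open(Paths.get("/path/to/index"));
                DirectoryReader reader = DirectoryReader.open(dir)) {
            for (LeafReaderContext ctx : reader.leaves()) {
                LeafReader leaf = ctx.reader();
                Terms terms = leaf.terms("body");
                if (terms == null) {
                    continue; // this segment has no terms for the field
                }
                TermsEnum te = terms.iterator();
                for (BytesRef term = te.next(); term != null; term = te.next()) {
                    System.out.println(term.utf8ToString() + " df=" + te.docFreq());
                }
            }
        }
    }
}

Note that every example below guards against a null return value (or assumes the field is known to exist); that null check is the most common source of bugs with this API.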
From source file: org.apache.solr.uninverting.DocTermOrds.java
License: Apache License
/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException(
                "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    final int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
        // No terms
        return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
        // No terms match
        return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (;;) {
        final BytesRef t = te.term();
        if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
        visitTerm(te, termNum);

        if ((termNum & indexIntervalMask) == 0) {
            // Index this term
            sizeOfIndexedStrings += t.length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.add(indexedTerm);
        }

        final int df = te.docFreq();
        if (df <= maxTermDocFreq) {
            postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (;;) {
                int doc = postingsEnum.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }
                //System.out.println("  chunk=" + chunk + " docs");
                actualDF++;
                termInstances++;

                //System.out.println("    docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1) {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = val >>> 8;
                    int ilen = vIntSize(delta);
                    byte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.length) {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
                        byte[] newarr = new byte[newLen];
                        System.arraycopy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = writeInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                } else {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0) {
                        ipos = 0;
                    } else if ((val & 0x0000ff80) == 0) {
                        ipos = 1;
                    } else if ((val & 0x00ff8000) == 0) {
                        ipos = 2;
                    } else if ((val & 0xff800000) == 0) {
                        ipos = 3;
                    } else {
                        ipos = 4;
                    }

                    //System.out.println("      ipos=" + ipos);
                    int endPos = writeInt(delta, tempArr, ipos);
                    //System.out.println("      endpos=" + endPos);
                    if (endPos <= 4) {
                        //System.out.println("      fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++) {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    } else {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++) {
                            tempArr[j] = (byte) val;
                            val >>>= 8;
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new byte[12];
                    }
                }
            }
            setActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.next() == null) {
            break;
        }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
        // we didn't invert anything
        // lower memory consumption.
        tnums = null;
    } else {
        this.index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++) {
            byte[] target = tnums[pass];
            int pos = 0; // end in target;
            if (target != null) {
                pos = target.length;
            } else {
                target = new byte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
                int lim = Math.min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++) {
                    //System.out.println("  pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1) {
                        int len = val >>> 8;
                        //System.out.println("    ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0) {
                            // we only have 24 bits for the array index
                            throw new IllegalStateException(
                                    "Too many values for UnInvertedField faceting on field " + field);
                        }
                        byte[] arr = bytes[doc];
                        /*
                        for(byte b : arr) {
                          //System.out.println("      b=" + Integer.toHexString((int) b));
                        }
                        */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.length <= pos + len) {
                            int newlen = target.length;
                            /*** we don't have to worry about the array getting too large
                             * since the "pos" param will overflow first (only 24 bits available)
                            if ((newlen<<1) <= 0) {
                              // overflow...
                              newlen = Integer.MAX_VALUE;
                              if (newlen <= pos + len) {
                                throw new SolrException(400,"Too many terms to uninvert field!");
                              }
                            } else {
                              while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                            }
                            ****/
                            while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                            byte[] newtarget = new byte[newlen];
                            System.arraycopy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        System.arraycopy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.length) {
                byte[] newtarget = new byte[pos];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            tnums[pass] = target;

            if ((pass << 16) > maxDoc) break;
        }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}
From source file: org.apache.solr.uninverting.FieldCacheImpl.java
License: Apache License
public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
    // not a general purpose filtering mechanism...
    assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;

    SortedSetDocValues dv = reader.getSortedSetDocValues(field);
    if (dv != null) {
        return dv;
    }

    SortedDocValues sdv = reader.getSortedDocValues(field);
    if (sdv != null) {
        return DocValues.singleton(sdv);
    }

    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
        return DocValues.emptySortedSet();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException(
                "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
        return DocValues.emptySortedSet();
    }

    // ok we need to uninvert. check if we can optimize a bit.
    Terms terms = reader.terms(field);
    if (terms == null) {
        return DocValues.emptySortedSet();
    } else {
        // if #postings = #docswithfield we know that the field is "single valued enough".
        // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
        // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
        long numPostings = terms.getSumDocFreq();
        if (numPostings != -1 && numPostings == terms.getDocCount()) {
            return DocValues.singleton(getTermsIndex(reader, field));
        }
    }

    DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
    return dto.iterator(reader);
}
From source file: org.apache.tika.eval.tools.TopCommonTokenCounter.java
License: Apache License
private void execute(Path inputFile, Path commonTokensFile) throws Exception {
    Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
    AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
    try {
        Directory directory = FSDirectory.open(luceneDir);
        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        int maxLen = 1000000;
        int len = 0;
        try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
            List<Document> docs = new ArrayList<>();
            try (BufferedReader reader = getReader(inputFile)) {
                String line = reader.readLine();
                while (line != null) {
                    len += line.length();
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    docs.add(document);
                    if (len > maxLen) {
                        writer.addDocuments(docs);
                        docs.clear();
                        len = 0;
                    }
                    line = reader.readLine();
                }
            }
            if (docs.size() > 0) {
                writer.addDocuments(docs);
            }
            writer.commit();
            writer.flush();
        }
        try (IndexReader reader = DirectoryReader.open(directory)) {
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    bytesRef = termsEnum.next();
                    continue;
                }
                if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                    String t = bytesRef.utf8ToString();
                    if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                        queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                    }
                }
                bytesRef = termsEnum.next();
            }
        }
    } finally {
        FileUtils.deleteDirectory(luceneDir.toFile());
    }
    writeTopN(commonTokensFile, queue);
}
From source file: org.codelibs.elasticsearch.search.slice.TermsSliceQuery.java
License: Apache License
/**
 * Returns a DocIdSet per segment containing the matching docs for the specified slice.
 */
private DocIdSet build(LeafReader reader) throws IOException {
    final DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
    final Terms terms = reader.terms(getField());
    final TermsEnum te = terms.iterator();
    PostingsEnum docsEnum = null;
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        int hashCode = term.hashCode();
        if (contains(hashCode)) {
            docsEnum = te.postings(docsEnum, PostingsEnum.NONE);
            builder.add(docsEnum);
        }
    }
    return builder.build();
}
From source file: org.elasticsearch.common.lucene.uid.PerThreadIDVersionAndSeqNoLookup.java
License: Apache License
/**
 * Initialize lookup for the provided segment
 */
PerThreadIDVersionAndSeqNoLookup(LeafReader reader, String uidField) throws IOException {
    this.uidField = uidField;
    Terms terms = reader.terms(uidField);
    if (terms == null) {
        throw new IllegalArgumentException("reader misses the [" + uidField + "] field");
    }
    termsEnum = terms.iterator();
    if (reader.getNumericDocValues(VersionFieldMapper.NAME) == null) {
        throw new IllegalArgumentException("reader misses the [" + VersionFieldMapper.NAME + "] field");
    }
    Object readerKey = null;
    assert (readerKey = reader.getCoreCacheHelper().getKey()) != null;
    this.readerKey = readerKey;
}
From source file: org.elasticsearch.index.fielddata.plain.GeoPointArrayIndexFieldData.java
License: Apache License
@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicGeoPointFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(
            breakerService.getBreaker(CircuitBreaker.FIELDDATA));
    if (terms == null) {
        data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    return (Version.indexCreated(indexSettings).before(Version.V_2_2_0))
            ? loadLegacyFieldData(reader, estimator, terms, data)
            : loadFieldData22(reader, estimator, terms, data);
}
From source file: org.elasticsearch.index.mapper.BooleanFieldMapperTests.java
License: Apache License
public void testDefaults() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "boolean").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));

    ParsedDocument doc = defaultMapper.parse("test", "type", "1",
            XContentFactory.jsonBuilder().startObject().field("field", true).endObject().bytes());

    try (Directory dir = new RAMDirectory();
            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) {
        w.addDocuments(doc.docs());
        try (DirectoryReader reader = DirectoryReader.open(w)) {
            final LeafReader leaf = reader.leaves().get(0).reader();
            // boolean fields are indexed and have doc values by default
            assertEquals(new BytesRef("T"), leaf.terms("field").iterator().next());
            SortedNumericDocValues values = leaf.getSortedNumericDocValues("field");
            assertNotNull(values);
            values.setDocument(0);
            assertEquals(1, values.count());
            assertEquals(1, values.valueAt(0));
        }
    }
}
From source file: org.elasticsearch.index.mapper.core.BooleanFieldMapperTests.java
License: Apache License
public void testDefaults() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "boolean").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));

    ParsedDocument doc = defaultMapper.parse("test", "type", "1",
            XContentFactory.jsonBuilder().startObject().field("field", true).endObject().bytes());

    try (Directory dir = new RAMDirectory();
            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(getRandom())))) {
        w.addDocuments(doc.docs());
        try (DirectoryReader reader = DirectoryReader.open(w, true)) {
            final LeafReader leaf = reader.leaves().get(0).reader();
            // boolean fields are indexed and have doc values by default
            assertEquals(new BytesRef("T"), leaf.terms("field").iterator().next());
            SortedNumericDocValues values = leaf.getSortedNumericDocValues("field");
            assertNotNull(values);
            values.setDocument(0);
            assertEquals(1, values.count());
            assertEquals(1, values.valueAt(0));
        }
    }
}
From source file: org.elasticsearch.index.mapper.core.TextFieldMapperTests.java
License: Apache License
public void testDefaultPositionIncrementGap() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "text").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping),
            MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder().startObject()
            .field("field", new String[] { "a", "b" }).endObject().bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);
    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", "1"), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(TextFieldMapper.Defaults.POSITION_INCREMENT_GAP + 1, postings.nextPosition());
    }
}
From source file: org.elasticsearch.index.mapper.core.TextFieldMapperTests.java
License: Apache License
public void testPositionIncrementGap() throws IOException {
    final int positionIncrementGap = randomIntBetween(1, 1000);
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "text").field("position_increment_gap", positionIncrementGap)
            .endObject().endObject().endObject().endObject().string();

    DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping),
            MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder().startObject()
            .field("field", new String[] { "a", "b" }).endObject().bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);
    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", "1"), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(positionIncrementGap + 1, postings.nextPosition());
    }
}