List of usage examples for org.apache.lucene.index.LeafReader#terms
public abstract Terms terms(String field) throws IOException;
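LeafReader.terms(field) returns the Terms vocabulary for a field within a single segment, or null if the segment has no postings for that field. Before the per-project examples below, here is a minimal sketch of the typical call pattern: open a reader, obtain each segment's LeafReader, null-check the Terms, then walk its TermsEnum. The index path, field name "body", and class name are placeholder assumptions, not taken from any example on this page.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class LeafReaderTermsSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder index location and field name; adjust for a real index.
        try (FSDirectory dir = FSDirectory.open(Paths.get("/path/to/index"));
                DirectoryReader reader = DirectoryReader.open(dir)) {
            for (LeafReaderContext ctx : reader.leaves()) {
                LeafReader leaf = ctx.reader();
                Terms terms = leaf.terms("body");
                if (terms == null) {
                    continue; // this segment has no terms for the field
                }
                TermsEnum te = terms.iterator();
                for (BytesRef term = te.next(); term != null; term = te.next()) {
                    System.out.println(term.utf8ToString() + " df=" + te.docFreq());
                }
            }
        }
    }
}

Note that every example below guards against a null return value (or assumes the field is known to exist); that null check is the most common source of bugs with this API.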
From source file: org.apache.solr.uninverting.DocTermOrds.java
License: Apache License
/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException(
                "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    final int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
        // No terms
        return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
        // No terms match
        return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (;;) {
        final BytesRef t = te.term();
        if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
        visitTerm(te, termNum);

        if ((termNum & indexIntervalMask) == 0) {
            // Index this term
            sizeOfIndexedStrings += t.length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.add(indexedTerm);
        }

        final int df = te.docFreq();
        if (df <= maxTermDocFreq) {
            postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (;;) {
                int doc = postingsEnum.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }
                //System.out.println("  chunk=" + chunk + " docs");
                actualDF++;
                termInstances++;

                //System.out.println("    docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1) {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = val >>> 8;
                    int ilen = vIntSize(delta);
                    byte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.length) {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
                        byte[] newarr = new byte[newLen];
                        System.arraycopy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = writeInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                } else {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0) {
                        ipos = 0;
                    } else if ((val & 0x0000ff80) == 0) {
                        ipos = 1;
                    } else if ((val & 0x00ff8000) == 0) {
                        ipos = 2;
                    } else if ((val & 0xff800000) == 0) {
                        ipos = 3;
                    } else {
                        ipos = 4;
                    }

                    //System.out.println("      ipos=" + ipos);
                    int endPos = writeInt(delta, tempArr, ipos);
                    //System.out.println("      endpos=" + endPos);
                    if (endPos <= 4) {
                        //System.out.println("      fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++) {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    } else {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++) {
                            tempArr[j] = (byte) val;
                            val >>>= 8;
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new byte[12];
                    }
                }
            }
            setActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.next() == null) {
            break;
        }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
        // we didn't invert anything
        // lower memory consumption.
        tnums = null;
    } else {
        this.index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++) {
            byte[] target = tnums[pass];
            int pos = 0; // end in target;
            if (target != null) {
                pos = target.length;
            } else {
                target = new byte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
                int lim = Math.min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++) {
                    //System.out.println("  pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1) {
                        int len = val >>> 8;
                        //System.out.println("    ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0) {
                            // we only have 24 bits for the array index
                            throw new IllegalStateException(
                                    "Too many values for UnInvertedField faceting on field " + field);
                        }
                        byte[] arr = bytes[doc];
                        /*
                        for(byte b : arr) {
                          //System.out.println("      b=" + Integer.toHexString((int) b));
                        }
                        */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.length <= pos + len) {
                            int newlen = target.length;
                            /*** we don't have to worry about the array getting too large
                             * since the "pos" param will overflow first (only 24 bits available)
                            if ((newlen<<1) <= 0) {
                              // overflow...
                              newlen = Integer.MAX_VALUE;
                              if (newlen <= pos + len) {
                                throw new SolrException(400,"Too many terms to uninvert field!");
                              }
                            } else {
                              while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                            }
                            ****/
                            while (newlen <= pos + len) newlen <<= 1; // doubling strategy
                            byte[] newtarget = new byte[newlen];
                            System.arraycopy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        System.arraycopy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.length) {
                byte[] newtarget = new byte[pos];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            tnums[pass] = target;

            if ((pass << 16) > maxDoc) break;
        }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}
From source file: org.apache.solr.uninverting.FieldCacheImpl.java
License: Apache License
public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
    // not a general purpose filtering mechanism...
    assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;

    SortedSetDocValues dv = reader.getSortedSetDocValues(field);
    if (dv != null) {
        return dv;
    }

    SortedDocValues sdv = reader.getSortedDocValues(field);
    if (sdv != null) {
        return DocValues.singleton(sdv);
    }

    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
        return DocValues.emptySortedSet();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException(
                "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
        return DocValues.emptySortedSet();
    }

    // ok we need to uninvert. check if we can optimize a bit.
    Terms terms = reader.terms(field);
    if (terms == null) {
        return DocValues.emptySortedSet();
    } else {
        // if #postings = #docswithfield we know that the field is "single valued enough".
        // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
        // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
        long numPostings = terms.getSumDocFreq();
        if (numPostings != -1 && numPostings == terms.getDocCount()) {
            return DocValues.singleton(getTermsIndex(reader, field));
        }
    }

    DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
    return dto.iterator(reader);
}
From source file: org.apache.tika.eval.tools.TopCommonTokenCounter.java
License: Apache License
private void execute(Path inputFile, Path commonTokensFile) throws Exception {
    Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
    AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
    try {
        Directory directory = FSDirectory.open(luceneDir);
        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        int maxLen = 1000000;
        int len = 0;
        try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
            List<Document> docs = new ArrayList<>();
            try (BufferedReader reader = getReader(inputFile)) {
                String line = reader.readLine();
                while (line != null) {
                    len += line.length();
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    docs.add(document);
                    if (len > maxLen) {
                        writer.addDocuments(docs);
                        docs.clear();
                        len = 0;
                    }
                    line = reader.readLine();
                }
            }
            if (docs.size() > 0) {
                writer.addDocuments(docs);
            }
            writer.commit();
            writer.flush();
        }
        try (IndexReader reader = DirectoryReader.open(directory)) {
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    bytesRef = termsEnum.next();
                    continue;
                }
                if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                    String t = bytesRef.utf8ToString();
                    if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                        queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                    }
                }
                bytesRef = termsEnum.next();
            }
        }
    } finally {
        FileUtils.deleteDirectory(luceneDir.toFile());
    }
    writeTopN(commonTokensFile, queue);
}
From source file: org.codelibs.elasticsearch.search.slice.TermsSliceQuery.java
License: Apache License
/**
 * Returns a DocIdSet per segment containing the matching docs for the specified slice.
 */
private DocIdSet build(LeafReader reader) throws IOException {
    final DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
    final Terms terms = reader.terms(getField());
    final TermsEnum te = terms.iterator();
    PostingsEnum docsEnum = null;
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        int hashCode = term.hashCode();
        if (contains(hashCode)) {
            docsEnum = te.postings(docsEnum, PostingsEnum.NONE);
            builder.add(docsEnum);
        }
    }
    return builder.build();
}
From source file: org.elasticsearch.common.lucene.uid.PerThreadIDVersionAndSeqNoLookup.java
License: Apache License
/**
 * Initialize lookup for the provided segment
 */
PerThreadIDVersionAndSeqNoLookup(LeafReader reader, String uidField) throws IOException {
    this.uidField = uidField;
    Terms terms = reader.terms(uidField);
    if (terms == null) {
        throw new IllegalArgumentException("reader misses the [" + uidField + "] field");
    }
    termsEnum = terms.iterator();
    if (reader.getNumericDocValues(VersionFieldMapper.NAME) == null) {
        throw new IllegalArgumentException("reader misses the [" + VersionFieldMapper.NAME + "] field");
    }
    Object readerKey = null;
    assert (readerKey = reader.getCoreCacheHelper().getKey()) != null;
    this.readerKey = readerKey;
}
From source file: org.elasticsearch.index.fielddata.plain.GeoPointArrayIndexFieldData.java
License: Apache License
@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicGeoPointFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(
            breakerService.getBreaker(CircuitBreaker.FIELDDATA));
    if (terms == null) {
        data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    return (Version.indexCreated(indexSettings).before(Version.V_2_2_0))
            ? loadLegacyFieldData(reader, estimator, terms, data)
            : loadFieldData22(reader, estimator, terms, data);
}
From source file: org.elasticsearch.index.mapper.BooleanFieldMapperTests.java
License: Apache License
public void testDefaults() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "boolean").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));

    ParsedDocument doc = defaultMapper.parse("test", "type", "1",
            XContentFactory.jsonBuilder().startObject().field("field", true).endObject().bytes());

    try (Directory dir = new RAMDirectory();
            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) {
        w.addDocuments(doc.docs());
        try (DirectoryReader reader = DirectoryReader.open(w)) {
            final LeafReader leaf = reader.leaves().get(0).reader();
            // boolean fields are indexed and have doc values by default
            assertEquals(new BytesRef("T"), leaf.terms("field").iterator().next());
            SortedNumericDocValues values = leaf.getSortedNumericDocValues("field");
            assertNotNull(values);
            values.setDocument(0);
            assertEquals(1, values.count());
            assertEquals(1, values.valueAt(0));
        }
    }
}
From source file: org.elasticsearch.index.mapper.core.BooleanFieldMapperTests.java
License: Apache License
public void testDefaults() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "boolean").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));

    ParsedDocument doc = defaultMapper.parse("test", "type", "1",
            XContentFactory.jsonBuilder().startObject().field("field", true).endObject().bytes());

    try (Directory dir = new RAMDirectory();
            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(getRandom())))) {
        w.addDocuments(doc.docs());
        try (DirectoryReader reader = DirectoryReader.open(w, true)) {
            final LeafReader leaf = reader.leaves().get(0).reader();
            // boolean fields are indexed and have doc values by default
            assertEquals(new BytesRef("T"), leaf.terms("field").iterator().next());
            SortedNumericDocValues values = leaf.getSortedNumericDocValues("field");
            assertNotNull(values);
            values.setDocument(0);
            assertEquals(1, values.count());
            assertEquals(1, values.valueAt(0));
        }
    }
}
From source file: org.elasticsearch.index.mapper.core.TextFieldMapperTests.java
License: Apache License
public void testDefaultPositionIncrementGap() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "text").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping),
            MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder().startObject()
            .field("field", new String[] { "a", "b" }).endObject().bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);
    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", "1"), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(TextFieldMapper.Defaults.POSITION_INCREMENT_GAP + 1, postings.nextPosition());
    }
}
From source file: org.elasticsearch.index.mapper.core.TextFieldMapperTests.java
License: Apache License
public void testPositionIncrementGap() throws IOException {
    final int positionIncrementGap = randomIntBetween(1, 1000);
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "text").field("position_increment_gap", positionIncrementGap)
            .endObject().endObject().endObject().endObject().string();

    DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping),
            MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder().startObject()
            .field("field", new String[] { "a", "b" }).endObject().bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);
    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", "1"), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(positionIncrementGap + 1, postings.nextPosition());
    }
}