Example usage for org.apache.lucene.index LeafReader terms

Introduction

This page collects usage examples for the terms(String field) method of org.apache.lucene.index.LeafReader.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Returns the Terms index for this field, or null if it has none.
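
A minimal sketch of the usual calling pattern, assuming a LeafReader and a field name supplied by the caller: ask the reader for the field's Terms, handle the null case described above, and walk the resulting TermsEnum. The countTerms helper below is hypothetical and shown only for illustration; the full examples from real projects follow in the Usage section.

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Counts the distinct terms indexed for a field; returns 0 when the field has no terms index.
static long countTerms(LeafReader reader, String field) throws IOException {
    final Terms terms = reader.terms(field); // may be null if the field has no terms
    if (terms == null) {
        return 0;
    }
    long count = 0;
    final TermsEnum te = terms.iterator();
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        count++;
    }
    return count;
}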

Usage

From source file: org.apache.solr.uninverting.DocTermOrds.java

License: Apache License

/** Call this only once (if you subclass!) */
protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix) throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException(
                "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    final int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    final int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    final Terms terms = reader.terms(field);
    if (terms == null) {
        // No terms
        return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
        // No terms match
        return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (;;) {
        final BytesRef t = te.term();
        if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

        visitTerm(te, termNum);

        if ((termNum & indexIntervalMask) == 0) {
            // Index this term
            sizeOfIndexedStrings += t.length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.add(indexedTerm);
        }

        final int df = te.docFreq();
        if (df <= maxTermDocFreq) {

            postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (;;) {
                int doc = postingsEnum.nextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS) {
                    break;
                }
                //System.out.println("  chunk=" + chunk + " docs");

                actualDF++;
                termInstances++;

                //System.out.println("    docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1) {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = val >>> 8;
                    int ilen = vIntSize(delta);
                    byte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.length) {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
                        byte[] newarr = new byte[newLen];
                        System.arraycopy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = writeInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                } else {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0) {
                        ipos = 0;
                    } else if ((val & 0x0000ff80) == 0) {
                        ipos = 1;
                    } else if ((val & 0x00ff8000) == 0) {
                        ipos = 2;
                    } else if ((val & 0xff800000) == 0) {
                        ipos = 3;
                    } else {
                        ipos = 4;
                    }

                    //System.out.println("      ipos=" + ipos);

                    int endPos = writeInt(delta, tempArr, ipos);
                    //System.out.println("      endpos=" + endPos);
                    if (endPos <= 4) {
                        //System.out.println("      fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++) {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    } else {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++) {
                            tempArr[j] = (byte) val;
                            val >>>= 8;
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new byte[12];
                    }
                }
            }
            setActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.next() == null) {
            break;
        }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
        // we didn't invert anything
        // lower memory consumption.
        tnums = null;
    } else {

        this.index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //

        for (int pass = 0; pass < 256; pass++) {
            byte[] target = tnums[pass];
            int pos = 0; // end in target;
            if (target != null) {
                pos = target.length;
            } else {
                target = new byte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
                int lim = Math.min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++) {
                    //System.out.println("  pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1) {
                        int len = val >>> 8;
                        //System.out.println("    ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0) {
                            // we only have 24 bits for the array index
                            throw new IllegalStateException(
                                    "Too many values for UnInvertedField faceting on field " + field);
                        }
                        byte[] arr = bytes[doc];
                        /*
                        for(byte b : arr) {
                          //System.out.println("      b=" + Integer.toHexString((int) b));
                        }
                        */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.length <= pos + len) {
                            int newlen = target.length;
                            /*** we don't have to worry about the array getting too large
                             * since the "pos" param will overflow first (only 24 bits available)
                            if ((newlen<<1) <= 0) {
                              // overflow...
                              newlen = Integer.MAX_VALUE;
                              if (newlen <= pos + len) {
                                throw new SolrException(400,"Too many terms to uninvert field!");
                              }
                            } else {
                              while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                            }
                            ****/
                            while (newlen <= pos + len)
                                newlen <<= 1; // doubling strategy                 
                            byte[] newtarget = new byte[newlen];
                            System.arraycopy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        System.arraycopy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.length) {
                byte[] newtarget = new byte[pos];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            tnums[pass] = target;

            if ((pass << 16) > maxDoc)
                break;
        }

    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
}

From source file: org.apache.solr.uninverting.FieldCacheImpl.java

License: Apache License

public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
    // not a general purpose filtering mechanism...
    assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;

    SortedSetDocValues dv = reader.getSortedSetDocValues(field);
    if (dv != null) {
        return dv;
    }

    SortedDocValues sdv = reader.getSortedDocValues(field);
    if (sdv != null) {
        return DocValues.singleton(sdv);
    }

    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
        return DocValues.emptySortedSet();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException(
                "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
        return DocValues.emptySortedSet();
    }

    // ok we need to uninvert. check if we can optimize a bit.

    Terms terms = reader.terms(field);
    if (terms == null) {
        return DocValues.emptySortedSet();
    } else {
        // if #postings = #docswithfield we know that the field is "single valued enough".
        // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
        // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
        long numPostings = terms.getSumDocFreq();
        if (numPostings != -1 && numPostings == terms.getDocCount()) {
            return DocValues.singleton(getTermsIndex(reader, field));
        }
    }

    DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
    return dto.iterator(reader);
}

From source file: org.apache.tika.eval.tools.TopCommonTokenCounter.java

License: Apache License

private void execute(Path inputFile, Path commonTokensFile) throws Exception {
    Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
    AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
    try {
        Directory directory = FSDirectory.open(luceneDir);
        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);

        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        int maxLen = 1000000;
        int len = 0;
        try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
            List<Document> docs = new ArrayList<>();
            try (BufferedReader reader = getReader(inputFile)) {
                String line = reader.readLine();
                while (line != null) {
                    len += line.length();
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    docs.add(document);
                    if (len > maxLen) {
                        writer.addDocuments(docs);
                        docs.clear();
                        len = 0;
                    }
                    line = reader.readLine();
                }
            }
            if (docs.size() > 0) {
                writer.addDocuments(docs);
            }
            writer.commit();
            writer.flush();
        }
        try (IndexReader reader = DirectoryReader.open(directory)) {
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    bytesRef = termsEnum.next();
                    continue;
                }

                if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                    String t = bytesRef.utf8ToString();
                    if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                        queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                    }

                }
                bytesRef = termsEnum.next();
            }
        }
    } finally {
        FileUtils.deleteDirectory(luceneDir.toFile());
    }

    writeTopN(commonTokensFile, queue);

}

From source file: org.codelibs.elasticsearch.search.slice.TermsSliceQuery.java

License: Apache License

/**
 * Returns a DocIdSet per segments containing the matching docs for the specified slice.
 */
private DocIdSet build(LeafReader reader) throws IOException {
    final DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
    final Terms terms = reader.terms(getField());
    final TermsEnum te = terms.iterator();
    PostingsEnum docsEnum = null;
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        int hashCode = term.hashCode();
        if (contains(hashCode)) {
            docsEnum = te.postings(docsEnum, PostingsEnum.NONE);
            builder.add(docsEnum);
        }
    }
    return builder.build();
}

From source file: org.elasticsearch.common.lucene.uid.PerThreadIDVersionAndSeqNoLookup.java

License: Apache License

/**
 * Initialize lookup for the provided segment
 */
PerThreadIDVersionAndSeqNoLookup(LeafReader reader, String uidField) throws IOException {
    this.uidField = uidField;
    Terms terms = reader.terms(uidField);
    if (terms == null) {
        throw new IllegalArgumentException("reader misses the [" + uidField + "] field");
    }
    termsEnum = terms.iterator();
    if (reader.getNumericDocValues(VersionFieldMapper.NAME) == null) {
        throw new IllegalArgumentException("reader misses the [" + VersionFieldMapper.NAME + "] field");
    }

    Object readerKey = null;
    assert (readerKey = reader.getCoreCacheHelper().getKey()) != null;
    this.readerKey = readerKey;
}

From source file: org.elasticsearch.index.fielddata.plain.GeoPointArrayIndexFieldData.java

License: Apache License

@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicGeoPointFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(
            breakerService.getBreaker(CircuitBreaker.FIELDDATA));
    if (terms == null) {
        data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    return (Version.indexCreated(indexSettings).before(Version.V_2_2_0))
            ? loadLegacyFieldData(reader, estimator, terms, data)
            : loadFieldData22(reader, estimator, terms, data);
}

From source file: org.elasticsearch.index.mapper.BooleanFieldMapperTests.java

License: Apache License

public void testDefaults() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "boolean").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));

    ParsedDocument doc = defaultMapper.parse("test", "type", "1",
            XContentFactory.jsonBuilder().startObject().field("field", true).endObject().bytes());

    try (Directory dir = new RAMDirectory();
            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) {
        w.addDocuments(doc.docs());
        try (DirectoryReader reader = DirectoryReader.open(w)) {
            final LeafReader leaf = reader.leaves().get(0).reader();
            // boolean fields are indexed and have doc values by default
            assertEquals(new BytesRef("T"), leaf.terms("field").iterator().next());
            SortedNumericDocValues values = leaf.getSortedNumericDocValues("field");
            assertNotNull(values);
            values.setDocument(0);
            assertEquals(1, values.count());
            assertEquals(1, values.valueAt(0));
        }
    }
}

From source file: org.elasticsearch.index.mapper.core.BooleanFieldMapperTests.java

License: Apache License

public void testDefaults() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "boolean").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper defaultMapper = parser.parse("type", new CompressedXContent(mapping));

    ParsedDocument doc = defaultMapper.parse("test", "type", "1",
            XContentFactory.jsonBuilder().startObject().field("field", true).endObject().bytes());

    try (Directory dir = new RAMDirectory();
            IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(getRandom())))) {
        w.addDocuments(doc.docs());
        try (DirectoryReader reader = DirectoryReader.open(w, true)) {
            final LeafReader leaf = reader.leaves().get(0).reader();
            // boolean fields are indexed and have doc values by default
            assertEquals(new BytesRef("T"), leaf.terms("field").iterator().next());
            SortedNumericDocValues values = leaf.getSortedNumericDocValues("field");
            assertNotNull(values);
            values.setDocument(0);
            assertEquals(1, values.count());
            assertEquals(1, values.valueAt(0));
        }
    }
}

From source file: org.elasticsearch.index.mapper.core.TextFieldMapperTests.java

License: Apache License

public void testDefaultPositionIncrementGap() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "text").endObject().endObject().endObject().endObject()
            .string();

    DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping),
            MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder().startObject()
            .field("field", new String[] { "a", "b" }).endObject().bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);

    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", "1"), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(TextFieldMapper.Defaults.POSITION_INCREMENT_GAP + 1, postings.nextPosition());
    }
}

From source file: org.elasticsearch.index.mapper.core.TextFieldMapperTests.java

License: Apache License

public void testPositionIncrementGap() throws IOException {
    final int positionIncrementGap = randomIntBetween(1, 1000);
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type").startObject("properties")
            .startObject("field").field("type", "text").field("position_increment_gap", positionIncrementGap)
            .endObject().endObject().endObject().endObject().string();

    DocumentMapper mapper = indexService.mapperService().merge("type", new CompressedXContent(mapping),
            MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder().startObject()
            .field("field", new String[] { "a", "b" }).endObject().bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);

    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", "1"), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(positionIncrementGap + 1, postings.nextPosition());
    }
}