org.apache.solr.uninverting.TestFieldCacheVsDocValues.java Source code

Introduction

Here is the source code for org.apache.solr.uninverting.TestFieldCacheVsDocValues.java. The test cross-checks Solr's FieldCache uninverting code against Lucene's native doc values: numeric, sorted, sorted-set, and binary fields are filled with the same random data in both representations, which must then agree document by document, ordinal by ordinal, and term by term.
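
Every variant of the test follows the same pattern: index identical random values once as an inverted field and once as a doc-values field, then walk the FieldCache's uninverted view and the native doc-values iterator in lockstep and assert they agree. The sketch below is a minimal illustration of that comparison, not part of the original file; it assumes the same imports as the source that follows, a LeafReader in which every document carries both an "indexed" StringField and a "dv" SortedDocValuesField holding the same value, and an illustrative method name:

    static void assertSameSortedValues(LeafReader reader) throws Exception {
        // Uninverted view rebuilt at read time from the postings of "indexed":
        SortedDocValues uninverted = FieldCache.DEFAULT.getTermsIndex(reader, "indexed");
        // Native doc values written at index time for "dv":
        SortedDocValues dv = reader.getSortedDocValues("dv");
        int doc;
        while ((doc = uninverted.nextDoc()) != NO_MORE_DOCS) {
            assertEquals(doc, dv.nextDoc());                          // same docs have values
            assertEquals(uninverted.binaryValue(), dv.binaryValue()); // same value per doc
        }
        assertEquals(NO_MORE_DOCS, dv.nextDoc());
    }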

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.uninverting;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.index.SlowCompositeReaderWrapper;

import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

public class TestFieldCacheVsDocValues extends LuceneTestCase {

    @Override
    public void setUp() throws Exception {
        super.setUp();
        assumeFalse("test unsupported on J9 temporarily, see https://issues.apache.org/jira/browse/LUCENE-6522",
                Constants.JAVA_VENDOR.startsWith("IBM"));
    }

    public void testByteMissingVsFieldCache() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            doTestMissingVsFieldCache(Byte.MIN_VALUE, Byte.MAX_VALUE);
        }
    }

    public void testShortMissingVsFieldCache() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            doTestMissingVsFieldCache(Short.MIN_VALUE, Short.MAX_VALUE);
        }
    }

    public void testIntMissingVsFieldCache() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            doTestMissingVsFieldCache(Integer.MIN_VALUE, Integer.MAX_VALUE);
        }
    }

    public void testLongMissingVsFieldCache() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            doTestMissingVsFieldCache(Long.MIN_VALUE, Long.MAX_VALUE);
        }
    }

    public void testSortedFixedLengthVsFieldCache() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            int fixedLength = TestUtil.nextInt(random(), 1, 10);
            doTestSortedVsFieldCache(fixedLength, fixedLength);
        }
    }

    public void testSortedVariableLengthVsFieldCache() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            doTestSortedVsFieldCache(1, 10);
        }
    }

    public void testSortedSetFixedLengthVsUninvertedField() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            int fixedLength = TestUtil.nextInt(random(), 1, 10);
            doTestSortedSetVsUninvertedField(fixedLength, fixedLength);
        }
    }

    public void testSortedSetVariableLengthVsUninvertedField() throws Exception {
        int numIterations = atLeast(1);
        for (int i = 0; i < numIterations; i++) {
            doTestSortedSetVsUninvertedField(1, 10);
        }
    }

    // LUCENE-4853: huge binary doc values must either round-trip intact or be
    // rejected up front by codecs that cannot store them
    public void testHugeBinaryValues() throws Exception {
        Analyzer analyzer = new MockAnalyzer(random());
        // FSDirectory because SimpleText will consume gobs of
        // space when storing big binary values:
        Directory d = newFSDirectory(createTempDir("hugeBinaryValues"));
        boolean doFixed = random().nextBoolean();
        int numDocs;
        int fixedLength = 0;
        if (doFixed) {
            // Sometimes make all values fixed length since some
            // codecs have different code paths for this:
            numDocs = TestUtil.nextInt(random(), 10, 20);
            fixedLength = TestUtil.nextInt(random(), 65537, 256 * 1024);
        } else {
            numDocs = TestUtil.nextInt(random(), 100, 200);
        }
        IndexWriter w = new IndexWriter(d, newIndexWriterConfig(analyzer));
        List<byte[]> docBytes = new ArrayList<>();
        long totalBytes = 0;
        for (int docID = 0; docID < numDocs; docID++) {
            // we don't use RandomIndexWriter because it might add
            // more docvalues than we expect !!!!

            // Must be > 64KB in size to ensure more than 2 pages in
            // PagedBytes would be needed:
            int numBytes;
            if (doFixed) {
                numBytes = fixedLength;
            } else if (docID == 0 || random().nextInt(5) == 3) {
                numBytes = TestUtil.nextInt(random(), 65537, 3 * 1024 * 1024);
            } else {
                numBytes = TestUtil.nextInt(random(), 1, 1024 * 1024);
            }
            totalBytes += numBytes;
            if (totalBytes > 5 * 1024 * 1024) {
                break;
            }
            byte[] bytes = new byte[numBytes];
            random().nextBytes(bytes);
            docBytes.add(bytes);
            Document doc = new Document();
            // new BytesRef(bytes) already spans the whole array:
            BytesRef b = new BytesRef(bytes);
            doc.add(new BinaryDocValuesField("field", b));
            doc.add(new StringField("id", "" + docID, Field.Store.YES));
            try {
                w.addDocument(doc);
            } catch (IllegalArgumentException iae) {
                if (iae.getMessage().indexOf("is too large") == -1) {
                    throw iae;
                } else {
                    // OK: some codecs can't handle binary DV > 32K
                    assertFalse(codecAcceptsHugeBinaryValues("field"));
                    w.rollback();
                    d.close();
                    return;
                }
            }
        }

        DirectoryReader r;
        try {
            r = DirectoryReader.open(w);
        } catch (IllegalArgumentException iae) {
            if (iae.getMessage().indexOf("is too large") == -1) {
                throw iae;
            } else {
                assertFalse(codecAcceptsHugeBinaryValues("field"));

                // OK: some codecs can't handle binary DV > 32K
                w.rollback();
                d.close();
                return;
            }
        }
        w.close();

        LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
        TestUtil.checkReader(ar);

        BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
        for (int docID = 0; docID < docBytes.size(); docID++) {
            Document doc = ar.document(docID);
            assertEquals(docID, s.nextDoc());
            BytesRef bytes = s.binaryValue();
            byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
            assertEquals(expected.length, bytes.length);
            assertEquals(new BytesRef(expected), bytes);
        }

        assertTrue(codecAcceptsHugeBinaryValues("field"));

        ar.close();
        d.close();
    }

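    // (1 << 15) - 2 = 32766 bytes: just under the cap that the size-limited
    // doc-values formats place on a single binary value.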
    private static final int LARGE_BINARY_FIELD_LENGTH = (1 << 15) - 2;

    // TODO: get this out of here and into the deprecated codecs (4.0, 4.2)
    public void testHugeBinaryValueLimit() throws Exception {
        // We only test DVFormats that have a limit
        assumeFalse("test requires codec with limits on max binary field length",
                codecAcceptsHugeBinaryValues("field"));
        Analyzer analyzer = new MockAnalyzer(random());
        // FSDirectory because SimpleText will consume gobs of
        // space when storing big binary values:
        Directory d = newFSDirectory(createTempDir("hugeBinaryValues"));
        boolean doFixed = random().nextBoolean();
        int numDocs;
        int fixedLength = 0;
        if (doFixed) {
            // Sometimes make all values fixed length since some
            // codecs have different code paths for this:
            numDocs = TestUtil.nextInt(random(), 10, 20);
            fixedLength = LARGE_BINARY_FIELD_LENGTH;
        } else {
            numDocs = TestUtil.nextInt(random(), 100, 200);
        }
        IndexWriter w = new IndexWriter(d, newIndexWriterConfig(analyzer));
        List<byte[]> docBytes = new ArrayList<>();
        long totalBytes = 0;
        for (int docID = 0; docID < numDocs; docID++) {
            // we don't use RandomIndexWriter because it might add
            // more docvalues than we expect !!!!

            // Here values are capped at LARGE_BINARY_FIELD_LENGTH, just under
            // the per-value limit of the size-limited formats:
            int numBytes;
            if (doFixed) {
                numBytes = fixedLength;
            } else if (docID == 0 || random().nextInt(5) == 3) {
                numBytes = LARGE_BINARY_FIELD_LENGTH;
            } else {
                numBytes = TestUtil.nextInt(random(), 1, LARGE_BINARY_FIELD_LENGTH);
            }
            totalBytes += numBytes;
            if (totalBytes > 5 * 1024 * 1024) {
                break;
            }
            byte[] bytes = new byte[numBytes];
            random().nextBytes(bytes);
            docBytes.add(bytes);
            Document doc = new Document();
            // new BytesRef(bytes) already spans the whole array:
            BytesRef b = new BytesRef(bytes);
            doc.add(new BinaryDocValuesField("field", b));
            doc.add(new StringField("id", "" + docID, Field.Store.YES));
            w.addDocument(doc);
        }

        DirectoryReader r = DirectoryReader.open(w);
        w.close();

        LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
        TestUtil.checkReader(ar);

        BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
        for (int docID = 0; docID < docBytes.size(); docID++) {
            assertEquals(docID, s.nextDoc());
            Document doc = ar.document(docID);
            BytesRef bytes = s.binaryValue();
            byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
            assertEquals(expected.length, bytes.length);
            assertEquals(new BytesRef(expected), bytes);
        }

        ar.close();
        d.close();
    }

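    // Indexes random single-valued strings both as a StringField and as a
    // SortedDocValuesField, with random commits and deletes mixed in, then
    // checks per segment that the uninverted terms index matches the native
    // sorted doc values.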
    private void doTestSortedVsFieldCache(int minLength, int maxLength) throws Exception {
        Directory dir = newDirectory();
        IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
        RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
        Document doc = new Document();
        Field idField = new StringField("id", "", Field.Store.NO);
        Field indexedField = new StringField("indexed", "", Field.Store.NO);
        Field dvField = new SortedDocValuesField("dv", new BytesRef());
        doc.add(idField);
        doc.add(indexedField);
        doc.add(dvField);

        // index some docs
        int numDocs = atLeast(300);
        for (int i = 0; i < numDocs; i++) {
            idField.setStringValue(Integer.toString(i));
            final int length;
            if (minLength == maxLength) {
                length = minLength; // fixed length
            } else {
                length = TestUtil.nextInt(random(), minLength, maxLength);
            }
            String value = TestUtil.randomSimpleString(random(), length);
            indexedField.setStringValue(value);
            dvField.setBytesValue(new BytesRef(value));
            writer.addDocument(doc);
            if (random().nextInt(31) == 0) {
                writer.commit();
            }
        }

        // delete some docs
        int numDeletions = random().nextInt(numDocs / 10);
        for (int i = 0; i < numDeletions; i++) {
            int id = random().nextInt(numDocs);
            writer.deleteDocuments(new Term("id", Integer.toString(id)));
        }
        writer.close();

        // compare
        DirectoryReader ir = DirectoryReader.open(dir);
        for (LeafReaderContext context : ir.leaves()) {
            LeafReader r = context.reader();
            SortedDocValues expected = FieldCache.DEFAULT.getTermsIndex(r, "indexed");
            SortedDocValues actual = r.getSortedDocValues("dv");
            assertEquals(r.maxDoc(), expected, actual);
        }
        ir.close();
        dir.close();
    }

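    // The multi-valued counterpart: each doc gets a random set of strings,
    // added in shuffled order as indexed terms and as SortedSetDocValuesField
    // values; the uninverted term ords are compared per segment and once more
    // after forceMerge(1).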
    private void doTestSortedSetVsUninvertedField(int minLength, int maxLength) throws Exception {
        Directory dir = newDirectory();
        IndexWriterConfig conf = new IndexWriterConfig(new MockAnalyzer(random()));
        RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);

        // index some docs
        int numDocs = atLeast(300);
        for (int i = 0; i < numDocs; i++) {
            Document doc = new Document();
            Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
            doc.add(idField);
            final int length = TestUtil.nextInt(random(), minLength, maxLength);
            int numValues = random().nextInt(17);
            // create a random list of strings
            List<String> values = new ArrayList<>();
            for (int v = 0; v < numValues; v++) {
                values.add(TestUtil.randomSimpleString(random(), minLength, length));
            }

            // add in any order to the indexed field
            ArrayList<String> unordered = new ArrayList<>(values);
            Collections.shuffle(unordered, random());
            for (String v : unordered) {
                doc.add(newStringField("indexed", v, Field.Store.NO));
            }

            // add in any order to the dv field
            ArrayList<String> unordered2 = new ArrayList<>(values);
            Collections.shuffle(unordered2, random());
            for (String v : unordered2) {
                doc.add(new SortedSetDocValuesField("dv", new BytesRef(v)));
            }

            writer.addDocument(doc);
            if (random().nextInt(31) == 0) {
                writer.commit();
            }
        }

        // delete some docs
        int numDeletions = random().nextInt(numDocs / 10);
        for (int i = 0; i < numDeletions; i++) {
            int id = random().nextInt(numDocs);
            writer.deleteDocuments(new Term("id", Integer.toString(id)));
        }

        // compare per-segment
        DirectoryReader ir = writer.getReader();
        for (LeafReaderContext context : ir.leaves()) {
            LeafReader r = context.reader();
            SortedSetDocValues expected = FieldCache.DEFAULT.getDocTermOrds(r, "indexed", null);
            SortedSetDocValues actual = r.getSortedSetDocValues("dv");
            assertEquals(r.maxDoc(), expected, actual);
        }
        ir.close();

        writer.forceMerge(1);

        // now compare again after the merge
        ir = writer.getReader();
        LeafReader ar = getOnlyLeafReader(ir);
        SortedSetDocValues expected = FieldCache.DEFAULT.getDocTermOrds(ar, "indexed", null);
        SortedSetDocValues actual = ar.getSortedSetDocValues("dv");
        assertEquals(ir.maxDoc(), expected, actual);
        ir.close();

        writer.close();
        dir.close();
    }

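    // Indexes random longs in both indexed and doc-values form, omitting both
    // fields from roughly a quarter of the docs, and verifies that FieldCache
    // reports identical docs-with-field bits for the two representations.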
    private void doTestMissingVsFieldCache(LongProducer longs) throws Exception {
        Directory dir = newDirectory();
        IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
        RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
        Field idField = new StringField("id", "", Field.Store.NO);
        Field indexedField = newStringField("indexed", "", Field.Store.NO);
        Field dvField = new NumericDocValuesField("dv", 0);

        // index some docs
        int numDocs = atLeast(300);
        // numDocs should be always > 256 so that in case of a codec that optimizes
        // for numbers of values <= 256, all storage layouts are tested
        assert numDocs > 256;
        for (int i = 0; i < numDocs; i++) {
            idField.setStringValue(Integer.toString(i));
            long value = longs.next();
            indexedField.setStringValue(Long.toString(value));
            dvField.setLongValue(value);
            Document doc = new Document();
            doc.add(idField);
            // 1/4 of the time we neglect to add the fields
            if (random().nextInt(4) > 0) {
                doc.add(indexedField);
                doc.add(dvField);
            }
            writer.addDocument(doc);
            if (random().nextInt(31) == 0) {
                writer.commit();
            }
        }

        // delete some docs
        int numDeletions = random().nextInt(numDocs / 10);
        for (int i = 0; i < numDeletions; i++) {
            int id = random().nextInt(numDocs);
            writer.deleteDocuments(new Term("id", Integer.toString(id)));
        }

        // merge some segments and ensure that at least one of them has more than
        // 256 values
        writer.forceMerge(numDocs / 256);

        writer.close();

        // compare
        DirectoryReader ir = DirectoryReader.open(dir);
        for (LeafReaderContext context : ir.leaves()) {
            LeafReader r = context.reader();
            Bits expected = FieldCache.DEFAULT.getDocsWithField(r, "indexed", null);
            Bits actual = FieldCache.DEFAULT.getDocsWithField(r, "dv", null);
            assertEquals(expected, actual);
        }
        ir.close();
        dir.close();
    }

    private void doTestMissingVsFieldCache(final long minValue, final long maxValue) throws Exception {
        doTestMissingVsFieldCache(new LongProducer() {
            @Override
            long next() {
                return TestUtil.nextLong(random(), minValue, maxValue);
            }
        });
    }

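    // Minimal value source so each caller can supply its own range of longs.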
    static abstract class LongProducer {
        abstract long next();
    }

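    // Two Bits instances must agree in length and in every bit.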
    private void assertEquals(Bits expected, Bits actual) throws Exception {
        assertEquals(expected.length(), actual.length());
        for (int i = 0; i < expected.length(); i++) {
            assertEquals(expected.get(i), actual.get(i));
        }
    }

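    // Exhaustive comparison of two SortedDocValues: lockstep doc/ord/value
    // iteration, the full ord dictionary, and the TermsEnum views.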
    private void assertEquals(int maxDoc, SortedDocValues expected, SortedDocValues actual) throws Exception {
        // can be null for the segment if no docs actually had any SortedDocValues
        // in this case FC.getTermsIndex returns EMPTY
        if (actual == null) {
            assertEquals(expected.getValueCount(), 0);
            return;
        }
        assertEquals(expected.getValueCount(), actual.getValueCount());

        // compare ord lists
        while (true) {
            int docID = expected.nextDoc();
            if (docID == NO_MORE_DOCS) {
                assertEquals(NO_MORE_DOCS, actual.nextDoc());
                break;
            }
            assertEquals(docID, actual.nextDoc());
            assertEquals(expected.ordValue(), actual.ordValue());
            assertEquals(expected.binaryValue(), actual.binaryValue());
        }

        // compare ord dictionary
        for (long i = 0; i < expected.getValueCount(); i++) {
            final BytesRef expectedBytes = BytesRef.deepCopyOf(expected.lookupOrd((int) i));
            final BytesRef actualBytes = actual.lookupOrd((int) i);
            assertEquals(expectedBytes, actualBytes);
        }

        // compare termsenum
        assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
    }

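    // The SortedSet variant additionally walks each document's stream of ords
    // until NO_MORE_ORDS.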
    private void assertEquals(int maxDoc, SortedSetDocValues expected, SortedSetDocValues actual) throws Exception {
        // can be null for the segment if no docs actually had any SortedSetDocValues
        // in this case FC.getDocTermOrds returns EMPTY
        if (actual == null) {
            assertEquals(expected.getValueCount(), 0);
            return;
        }
        assertEquals(expected.getValueCount(), actual.getValueCount());
        while (true) {
            int docID = expected.nextDoc();
            assertEquals(docID, actual.nextDoc());
            if (docID == NO_MORE_DOCS) {
                break;
            }
            long expectedOrd;
            while ((expectedOrd = expected.nextOrd()) != NO_MORE_ORDS) {
                assertEquals(expectedOrd, actual.nextOrd());
            }
            assertEquals(NO_MORE_ORDS, actual.nextOrd());
        }

        // compare ord dictionary
        for (long i = 0; i < expected.getValueCount(); i++) {
            final BytesRef expectedBytes = BytesRef.deepCopyOf(expected.lookupOrd(i));
            final BytesRef actualBytes = actual.lookupOrd(i);
            assertEquals(expectedBytes, actualBytes);
        }

        // compare termsenum
        assertEquals(expected.getValueCount(), expected.termsEnum(), actual.termsEnum());
    }

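    // Drives both enums through next(), seekExact(ord), seekExact(BytesRef)
    // and seekCeil(BytesRef), sequentially and with random targets, asserting
    // identical ords and terms at every step.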
    private void assertEquals(long numOrds, TermsEnum expected, TermsEnum actual) throws Exception {
        BytesRef ref;

        // sequential next() through all terms
        while ((ref = expected.next()) != null) {
            assertEquals(ref, actual.next());
            assertEquals(expected.ord(), actual.ord());
            assertEquals(expected.term(), actual.term());
        }
        assertNull(actual.next());

        // sequential seekExact(ord) through all terms
        for (long i = 0; i < numOrds; i++) {
            expected.seekExact(i);
            actual.seekExact(i);
            assertEquals(expected.ord(), actual.ord());
            assertEquals(expected.term(), actual.term());
        }

        // sequential seekExact(BytesRef) through all terms
        for (long i = 0; i < numOrds; i++) {
            expected.seekExact(i);
            assertTrue(actual.seekExact(expected.term()));
            assertEquals(expected.ord(), actual.ord());
            assertEquals(expected.term(), actual.term());
        }

        // sequential seekCeil(BytesRef) through all terms
        for (long i = 0; i < numOrds; i++) {
            expected.seekExact(i);
            assertEquals(SeekStatus.FOUND, actual.seekCeil(expected.term()));
            assertEquals(expected.ord(), actual.ord());
            assertEquals(expected.term(), actual.term());
        }

        // random seekExact(ord)
        for (long i = 0; i < numOrds; i++) {
            long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
            expected.seekExact(randomOrd);
            actual.seekExact(randomOrd);
            assertEquals(expected.ord(), actual.ord());
            assertEquals(expected.term(), actual.term());
        }

        // random seekExact(BytesRef)
        for (long i = 0; i < numOrds; i++) {
            long randomOrd = TestUtil.nextLong(random(), 0, numOrds - 1);
            expected.seekExact(randomOrd);
            actual.seekExact(expected.term());
            assertEquals(expected.ord(), actual.ord());
            assertEquals(expected.term(), actual.term());
        }

        // random seekCeil(BytesRef)
        for (long i = 0; i < numOrds; i++) {
            BytesRef target = new BytesRef(TestUtil.randomUnicodeString(random()));
            SeekStatus expectedStatus = expected.seekCeil(target);
            assertEquals(expectedStatus, actual.seekCeil(target));
            if (expectedStatus != SeekStatus.END) {
                assertEquals(expected.ord(), actual.ord());
                assertEquals(expected.term(), actual.term());
            }
        }
    }

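    // Whether the doc-values format chosen for this field accepts binary
    // values larger than 32 KB.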
    protected boolean codecAcceptsHugeBinaryValues(String field) {
        String name = TestUtil.getDocValuesFormat(field);
        return !(name.equals("Memory")); // Direct has a different type of limit
    }
}