lux.IndexTest.java Source code

Introduction

Here is the source code for lux.IndexTest.java
Source

package lux;

import static lux.index.IndexConfiguration.*;
import static org.junit.Assert.*;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import javax.xml.stream.XMLStreamException;

import lux.index.FieldRole;
import lux.index.IndexConfiguration;
import lux.index.XmlIndexer;
import lux.index.field.FieldDefinition;
import lux.index.field.FieldDefinition.Type;
import lux.index.field.XPathField;
import lux.query.parser.XmlQueryParser;
import lux.search.LuxSearcher;
import net.sf.saxon.s9api.SaxonApiException;

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.surround.parser.ParseException;
import org.apache.lucene.queryparser.surround.parser.QueryParser;
import org.apache.lucene.queryparser.surround.query.BasicQueryFactory;
import org.apache.lucene.queryparser.surround.query.SrndQuery;
import org.apache.lucene.queryparser.xml.ParserException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;

/**
 * Measures space and time for different indexing options, and validates
 * indexing results.
 * 
 * The timings are off; we'd need to run this repeatedly to avoid transient
 * startup effects which overwhelm the measurements for a single run.
 * 
 * But the space numbers (in bytes) should be valid (from Directory.sizeInBytes()):
 * XML storage: 3664896 (3.5M)
 * qnames = 3692544 - 3664896 = 27648 = 0.75%
 * paths = 3717120 - 3664896 = 52224 = 1.4%
 * 
 * After refactoring XmlField, etc:
 * XML storage: 2274304  why did this shrink so much?  We're now using serializer instead
 * of JDOM outputter - could this all be whitespace from indentation or something?
 * qnames: 2302976 - 2274304 = 28672 = 1.3%
 * paths: 2328576 - 2274304 = 54272 = 2.4%
 * path-occurrences = 122880 = 5.1%
 * path-values alone: 755712
 * path-values (w/docs): 2714624 - 2274304 = 19%
 * qname-values (as phrases): 2631680 - 2274304 = 357376 = 16%
 * qname-values (hashed into single tokens): 2542592 - 2274304 = 11.8%
 * qname-words w/o terminal tokens: 2683904 - 2274304 = 18%
 * qname-words + terminal tokens: 2786304 - 2274304 = 22%
 * full-text (with all nodes transparent) 3899392 - 2274304 = 1625088 = 71% (1940480 full text alone)
 * full-text (text only) 2673664 - 2274304 = 399360 = 18%
 * full-text (text plus all nodes opaque) 3068928 - 2274304 = 35%
 * 
 */
public class IndexTest {

    /**
     * When true, every index build is repeated several times (each time into a fresh directory)
     * so that the printed timings are less dominated by one-off startup effects.
     */
    private static final boolean GATHER_TIMING = false;

    /** Number of documents a match-all query must find after lux/hamlet.xml has been indexed. */
    private static final int EXPECTED_TOTAL_DOCS = 6641;

    /** In-memory index directory; created fresh before each test and closed after it. */
    private RAMDirectory dir;

    @Before
    public void setup() {
        dir = new RAMDirectory();
    }

    @After
    public void cleanup() {
        dir.close();
    }

    /** Discards the current directory and starts over with an empty one. */
    private void reset() {
        dir.close();
        dir = new RAMDirectory();
    }

    // ------------------------------------------------------------------
    // whole-corpus (hamlet.xml) index builds
    // ------------------------------------------------------------------

    @Test
    public void testIndexPaths() throws Exception {
        buildIndex("paths and xml", INDEX_PATHS | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexPathsOnly() throws Exception {
        IndexTestSupport indexTestSupport = buildIndex("paths", INDEX_PATHS | BUILD_DOCUMENT);
        assertTotalDocs();
        assertPathQuery(indexTestSupport);
    }

    @Test
    public void testIndexQNames() throws Exception {
        buildIndex("qnames and xml", INDEX_QNAMES | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexQNamesOnly() throws Exception {
        buildIndex("qnames", INDEX_QNAMES | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexPathOccurOnly() throws Exception {
        buildIndex("path-occurrences", INDEX_PATHS | INDEX_EACH_PATH | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexFullText() throws Exception {
        buildIndex("full-text", INDEX_FULLTEXT | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexFullTextOnly() throws Exception {
        buildIndex("full-text-only", INDEX_FULLTEXT);
        assertTotalDocs();
    }

    @Test
    public void testIndexPathValuesOnly() throws Exception {
        IndexTestSupport indexTestSupport = buildIndex("path-values", INDEX_PATHS | INDEX_VALUES | BUILD_DOCUMENT);
        assertTotalDocs();
        assertPathQuery(indexTestSupport);
    }

    @Test
    public void testIndexPathText() throws Exception {
        IndexTestSupport indexTestSupport = buildIndex("path-text", INDEX_PATHS | INDEX_FULLTEXT | BUILD_DOCUMENT);
        assertTotalDocs();
        assertPathQuery(indexTestSupport);
    }

    @Test
    public void testIndexQNameValues() throws Exception {
        buildIndex("qname-values and docs", INDEX_QNAMES | INDEX_VALUES | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexQNameText() throws Exception {
        IndexTestSupport indexTestSupport = buildIndex("qname-text and docs",
                INDEX_QNAMES | INDEX_FULLTEXT | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertFullTextQuery(indexTestSupport, "PERSONA", "ROSENCRANTZ", 4);
        assertTotalDocs();
    }

    @Test
    public void testIndexQNameTextOnly() throws Exception {
        buildIndex("qname-text", INDEX_QNAMES | INDEX_FULLTEXT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexPathValues() throws Exception {
        buildIndex("path-values and docs", INDEX_PATHS | INDEX_VALUES | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testIndexQNamesAndPaths() throws Exception {
        IndexTestSupport its = buildIndex("qnames and paths and docs",
                INDEX_QNAMES | INDEX_PATHS | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
        // NOTE(review): the first index is closed before a second one is built into the same
        // directory - presumably so the second build can open its own writer; confirm.
        its.close();
        buildIndex("qnames and paths", INDEX_QNAMES | INDEX_PATHS | BUILD_DOCUMENT);
    }

    @Test
    public void testIndexQNamesAndPathsOnly() throws Exception {
        buildIndex("qnames and paths", INDEX_QNAMES | INDEX_PATHS | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testStoreDocuments() throws Exception {
        buildIndex("xml storage", STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    @Test
    public void testStoreBinaryDocs() throws Exception {
        buildIndex("xml binary storage", STORE_TINY_BINARY | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    // ------------------------------------------------------------------
    // single-document tests
    // ------------------------------------------------------------------

    @Test
    public void testIndexFullTextOneDoc() throws Exception {
        XmlIndexer indexer = new XmlIndexer(INDEX_FULLTEXT);
        indexSingleDocument(indexer, "/lux/reader-test.xml", "lux/reader-test.xml");
        System.out.println(
                String.format("indexed full-text for lux/reader-test.xml in %d bytes", dir.sizeInBytes()));
        IndexTestSupport.printAllTerms(dir, indexer);
        // No assertions yet. A disabled check used to run
        // assertFullTextQuery(indexTestSupport, "title", "TEST", 1) against an IndexTestSupport
        // built over this directory.
    }

    @Test
    public void testStoreBinary() throws Exception {
        XmlIndexer indexer = new XmlIndexer(STORE_DOCUMENT);
        IndexWriter indexWriter = indexer.newIndexWriter(dir);
        try {
            InputStream in = openResource("lux/compiler/test-module.xqy");
            try {
                indexer.storeDocument(indexWriter, "/lux/compiler/test-module.xqy", in);
            } finally {
                in.close();
            }
        } finally {
            // release the writer even when storing fails
            indexWriter.close();
        }
        System.out.println(String.format("stored test-module.xqy in %d bytes", dir.sizeInBytes()));
    }

    @Test
    @Ignore
    public void testIndexPathValuesOneDoc() throws Exception {
        XmlIndexer indexer = new XmlIndexer(INDEX_PATHS | INDEX_VALUES);
        indexSingleDocument(indexer, "/lux/hamlet.xml", "lux/hamlet.xml");
        System.out.println(String.format("indexed path-values for hamlet.xml in %d bytes", dir.sizeInBytes()));
        // hamlet.xml = 288815 bytes; indexed in 215040 bytes seems ok??
    }

    // ------------------------------------------------------------------
    // XPath-defined fields
    // ------------------------------------------------------------------

    @Test
    public void testXPathIndexes() throws Exception {
        XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT);
        indexer.getConfiguration()
                .addField(new XPathField("nodecount", "count(//node())", null, Store.NO, Type.INT));
        indexer.getConfiguration().addField(new XPathField("doctype", "name(/*)", null, Store.NO, Type.STRING));
        IndexTestSupport indexTestSupport = buildIndex("xpath", indexer);
        assertXPathIntField(indexTestSupport);
        assertXPathStringField(5, "doctype", "ACT", indexTestSupport);
        if (GATHER_TIMING) {
            for (int i = 0; i < 5; i++) {
                reset();
                indexTestSupport = buildIndex("xpath", indexer);
            }
        }
    }

    @Test
    public void testMultipleXPathIndexes() throws Exception {
        XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT);
        // SCENE comes in as ACT/*[2] - immediately following TITLE
        // These can be encoded within a single XPath - we don't allow multiple indexes with the same name
        indexer.getConfiguration()
                .addField(new XPathField("x", "name(/*/*[2]),name(/*)", null, Store.NO, Type.STRING));
        IndexTestSupport indexTestSupport = buildIndex("xpath", indexer);
        assertXPathStringField(25, "x", "SCENE", indexTestSupport);
    }

    @Test
    public void testMultipleXPathIndexesFail() throws Exception {
        XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT);
        // SCENE comes in as ACT/*[2] - immediately following TITLE
        indexer.getConfiguration().addField(new XPathField("x", "name(/*/*[2])", null, Store.NO, Type.STRING));
        try {
            indexer.getConfiguration().addField(new XPathField("x", "name(/*)", null, Store.NO, Type.STRING));
            fail("expected exception not thrown");
        } catch (IllegalStateException e) {
            assertEquals("Duplicate field name: x", e.getMessage());
        }
    }

    @Test
    public void testXPathIndexNamespace() throws Exception {
        IndexConfiguration indexConfig = new IndexConfiguration();
        indexConfig.defineNamespaceMapping("", "");
        indexConfig.defineNamespaceMapping("x", "http://lux.net{test}");
        indexConfig.addField(new XPathField("title", "//x:title", new KeywordAnalyzer(), Store.NO, Type.STRING));
        XmlIndexer indexer = new XmlIndexer(indexConfig);
        IndexTestSupport indexTestSupport = new IndexTestSupport("lux/reader-test-ns.xml", indexer, dir);
        assertXPathStringField(2, "title", "TEST", indexTestSupport);
    }

    // ------------------------------------------------------------------
    // assertions
    // ------------------------------------------------------------------

    /**
     * Runs a surround-syntax ordered query for "ACT" followed by "SCENE" against the path field
     * and expects five matching documents.
     */
    private void assertPathQuery(IndexTestSupport indexTestSupport) throws ParseException, IOException {
        SrndQuery q = new QueryParser().parse2("w(w({},\"ACT\"),\"SCENE\")");
        Query q2 = q.makeLuceneQueryFieldNoBoost(
                indexTestSupport.indexer.getConfiguration().getFieldName(FieldRole.PATH), new BasicQueryFactory());
        assertEquals(5, countMatches(indexTestSupport.searcher.search(q2)));
    }

    /**
     * Builds a {@code QNameTextQuery} XML query for the given element name and term, runs it
     * against the element-text field, and checks the number of matching documents.
     *
     * @param indexTestSupport supplies the searcher and the indexer configuration
     * @param qName element name placed in the query's {@code qName} attribute
     * @param term text placed in the query body
     * @param expectedCount number of documents the query must match
     */
    private void assertFullTextQuery(IndexTestSupport indexTestSupport, String qName, String term,
            int expectedCount) throws IOException, ParserException {
        IndexConfiguration config = indexTestSupport.indexer.getConfiguration();
        FieldDefinition field = config.getField(FieldRole.ELEMENT_TEXT);
        String queryXml = "<QNameTextQuery fieldName=\"" + config.getFieldName(FieldRole.ELEMENT_TEXT)
                + "\" qName=\"" + qName + "\">" + term + "</QNameTextQuery>";
        // The query has no XML declaration, so a parser reads it as UTF-8; encode it explicitly
        // instead of relying on the platform default charset.
        Query q = new XmlQueryParser(field.getName(), field.getAnalyzer())
                .parse(new ByteArrayInputStream(queryXml.getBytes("UTF-8")));
        assertEquals(expectedCount, countMatches(indexTestSupport.searcher.search(q)));
    }

    /** Expects exactly one document whose "nodecount" field lies in [6000, 20000]. */
    private void assertXPathIntField(IndexTestSupport indexTestSupport) throws IOException {
        Query q = NumericRangeQuery.newIntRange("nodecount", 6000, 20000, true, true);
        assertEquals(1, countMatches(indexTestSupport.searcher.search(q)));
    }

    /**
     * Expects {@code expectedCount} documents to match an exact term query.
     *
     * @param expectedCount number of documents that must match
     * @param field name of the field to query
     * @param term exact term to look up
     * @param indexTestSupport supplies the searcher
     */
    private void assertXPathStringField(int expectedCount, String field, String term,
            IndexTestSupport indexTestSupport) throws IOException {
        Query q = new TermQuery(new Term(field, term));
        int count = countMatches(indexTestSupport.searcher.search(q));
        assertEquals("Wrong number of matches for " + q.toString(), expectedCount, count);
    }

    /** Opens a new searcher over {@link #dir} and checks that it holds the expected number of documents. */
    private void assertTotalDocs() throws IOException {
        LuxSearcher searcher = new LuxSearcher(dir);
        try {
            assertEquals(EXPECTED_TOTAL_DOCS, countMatches(searcher.search(new MatchAllDocsQuery())));
        } finally {
            // close the searcher even when the count assertion fails
            searcher.close();
        }
    }

    // ------------------------------------------------------------------
    // helpers
    // ------------------------------------------------------------------

    /** Advances the iterator to exhaustion and returns how many documents it yielded. */
    private static int countMatches(DocIdSetIterator iter) throws IOException {
        int count = 0;
        while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            ++count;
        }
        return count;
    }

    /**
     * Opens a classpath resource, failing the test with a clear message if it is missing.
     * The caller is responsible for closing the returned stream.
     */
    private InputStream openResource(String path) {
        InputStream in = getClass().getClassLoader().getResourceAsStream(path);
        assertNotNull("test resource not found on classpath: " + path, in);
        return in;
    }

    /**
     * Indexes one classpath resource into {@link #dir} under the given uri, closing both the
     * index writer and the resource stream whether or not indexing succeeds.
     */
    private void indexSingleDocument(XmlIndexer indexer, String uri, String resourcePath) throws Exception {
        IndexWriter indexWriter = indexer.newIndexWriter(dir);
        try {
            InputStream in = openResource(resourcePath);
            try {
                indexer.indexDocument(indexWriter, uri, in);
            } finally {
                in.close();
            }
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Builds a hamlet.xml index with a new indexer configured from {@code options}. When
     * {@link #GATHER_TIMING} is set, the build is repeated three more times, each into a fresh
     * directory with a fresh indexer, and the last result is returned.
     */
    private IndexTestSupport buildIndex(String desc, int options)
            throws XMLStreamException, IOException, SaxonApiException {
        XmlIndexer indexer = new XmlIndexer(options);
        IndexTestSupport index = buildIndex(desc, indexer);
        if (GATHER_TIMING) {
            for (int i = 0; i < 3; i++) {
                reset();
                indexer = new XmlIndexer(options);
                index = buildIndex(desc, indexer);
            }
        }
        return index;
    }

    /**
     * Builds a hamlet.xml index in {@link #dir} using the given indexer and prints the elapsed
     * time and resulting directory size, labelled with {@code desc}.
     */
    private IndexTestSupport buildIndex(String desc, XmlIndexer indexer)
            throws XMLStreamException, IOException, SaxonApiException {
        long t0 = System.currentTimeMillis();
        IndexTestSupport indexTestSupport = new IndexTestSupport("lux/hamlet.xml", indexer, dir);
        System.out.println(String.format("indexed %s in %d ms %d bytes", desc, (System.currentTimeMillis() - t0),
                dir.sizeInBytes()));
        return indexTestSupport;
    }

    /** Debugging aid: dumps every indexed term; call it temporarily from a test when needed. */
    @SuppressWarnings("unused")
    private void printAllTerms(IndexTestSupport indexTestSupport) throws Exception {
        indexTestSupport.printAllTerms();
    }

}

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */