Java tutorial
package lux; import static lux.index.IndexConfiguration.*; import static org.junit.Assert.*; import java.io.ByteArrayInputStream; import java.io.IOException; import javax.xml.stream.XMLStreamException; import lux.index.FieldRole; import lux.index.IndexConfiguration; import lux.index.XmlIndexer; import lux.index.field.FieldDefinition; import lux.index.field.FieldDefinition.Type; import lux.index.field.XPathField; import lux.query.parser.XmlQueryParser; import lux.search.LuxSearcher; import net.sf.saxon.s9api.SaxonApiException; import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.surround.parser.ParseException; import org.apache.lucene.queryparser.surround.parser.QueryParser; import org.apache.lucene.queryparser.surround.query.BasicQueryFactory; import org.apache.lucene.queryparser.surround.query.SrndQuery; import org.apache.lucene.queryparser.xml.ParserException; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.RAMDirectory; import org.junit.After; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; /** * Measures space and time for different indexing options, and validates * indexing results. * * The timings are off; we'd need to run this repeatedly to avoid transient * startup effects which overwhelm the measurements for a single run. * * But the space numbers (in bytes) should be valid (from Directory.sizeInBytes()): * XML storage: 3664896 (3.5M) * qnames = 3692544 - 3664896 = 27648 = 0.75% * paths = 3717120 - 3664896 = 52224 = 1.4% * * After refactoring XmlField, etc: * XML storage: 2274304 why did this shrink so much? We're now using serializer instead * of JDOM outputter - could this all be whitespace from indentation or something? * qnames: 2302976 - 2274304 = 28672 = 1.3% * paths: 2328576 - 2274304 = 54272 = 2.4% * path-occurrences = 122880 = 5.1% * path-values alone: 755712 * path-values (w/docs): 2714624 - 2274304 = 19% * qname-values (as phrases): 2631680 - 2274304 = 357376 = 16% * qname-values (hashed into single tokens): 2542592 - 2274304 = 11.8% * qname-words w/o terminal tokens: 2683904 - 2274304 = 18% * qname-words + terminal tokens: 2786304 - 2274304 = 22% * full-text (with all nodes transparent) 3899392 - 2274304 = 1625088 = 71% (1940480 full text alone) * full-text (text only) 2673664 - 2274304 = 399360 = 18% * full-text (text plus all nodes opaque) 3068928 - 2274304 = 35% * */ public class IndexTest { private static final boolean GATHER_TIMING = false; private RAMDirectory dir; @Test public void testIndexPaths() throws Exception { buildIndex("paths and xml", INDEX_PATHS | STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); } private void reset() { dir.close(); dir = new RAMDirectory(); } @Test public void testIndexPathsOnly() throws Exception { IndexTestSupport indexTestSupport = buildIndex("paths", INDEX_PATHS | BUILD_DOCUMENT); assertTotalDocs(); // printAllTerms(indexTestSupport); assertPathQuery(indexTestSupport); } @Test public void testIndexQNames() throws Exception { buildIndex("qnames and xml", INDEX_QNAMES | STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); } @Test public void testIndexQNamesOnly() throws Exception { buildIndex("qnames", INDEX_QNAMES | BUILD_DOCUMENT); assertTotalDocs(); } @Test public void testIndexPathOccurOnly() throws Exception { // IndexTestSupport indexTestSupport = buildIndex("path-occurrences", INDEX_PATHS | INDEX_EACH_PATH | BUILD_DOCUMENT); // printAllTerms(indexTestSupport); assertTotalDocs(); } @Test public void testIndexFullText() throws Exception { buildIndex("full-text", INDEX_FULLTEXT | STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); // printAllTerms(indexTestSupport); } @Test public void testIndexFullTextOnly() throws Exception { //IndexTestSupport indexTestSupport = buildIndex("full-text-only", INDEX_FULLTEXT); assertTotalDocs(); //printAllTerms(indexTestSupport); } private void assertPathQuery(IndexTestSupport indexTestSupport) throws ParseException, IOException { SrndQuery q = new QueryParser().parse2("w(w({},\"ACT\"),\"SCENE\")"); Query q2 = q.makeLuceneQueryFieldNoBoost( indexTestSupport.indexer.getConfiguration().getFieldName(FieldRole.PATH), new BasicQueryFactory()); DocIdSetIterator iter = indexTestSupport.searcher.search(q2); int count = 0; while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { ++count; } assertEquals(5, count); } private void assertFullTextQuery(IndexTestSupport indexTestSupport, String qName, String term, int expectedCount) throws IOException, ParserException { LuxSearcher searcher = indexTestSupport.searcher; XmlIndexer indexer = indexTestSupport.indexer; IndexConfiguration config = indexer.getConfiguration(); FieldDefinition field = config.getField(FieldRole.ELEMENT_TEXT); Query q = new XmlQueryParser(field.getName(), field.getAnalyzer()).parse(new ByteArrayInputStream( ("<QNameTextQuery fieldName=\"" + config.getFieldName(FieldRole.ELEMENT_TEXT) + "\" qName=\"" + qName + "\">" + term + "</QNameTextQuery>").getBytes())); DocIdSetIterator iter = searcher.search(q); int count = 0; while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { ++count; } assertEquals(expectedCount, count); } private void assertXPathIntField(IndexTestSupport indexTestSupport) throws ParseException, IOException { Query q = NumericRangeQuery.newIntRange("nodecount", 6000, 20000, true, true); DocIdSetIterator iter = indexTestSupport.searcher.search(q); int count = 0; while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { ++count; } assertEquals(1, count); } private void assertXPathStringField(int expectedCount, String field, String term, IndexTestSupport indexTestSupport) throws ParseException, IOException { Query q = new TermQuery(new Term(field, term)); DocIdSetIterator iter = indexTestSupport.searcher.search(q); int count = 0; while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { ++count; } assertEquals("Wrong number of matches for " + q.toString(), expectedCount, count); } @Test public void testIndexFullTextOneDoc() throws Exception { XmlIndexer indexer = new XmlIndexer(INDEX_FULLTEXT); IndexWriter indexWriter = indexer.newIndexWriter(dir); indexer.indexDocument(indexWriter, "/lux/reader-test.xml", getClass().getClassLoader().getResourceAsStream("lux/reader-test.xml")); indexWriter.close(); System.out.println( String.format("indexed path-values for lux/reader-test.xml in %d bytes", dir.sizeInBytes())); IndexTestSupport.printAllTerms(dir, indexer); /* IndexTestSupport indexTestSupport = new IndexTestSupport ("lux/hamlet.xml", indexer, dir); assertFullTextQuery (indexTestSupport, "title", "TEST", 1); */ } @Test public void testStoreBinary() throws Exception { XmlIndexer indexer = new XmlIndexer(STORE_DOCUMENT); IndexWriter indexWriter = indexer.newIndexWriter(dir); indexer.storeDocument(indexWriter, "/lux/compiler/test-module.xqy", getClass().getClassLoader().getResourceAsStream("lux/compiler/test-module.xqy")); indexWriter.close(); System.out.println(String.format("indexed path-values for test-module.xqy in %d bytes", dir.sizeInBytes())); } @Test @Ignore public void testIndexPathValuesOneDoc() throws Exception { XmlIndexer indexer = new XmlIndexer(INDEX_PATHS | INDEX_VALUES); IndexWriter indexWriter = indexer.newIndexWriter(dir); indexer.indexDocument(indexWriter, "/lux/hamlet.xml", getClass().getClassLoader().getResourceAsStream("lux/hamlet.xml")); indexWriter.close(); System.out.println(String.format("indexed path-values for hamlet.xml in %d bytes", dir.sizeInBytes())); // hamlet.xml = 288815 bytes; indexed in 215040 bytes seems ok?? // printAllTerms(new IndexTestSupport(indexer, dir)); } @Test public void testIndexPathValuesOnly() throws Exception { IndexTestSupport indexTestSupport = buildIndex("path-values", INDEX_PATHS | INDEX_VALUES | BUILD_DOCUMENT); assertTotalDocs(); assertPathQuery(indexTestSupport); } @Test public void testIndexPathText() throws Exception { IndexTestSupport indexTestSupport = buildIndex("path-text", INDEX_PATHS | INDEX_FULLTEXT | BUILD_DOCUMENT); assertTotalDocs(); assertPathQuery(indexTestSupport); } @Test public void testIndexQNameValues() throws Exception { buildIndex("qname-values and docs", INDEX_QNAMES | INDEX_VALUES | STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); } @Test public void testIndexQNameText() throws Exception { IndexTestSupport indexTestSupport = buildIndex("qname-text and docs", INDEX_QNAMES | INDEX_FULLTEXT | STORE_DOCUMENT | BUILD_DOCUMENT); assertFullTextQuery(indexTestSupport, "PERSONA", "ROSENCRANTZ", 4); assertTotalDocs(); } @Test public void testIndexQNameTextOnly() throws Exception { // IndexTestSupport indexTestSupport = buildIndex("qname-text", INDEX_QNAMES | INDEX_FULLTEXT | BUILD_DOCUMENT); assertTotalDocs(); // printAllTerms(indexTestSupport); } @Test public void testIndexPathValues() throws Exception { buildIndex("path-values and docs", INDEX_PATHS | INDEX_VALUES | STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); } @Test public void testIndexQNamesAndPaths() throws Exception { IndexTestSupport its = buildIndex("qnames and paths and docs", INDEX_QNAMES | INDEX_PATHS | STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); its.close(); buildIndex("qnames and paths", INDEX_QNAMES | INDEX_PATHS | BUILD_DOCUMENT); } @Test public void testIndexQNamesAndPathsOnly() throws Exception { buildIndex("qnames and paths", INDEX_QNAMES | INDEX_PATHS | BUILD_DOCUMENT); assertTotalDocs(); } @Test public void testStoreDocuments() throws Exception { buildIndex("xml storage", STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); } @Test public void testStoreBinaryDocs() throws Exception { buildIndex("xml binary storage", STORE_TINY_BINARY | STORE_DOCUMENT | BUILD_DOCUMENT); assertTotalDocs(); } @Test public void testXPathIndexes() throws Exception { XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT); indexer.getConfiguration() .addField(new XPathField("nodecount", "count(//node())", null, Store.NO, Type.INT)); indexer.getConfiguration().addField(new XPathField("doctype", "name(/*)", null, Store.NO, Type.STRING)); IndexTestSupport indexTestSupport = buildIndex("xpath", indexer); assertXPathIntField(indexTestSupport); assertXPathStringField(5, "doctype", "ACT", indexTestSupport); if (GATHER_TIMING) { for (int i = 0; i < 5; i++) { reset(); indexTestSupport = buildIndex("xpath", indexer); } } } @Test public void testMultipleXPathIndexes() throws Exception { XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT); // SCENE comes in as ACT/*[2] - immediately following TITLE // These can be encoded within a single XPath - we don't allow multiple indexes with the same name indexer.getConfiguration() .addField(new XPathField("x", "name(/*/*[2]),name(/*)", null, Store.NO, Type.STRING)); IndexTestSupport indexTestSupport = buildIndex("xpath", indexer); assertXPathStringField(25, "x", "SCENE", indexTestSupport); } @Test public void testMultipleXPathIndexesFail() throws Exception { XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT); // SCENE comes in as ACT/*[2] - immediately following TITLE indexer.getConfiguration().addField(new XPathField("x", "name(/*/*[2])", null, Store.NO, Type.STRING)); try { indexer.getConfiguration().addField(new XPathField("x", "name(/*)", null, Store.NO, Type.STRING)); assertTrue("expected exception not thrown", false); } catch (IllegalStateException e) { assertEquals("Duplicate field name: x", e.getMessage()); } } @Test public void testXPathIndexNamespace() throws Exception { IndexConfiguration indexConfig = new IndexConfiguration(); indexConfig.defineNamespaceMapping("", ""); indexConfig.defineNamespaceMapping("x", "http://lux.net{test}"); indexConfig.addField(new XPathField("title", "//x:title", new KeywordAnalyzer(), Store.NO, Type.STRING)); XmlIndexer indexer = new XmlIndexer(indexConfig); IndexTestSupport indexTestSupport = new IndexTestSupport("lux/reader-test-ns.xml", indexer, dir); assertXPathStringField(2, "title", "TEST", indexTestSupport); } @Before public void setup() { dir = new RAMDirectory(); } @After public void cleanup() { dir.close(); } private IndexTestSupport buildIndex(String desc, int options) throws XMLStreamException, IOException, SaxonApiException { XmlIndexer indexer = new XmlIndexer(options); IndexTestSupport index = buildIndex(desc, indexer); if (GATHER_TIMING) { for (int i = 0; i < 3; i++) { reset(); indexer = new XmlIndexer(options); index = buildIndex(desc, indexer); } } return index; } private IndexTestSupport buildIndex(String desc, XmlIndexer indexer) throws XMLStreamException, IOException, SaxonApiException { long t0 = System.currentTimeMillis(); IndexTestSupport indexTestSupport = new IndexTestSupport("lux/hamlet.xml", indexer, dir); System.out.println(String.format("indexed %s in %d ms %d bytes", desc, (System.currentTimeMillis() - t0), dir.sizeInBytes())); return indexTestSupport; } @SuppressWarnings("unused") private void printAllTerms(IndexTestSupport indexTestSupport) throws Exception { indexTestSupport.printAllTerms(); } private void assertTotalDocs() throws IOException { LuxSearcher searcher = new LuxSearcher(dir); DocIdSetIterator results = searcher.search(new MatchAllDocsQuery()); int count = 0; while (results.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { ++count; } assertEquals(6641, count); /* */ searcher.close(); } } /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */