nl.inl.blacklab.search.Searcher.java Source code

Introduction

Here is the source code for nl.inl.blacklab.search.Searcher.java
Source

/*******************************************************************************
 * Copyright (c) 2010, 2012 Institute for Dutch Lexicology
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package nl.inl.blacklab.search;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.JarURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.jar.Attributes;
import java.util.jar.Manifest;

import nl.inl.blacklab.analysis.BLDutchAnalyzer;
import nl.inl.blacklab.analysis.BLNonTokenizingAnalyzer;
import nl.inl.blacklab.analysis.BLStandardAnalyzer;
import nl.inl.blacklab.analysis.BLWhitespaceAnalyzer;
import nl.inl.blacklab.externalstorage.ContentAccessorContentStore;
import nl.inl.blacklab.externalstorage.ContentStore;
import nl.inl.blacklab.externalstorage.ContentStoreDir;
import nl.inl.blacklab.externalstorage.ContentStoreDirAbstract;
import nl.inl.blacklab.externalstorage.ContentStoreDirUtf8;
import nl.inl.blacklab.externalstorage.ContentStoreDirZip;
import nl.inl.blacklab.forwardindex.ForwardIndex;
import nl.inl.blacklab.forwardindex.Terms;
import nl.inl.blacklab.highlight.XmlHighlighter;
import nl.inl.blacklab.highlight.XmlHighlighter.HitCharSpan;
import nl.inl.blacklab.highlight.XmlHighlighter.UnbalancedTagsStrategy;
import nl.inl.blacklab.index.complex.ComplexFieldUtil;
import nl.inl.blacklab.perdocument.DocResults;
import nl.inl.blacklab.search.indexstructure.ComplexFieldDesc;
import nl.inl.blacklab.search.indexstructure.IndexStructure;
import nl.inl.blacklab.search.indexstructure.MetadataFieldDesc;
import nl.inl.blacklab.search.indexstructure.PropertyDesc;
import nl.inl.blacklab.search.lucene.SpanQueryFiltered;
import nl.inl.blacklab.search.lucene.TextPatternTranslatorSpanQuery;
import nl.inl.util.ExUtil;
import nl.inl.util.LuceneUtil;
import nl.inl.util.Utilities;
import nl.inl.util.VersionFile;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Bits;

/**
 * The main interface into the BlackLab library. The Searcher object is instantiated with an open
 * Lucene IndexReader and accesses that index through special methods.
 *
 * The Searcher object knows how to access the original contents of indexed fields, either because
 * the field is a stored field in the Lucene index, or because it knows where else the content can
 * be found (such as in fixed-length-encoding files, for fast random access).
 *
 * Searcher is thread-safe: a single instance may be shared to perform a number of simultaneous
 * searches.
 */
public class Searcher {

    protected static final Logger logger = Logger.getLogger(Searcher.class);

    /** Complex field name for default contents field */
    public static final String DEFAULT_CONTENTS_FIELD_NAME = "contents";

    /** Whether or not to automatically warm up the forward indices in a background thread at startup */
    private static boolean autoWarmForwardIndices = false;

    /** The collator to use for sorting. Defaults to English collator. */
    private static Collator defaultCollator = Collator.getInstance(new Locale("en", "GB"));

    /** Analyzer based on WhitespaceTokenizer */
    private static BLWhitespaceAnalyzer whitespaceAnalyzer;

    /** Analyzer for Dutch and other Latin script languages */
    private static BLDutchAnalyzer defaultAnalyzer;

    /** Analyzer based on StandardTokenizer */
    private static BLStandardAnalyzer standardAnalyzer;

    /** Analyzer that doesn't tokenize */
    private static BLNonTokenizingAnalyzer nonTokenizingAnalyzer;

    static {
        // Create the various analyzer objects we'll be using for metadata fields.
        whitespaceAnalyzer = new BLWhitespaceAnalyzer();
        defaultAnalyzer = new BLDutchAnalyzer();
        standardAnalyzer = new BLStandardAnalyzer();
        nonTokenizingAnalyzer = new BLNonTokenizingAnalyzer();
    }

    /** The collator to use for sorting. Defaults to English collator. */
    private Collator collator = defaultCollator;

    /**
     * ContentAccessors tell us how to get a field's content:
     * <ol>
     * <li>if there is no contentaccessor: get it from the Lucene index (stored field)</li>
     * <li>from an external source (file, database) if it's not (because the content is very large
     * and/or we want faster random access to the content than a stored field can provide)</li>
     * </ol>
     *
     * Indexed by complex field name.
     */
    private Map<String, ContentAccessor> contentAccessors = new HashMap<String, ContentAccessor>();

    /**
     * ForwardIndices allow us to quickly find what token occurs at a specific position. This speeds
     * up grouping and sorting. There may be several indices on a complex field, e.g.: word form,
     * lemma, part of speech.
     *
     * Indexed by property name.
     */
    private Map<String, ForwardIndex> forwardIndices = new HashMap<String, ForwardIndex>();

    /**
     * The Lucene index reader
     */
    private DirectoryReader reader;

    /**
     * The Lucene IndexSearcher, for dealing with non-Span queries (for per-document scoring)
     */
    private IndexSearcher indexSearcher;

    /**
     * Name of the main contents field (used as default parameter value for many methods)
     */
    public String mainContentsFieldName;

    /** Default number of words around a hit */
    private int defaultContextSize = 5;

    /** Should we default to case-sensitive searching? [false] */
    private boolean defaultCaseSensitive = false;

    /** Should we default to diacritics-sensitive searching? [false] */
    private boolean defaultDiacriticsSensitive = false;

    /**
     * Directory where our index resides
     */
    private File indexLocation;

    /** Structure of our index */
    private IndexStructure indexStructure;

    /** Do we want to retrieve concordances from the forward index instead of from the
     *  content store? Generating them from the forward index is more
     *  efficient.
     *
     *  This is set to true for all modern indices.
     *  (to be precise, it's set to true iff a punctuation forward index is present)
     *
     *  This setting controls the default. You don't have to set this to false if you
     *  sometimes want concordances from the content store; you can specifically request
     *  those when you need them.
     */
    private ConcordanceType defaultConcsType = ConcordanceType.CONTENT_STORE;
    //private boolean concordancesFromForwardIndex = false;

    /** Forward index to use as text context of &lt;w/&gt; tags in concordances (words; null = no text content) */
    private String concWordFI = "word";

    /** Forward index to use as text context between &lt;w/&gt; tags in concordances (punctuation+whitespace; null = just a space) */
    private String concPunctFI = ComplexFieldUtil.PUNCTUATION_PROP_NAME;

    /** Forward indices to use as attributes of &lt;w/&gt; tags in concordances (null = the rest) */
    private Collection<String> concAttrFI = null; // all other FIs are attributes

    /** How we fix well-formedness for snippets of XML: by adding or removing unbalanced tags */
    private UnbalancedTagsStrategy defaultUnbalancedTagsStrategy = UnbalancedTagsStrategy.ADD_TAG;

    /**
     * How do we fix well-formedness for snippets of XML?
     * @return the setting: either adding or removing unbalanced tags
     */
    public UnbalancedTagsStrategy getDefaultUnbalancedTagsStrategy() {
        return defaultUnbalancedTagsStrategy;
    }

    /**
     * Set how to fix well-formedness for snippets of XML.
     * @param strategy the setting: either adding or removing unbalanced tags
     */
    public void setDefaultUnbalancedTagsStrategy(UnbalancedTagsStrategy strategy) {
        this.defaultUnbalancedTagsStrategy = strategy;
    }

    /**
     * Are we making concordances using the forward index (true) or using
     * the content store (false)? Forward index is more efficient but returns
     * concordances that don't include XML tags.
     *
     * @return true iff we use the forward index for making concordances.
     * @deprecated use getDefaultConcordanceType
     */
    @Deprecated
    public boolean getMakeConcordancesFromForwardIndex() {
        return getDefaultConcordanceType() == ConcordanceType.FORWARD_INDEX;
    }

    public ConcordanceType getDefaultConcordanceType() {
        return defaultConcsType;
    }

    public void setDefaultConcordanceType(ConcordanceType type) {
        defaultConcsType = type;
    }

    /**
     * Do our concordances include the original XML tags, or are they stripped out?
     *
     * @return true iff our concordances include XML tags.
     * @deprecated always returns true now
     */
    @Deprecated
    public boolean concordancesIncludeXmlTags() {
        return true;
    }

    /**
     * Do we want to retrieve concordances from the forward index instead of from the
     * content store? This may be more efficient, particularly for small result sets
     * (because it eliminates seek time and decompression time), but concordances won't
     * include XML tags.
     *
     * Also, if there is no punctuation forward index ("punct"), concordances won't include
     * punctuation.
     *
     * @param concordancesFromForwardIndex true if we want to use the forward index to make
     * concordances.
     * @deprecated use setDefaultConcordanceType()
     */
    @Deprecated
    public void setMakeConcordancesFromForwardIndex(boolean concordancesFromForwardIndex) {
        setDefaultConcordanceType(
                concordancesFromForwardIndex ? ConcordanceType.FORWARD_INDEX : ConcordanceType.CONTENT_STORE);
    }

    /** If true, we want to add/delete documents. If false, we're just searching. */
    private boolean indexMode = false;

    /** If true, we've just created a new index. New indices cannot be searched, only added to. */
    private boolean isEmptyIndex = false;

    /** The index writer. Only valid in indexMode. */
    private IndexWriter indexWriter = null;

    /** Thread that automatically warms up the forward indices, if enabled. */
    private Thread autoWarmThread;

    /** Analyzer used for indexing our metadata fields */
    private Analyzer analyzer;

    /**
     * Open an index for writing ("index mode": adding/deleting documents).
     *
     * Note that in index mode, searching operations may not take the latest
     * changes into account. It is wisest to only use index mode for indexing,
     * then close the Searcher and create a regular one for searching.
     *
     * @param indexDir the index directory
     * @param createNewIndex if true, create a new index even if one existed there
     * @return the searcher in index mode
     * @throws IOException
     */
    public static Searcher openForWriting(File indexDir, boolean createNewIndex) throws IOException {
        return new Searcher(indexDir, true, createNewIndex, (File) null);
    }

    /**
     * Open an index for writing ("index mode": adding/deleting documents).
     *
     * Note that in index mode, searching operations may not take the latest
     * changes into account. It is wisest to only use index mode for indexing,
     * then close the Searcher and create a regular one for searching.
     *
     * @param indexDir the index directory
     * @param createNewIndex if true, create a new index even if one existed there
     * @param indexTemplateFile JSON template to use for index structure / metadata
     * @return the searcher in index mode
     * @throws IOException
     */
    public static Searcher openForWriting(File indexDir, boolean createNewIndex, File indexTemplateFile)
            throws IOException {
        return new Searcher(indexDir, true, createNewIndex, indexTemplateFile);
    }

    /**
     * Create an empty index.
     *
     * @param indexDir where to create the index
     * @return a Searcher for the new index, in index mode
     * @throws IOException
     */
    public static Searcher createIndex(File indexDir) throws IOException {
        return createIndex(indexDir, null, null, false);
    }

    /**
     * Create an empty index.
     *
     * @param indexDir where to create the index
     * @param displayName the display name for the new index, or null to
     *   assign one automatically (based on the directory name)
     * @return a Searcher for the new index, in index mode
     * @throws IOException
     */
    public static Searcher createIndex(File indexDir, String displayName) throws IOException {
        return createIndex(indexDir, displayName, null, false);
    }

    /**
     * Create an empty index.
     *
     * @param indexDir where to create the index
     * @param displayName the display name for the new index, or null to
     *   assign one automatically (based on the directory name)
     * @param documentFormat a format identifier to store as the document format,
     *   or null for none. See the DocumentFormats class.
     * @param contentViewable is viewing of the document contents allowed?
     * @return a Searcher for the new index, in index mode
     * @throws IOException
     */
    public static Searcher createIndex(File indexDir, String displayName, String documentFormat,
            boolean contentViewable) throws IOException {
        Searcher rv = openForWriting(indexDir, true);
        if (displayName != null && displayName.length() > 0) {
            rv.getIndexStructure().setDisplayName(displayName);
        }
        if (documentFormat != null) {
            rv.getIndexStructure().setDocumentFormat(documentFormat);
        }
        rv.getIndexStructure().setContentViewable(contentViewable);
        rv.getIndexStructure().writeMetadata();
        return rv;
    }

    /**
     * Open an index for reading ("search mode").
     *
     * @param indexDir the index directory
     * @return the searcher
     * @throws CorruptIndexException
     * @throws IOException
     */
    public static Searcher open(File indexDir) throws CorruptIndexException, IOException {
        return new Searcher(indexDir, false, false, (File) null);
    }

    /**
     * Open an index.
     *
     * @param indexDir the index directory
     * @param indexMode if true, open in index mode; if false, open in search mode.
     * @param createNewIndex if true, delete existing index in this location if it exists.
     * @param indexTemplateFile JSON file to use as template for index structure / metadata
     *   (if creating new index)
     * @throws IOException
     */
    private Searcher(File indexDir, boolean indexMode, boolean createNewIndex, File indexTemplateFile)
            throws IOException {
        this.indexMode = indexMode;

        if (!indexMode && createNewIndex)
            throw new RuntimeException("Cannot create new index, not in index mode");

        if (!createNewIndex) {
            if (!indexMode || VersionFile.exists(indexDir)) {
                if (!isIndex(indexDir)) {
                    throw new RuntimeException(
                            "BlackLab index has wrong type or version! " + VersionFile.report(indexDir));
                }
            }
        }

        logger.debug("Constructing Searcher...");

        if (indexMode) {
            indexWriter = openIndexWriter(indexDir, createNewIndex);
            reader = DirectoryReader.open(indexWriter, false);
        } else {
            // Open Lucene index
            reader = DirectoryReader.open(FSDirectory.open(indexDir));
        }
        this.indexLocation = indexDir;

        // Determine the index structure
        indexStructure = new IndexStructure(reader, indexDir, createNewIndex, indexTemplateFile);
        isEmptyIndex = indexStructure.isNewIndex();

        createAnalyzers();

        // Detect and open the ContentStore for the contents field
        if (!createNewIndex) {
            ComplexFieldDesc mainContentsField = indexStructure.getMainContentsField();
            if (mainContentsField == null) {
                if (!indexMode) {
                    if (!isEmptyIndex)
                        throw new RuntimeException("Could not detect main contents field");

                    // Empty index. Set a default name for the contents field.
                    // Searching an empty index will fail and should not be attempted.
                    this.mainContentsFieldName = Searcher.DEFAULT_CONTENTS_FIELD_NAME;
                }
            } else {
                this.mainContentsFieldName = mainContentsField.getName();

                // See if we have a punctuation forward index. If we do,
                // default to creating concordances using that.
                if (mainContentsField.hasPunctuation()) {
                    defaultConcsType = ConcordanceType.FORWARD_INDEX;
                }
            }

            // Register content stores
            for (String cfn : indexStructure.getComplexFields()) {
                if (indexStructure.getComplexFieldDesc(cfn).hasContentStore()) {
                    File dir = new File(indexDir, "cs_" + cfn);
                    if (!dir.exists()) {
                        dir = new File(indexDir, "xml"); // OLD, should eventually be removed
                    }
                    if (dir.exists()) {
                        registerContentStore(cfn, openContentStore(dir));
                    }
                }
            }
        }

        indexSearcher = new IndexSearcher(reader);

        // Make sure large wildcard/regex expansions succeed
        BooleanQuery.setMaxClauseCount(100000);

        // Open the forward indices
        if (!createNewIndex)
            openForwardIndices();
        logger.debug("Done.");
    }

    /**
     * Is this a newly created, empty index?
     * @return true if it is, false if not
     */
    public boolean isEmpty() {
        return isEmptyIndex;
    }

    /**
     * Does the specified directory contain a BlackLab index?
     * @param indexDir the directory
     * @return true if it's a BlackLab index, false if not.
     */
    public static boolean isIndex(File indexDir) {
        try {
            if (VersionFile.exists(indexDir)) {
                VersionFile vf = VersionFile.read(indexDir);
                String version = vf.getVersion();
                if (vf.getType().equals("blacklab") && (version.equals("1") || version.equals("2")))
                    return true;
            }
            return false;
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Open an index.
     *
     * @param indexDir the index directory
     * @param indexMode if true, open in index mode; if false, open in search mode.
     * @param createNewIndex if true, delete existing index in this location if it exists.
     * @throws CorruptIndexException
     * @throws IOException
     */
    private Searcher(File indexDir, boolean indexMode, boolean createNewIndex)
            throws CorruptIndexException, IOException {
        this(indexDir, indexMode, createNewIndex, (File) null);
    }

    private void createAnalyzers() {
        Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
        fieldAnalyzers.put("fromInputFile", getAnalyzerInstance("nontokenizing"));
        Analyzer baseAnalyzer = getAnalyzerInstance(indexStructure.getDefaultAnalyzerName());
        for (String fieldName : indexStructure.getMetadataFields()) {
            MetadataFieldDesc fd = indexStructure.getMetadataFieldDesc(fieldName);
            String analyzerName = fd.getAnalyzerName();
            if (analyzerName.length() > 0 && !analyzerName.equalsIgnoreCase("DEFAULT")) {
                Analyzer fieldAnalyzer = getAnalyzerInstance(analyzerName);
                if (fieldAnalyzer == null) {
                    logger.error("Unknown analyzer name " + analyzerName + " for field " + fieldName);
                } else {
                    if (fieldAnalyzer != baseAnalyzer)
                        fieldAnalyzers.put(fieldName, fieldAnalyzer);
                }
            }
        }

        analyzer = new PerFieldAnalyzerWrapper(baseAnalyzer, fieldAnalyzers);
    }

    /**
     * Construct a Searcher object, the main search interface on a BlackLab index.
     *
     * @param indexDir
     *            the index directory
     * @throws CorruptIndexException
     * @throws IOException
     * @deprecated use Searcher.open(File)
     */
    @Deprecated
    public Searcher(File indexDir) throws CorruptIndexException, IOException {
        this(indexDir, false, false);
    }

    /**
     * Call this to roll back any changes made to the index this session.
     * Calling close() will automatically commit any changes. If you call this
     * method, then call close(), no changes will be committed.
     */
    public void rollback() {
        try {
            indexWriter.rollback();
            indexWriter = null;
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    /**
     * Finalize the Searcher object. This closes the IndexSearcher and (depending on the constructor
     * used) may also close the index reader.
     */
    public void close() {
        try {
            reader.close();
            if (indexWriter != null) {
                indexWriter.commit();
                indexWriter.close();
            }

            // See if the forward index warmup thread is running, and if so, stop it
            if (autoWarmThread != null && autoWarmThread.isAlive()) {
                autoWarmThread.interrupt();

                // Wait for a maximum of a second for the thread to close down gracefully
                int i = 0;
                while (autoWarmThread.isAlive() && i < 10) {
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        // OK
                    }
                    i++;
                }
            }

            // Close the forward indices
            for (ForwardIndex fi : forwardIndices.values()) {
                fi.close();
            }

            // Close the content accessor(s)
            // (the ContentStore, and possibly other content accessors
            // (although that feature is not used right now))
            for (ContentAccessor ca : contentAccessors.values()) {
                ca.close();
            }

        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    /**
     * Get information about the structure of the BlackLab index.
     *
     * @return the structure object
     */
    public IndexStructure getIndexStructure() {
        return indexStructure;
    }

    /**
     * Retrieve a Lucene Document object from the index.
     *
     * NOTE: you must check if the document isn't deleted using Search.isDeleted()
     * first! Lucene 4.0+ allows you to retrieve deleted documents, making you
     * responsible for checking whether documents are deleted or not.
     *
     * @param doc
     *            the document id
     * @return the Lucene Document
     * @throws RuntimeException if the document doesn't exist (use maxDoc() and isDeleted() to check first!)
     */
    public Document document(int doc) {
        try {
            if (doc < 0)
                throw new RuntimeException("Negative document id");
            if (doc >= reader.maxDoc())
                throw new RuntimeException("Document id >= maxDoc");
            return reader.document(doc);
        } catch (Exception e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    /**
     * Checks if a document has been deleted from the index
     * @param doc the document id
     * @return true iff it has been deleted
     */
    public boolean isDeleted(int doc) {
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        return liveDocs != null && !liveDocs.get(doc);
    }

    /**
     * Returns one more than the highest document id
     * @return one more than the highest document id
     */
    public int maxDoc() {
        return reader.maxDoc();
    }

    public SpanQuery filterDocuments(SpanQuery query, Filter filter) {
        return new SpanQueryFiltered(query, filter);
    }

    public SpanQuery createSpanQuery(TextPattern pattern, String fieldName, Filter filter) {
        // Convert to SpanQuery
        pattern = pattern.rewrite();
        TextPatternTranslatorSpanQuery spanQueryTranslator = new TextPatternTranslatorSpanQuery();
        SpanQuery spanQuery = pattern.translate(spanQueryTranslator, getDefaultExecutionContext(fieldName));
        if (filter != null)
            spanQuery = new SpanQueryFiltered(spanQuery, filter);
        return spanQuery;
    }

    public SpanQuery createSpanQuery(TextPattern pattern, Filter filter) {
        return createSpanQuery(pattern, mainContentsFieldName, filter);
    }

    public SpanQuery createSpanQuery(TextPattern pattern, String fieldName) {
        return createSpanQuery(pattern, fieldName, (Filter) null);
    }

    public SpanQuery createSpanQuery(TextPattern pattern) {
        return createSpanQuery(pattern, mainContentsFieldName, (Filter) null);
    }

    /**
     * Find hits for a pattern in a field.
     *
     * @param query
     *            the pattern to find
     * @param fieldNameConc
     *            field to use for concordances
     * @return the hits found
     * @throws BooleanQuery.TooManyClauses
     *             if a wildcard or regular expression term is overly broad
     */
    public Hits find(SpanQuery query, String fieldNameConc) throws BooleanQuery.TooManyClauses {
        return new Hits(this, fieldNameConc, query);
    }

    /**
     * Find hits for a pattern in a field.
     *
     * @param query
     *            the pattern to find
     * @return the hits found
     * @throws BooleanQuery.TooManyClauses
     *             if a wildcard or regular expression term is overly broad
     */
    public Hits find(SpanQuery query) throws BooleanQuery.TooManyClauses {
        return new Hits(this, mainContentsFieldName, query);
    }

    /**
     * Find hits for a pattern in a field.
     *
     * @param pattern
     *            the pattern to find
     * @param fieldName
     *            field to find pattern in
     * @param filter
     *            determines which documents to search
     *
     * @return the hits found
     * @throws BooleanQuery.TooManyClauses
     *             if a wildcard or regular expression term is overly broad
     */
    public Hits find(TextPattern pattern, String fieldName, Filter filter) throws BooleanQuery.TooManyClauses {
        return new Hits(this, fieldName, createSpanQuery(pattern, fieldName, filter));
    }

    /**
     * Find hits for a pattern and filter them.
     *
     * @param pattern
     *            the pattern to find
     * @param filter
     *            determines which documents to search
     *
     * @return the hits found
     * @throws BooleanQuery.TooManyClauses
     *             if a wildcard or regular expression term is overly broad
     */
    public Hits find(TextPattern pattern, Filter filter) throws BooleanQuery.TooManyClauses {
        return find(pattern, mainContentsFieldName, filter);
    }

    /**
     * Find hits for a pattern in a field.
     *
     * @param pattern
     *            the pattern to find
     * @param fieldName
     *            which field to find the pattern in
     *
     * @return the hits found
     * @throws BooleanQuery.TooManyClauses
     *             if a wildcard or regular expression term is overly broad
     */
    public Hits find(TextPattern pattern, String fieldName) throws BooleanQuery.TooManyClauses {
        return find(pattern, fieldName, null);
    }

    /**
     * Find hits for a pattern.
     *
     * @param pattern
     *            the pattern to find
     *
     * @return the hits found
     * @throws BooleanQuery.TooManyClauses
     *             if a wildcard or regular expression term is overly broad
     */
    public Hits find(TextPattern pattern) throws BooleanQuery.TooManyClauses {
        return find(pattern, mainContentsFieldName, null);
    }

    /**
     * Find matching documents and their scores for a pattern.
     *
     * You can pass in both a SpanQuery or a regular Query.
     *
     * @param q
     * @return object that can iterate over matching docs and provide their scores. NOTE: null can
     *         be returned if there were no matches!
     * @throws BooleanQuery.TooManyClauses
     *             if a wildcard or regular expression term is overly broad
     */
    public Scorer findDocScores(Query q) {
        try {
            Weight w = indexSearcher.createNormalizedWeight(q);
            @SuppressWarnings("resource") // Don't close SCRW because we don't want to close our reader
            AtomicReader scrw = new SlowCompositeReaderWrapper(reader);
            Scorer sc = w.scorer(scrw.getContext(), true, false, MultiFields.getLiveDocs(reader));
            return sc;
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    /**
     * Find the top-scoring documents.
     *
     * @param q
     *            the query
     *
     * @param n
     *            number of top documents to return
     * @return the documents
     */
    public TopDocs findTopDocs(Query q, int n) {
        try {
            return indexSearcher.search(q, n);
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    /**
     * Get character positions from word positions.
     *
     * Places character positions in the same arrays as the word positions were specified in.
     *
     * @param doc
     *            the document from which to find character positions
     * @param fieldName
     *            the field from which to find character positions
     * @param startsOfWords
     *            word positions for which we want starting character positions (i.e. the position
     *            of the first letter of that word)
     * @param endsOfWords
     *            word positions for which we want ending character positions (i.e. the position of
     *            the last letter of that word)
     * @param fillInDefaultsIfNotFound
     *            if true, if any illegal word positions are specified (say, past the end of the
     *            document), a sane default value is chosen (in this case, the last character of the
     *            last word found). Otherwise, throws an exception.
     */
    void getCharacterOffsets(int doc, String fieldName, int[] startsOfWords, int[] endsOfWords,
            boolean fillInDefaultsIfNotFound) {

        if (startsOfWords.length == 0)
            return; // nothing to do
        try {
            // Determine lowest and highest word position we'd like to know something about.
            // This saves a little bit of time for large result sets.
            int minP = -1, maxP = -1;
            int numStarts = startsOfWords.length;
            int numEnds = endsOfWords.length;
            for (int i = 0; i < numStarts; i++) {
                if (startsOfWords[i] < minP || minP == -1)
                    minP = startsOfWords[i];
                if (startsOfWords[i] > maxP)
                    maxP = startsOfWords[i];
            }
            for (int i = 0; i < numEnds; i++) {
                if (endsOfWords[i] < minP || minP == -1)
                    minP = endsOfWords[i];
                if (endsOfWords[i] > maxP)
                    maxP = endsOfWords[i];
            }
            if (minP < 0 || maxP < 0)
                throw new RuntimeException("Can't determine min and max positions");

            String fieldPropName = ComplexFieldUtil.mainPropertyOffsetsField(indexStructure, fieldName);

            org.apache.lucene.index.Terms terms = reader.getTermVector(doc, fieldPropName);
            if (terms == null)
                throw new RuntimeException("Field " + fieldPropName + " in doc " + doc + " has no term vector");
            if (!terms.hasPositions())
                throw new RuntimeException(
                        "Field " + fieldPropName + " in doc " + doc + " has no character postion information");

            //int lowestPos = -1, highestPos = -1;
            int lowestPosFirstChar = -1, highestPosLastChar = -1;
            int total = numStarts + numEnds;
            boolean[] done = new boolean[total]; // NOTE: array is automatically initialized to zeroes!
            int found = 0;

            // Iterate over terms
            TermsEnum termsEnum = terms.iterator(null);
            while (termsEnum.next() != null) {
                DocsAndPositionsEnum dpe = termsEnum.docsAndPositions(null, null);

                // Iterate over docs containing this term (NOTE: should be only one doc!)
                while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    int position = -1;

                    // Iterate over positions of this term in this doc
                    int positionsRead = 0;
                    int numberOfPositions = dpe.freq();
                    while (positionsRead < numberOfPositions) {
                        position = dpe.nextPosition();
                        if (position == -1)
                            break;
                        positionsRead++;

                        // Keep track of the lowest and highest char pos, so
                        // we can fill in the character positions we didn't find
                        int startOffset = dpe.startOffset();
                        if (startOffset < lowestPosFirstChar || lowestPosFirstChar == -1) {
                            lowestPosFirstChar = startOffset;
                        }
                        int endOffset = dpe.endOffset();
                        if (endOffset > highestPosLastChar) {
                            highestPosLastChar = endOffset;
                        }

                        // We've calculated the min and max word positions in advance, so
                        // we know we can skip this position if it's outside the range we're interested in.
                        // (Saves a little time for large result sets)
                        if (position < minP || position > maxP) {
                            continue;
                        }

                        for (int m = 0; m < numStarts; m++) {
                            if (!done[m] && position == startsOfWords[m]) {
                                done[m] = true;
                                startsOfWords[m] = startOffset;
                                found++;
                            }
                        }
                        for (int m = 0; m < numEnds; m++) {
                            if (!done[numStarts + m] && position == endsOfWords[m]) {
                                done[numStarts + m] = true;
                                endsOfWords[m] = endOffset;
                                found++;
                            }
                        }

                        // NOTE: we might be tempted to break here if found == total,
                        // but that would foul up our calculation of highestPosLastChar and
                        // lowestPosFirstChar.
                    }
                }

            }
            if (found < total) {
                if (!fillInDefaultsIfNotFound)
                    throw new RuntimeException("Could not find all character offsets!");

                if (lowestPosFirstChar < 0 || highestPosLastChar < 0)
                    throw new RuntimeException("Could not find default char positions!");

                for (int m = 0; m < numStarts; m++) {
                    if (!done[m])
                        startsOfWords[m] = lowestPosFirstChar;
                }
                for (int m = 0; m < numEnds; m++) {
                    if (!done[numStarts + m])
                        endsOfWords[m] = highestPosLastChar;
                }
            }

        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    /**
     * Get character positions from a list of hits.
     *
     * @param doc
     *            the document from which to find character positions
     * @param fieldName
     *            the field from which to find character positions
     * @param hits
     *            the hits for which we wish to find character positions
     * @return a list of HitSpan objects containing the character positions for the hits.
     */
    private List<HitCharSpan> getCharacterOffsets(int doc, String fieldName, Hits hits) {
        int[] starts = new int[hits.size()];
        int[] ends = new int[hits.size()];
        Iterator<Hit> hitsIt = hits.iterator();
        for (int i = 0; i < starts.length; i++) {
            Hit hit = hitsIt.next(); // hits.get(i);
            starts[i] = hit.start;
            ends[i] = hit.end - 1; // end actually points to the first word not in the hit, so
            // subtract one
        }

        getCharacterOffsets(doc, fieldName, starts, ends, true);

        List<HitCharSpan> hitspans = new ArrayList<HitCharSpan>(starts.length);
        for (int i = 0; i < starts.length; i++) {
            hitspans.add(new HitCharSpan(starts[i], ends[i]));
        }
        return hitspans;
    }

    public DocContentsFromForwardIndex getContentFromForwardIndex(int docId, String fieldName, int startAtWord,
            int endAtWord) {
        // FIXME: use fieldName
        Hit hit = new Hit(docId, startAtWord, endAtWord);
        Hits hits = new Hits(this, Arrays.asList(hit));
        Kwic kwic = hits.getKwic(hit, 0);
        return kwic.getDocContents();
    }

    /**
     * Get part of the contents of a field from a Lucene Document.
     *
     * This takes into account that some fields are stored externally in content stores
     * instead of in the Lucene index.
     *
     * @param docId
     *            the Lucene Document id
     * @param fieldName
     *            the name of the field
     * @param startAtWord where to start getting the content (-1 for start of document, 0 for first word)
     * @param endAtWord where to end getting the content (-1 for end of document)
     * @return the field content
     */
    public String getContent(int docId, String fieldName, int startAtWord, int endAtWord) {
        Document d = document(docId);
        ContentAccessor ca = contentAccessors.get(fieldName);
        if (ca == null) {
            // No special content accessor set; assume a stored field
            String content = d.get(fieldName);
            if (content == null)
                throw new RuntimeException("Field not found: " + fieldName);
            return getWordsFromString(content, startAtWord, endAtWord);
        }

        int[] startEnd = startEndWordToCharPos(docId, fieldName, startAtWord, endAtWord);
        return ca.getSubstringFromDocument(d, startEnd[0], startEnd[1]);
    }

    /**
     * Convert start/end word positions to char positions.
     *
     * @param docId     Lucene Document id
     * @param fieldName name of the field
     * @param startAtWord where to start getting the content (-1 for start of document, 0 for first word)
     * @param endAtWord where to end getting the content (-1 for end of document)
     * @return the start and end char position as a two element int array
     *   (with any -1's preserved)
     */
    private int[] startEndWordToCharPos(int docId, String fieldName, int startAtWord, int endAtWord) {
        if (startAtWord == -1 && endAtWord == -1) {
            // No need to translate anything
            return new int[] { -1, -1 };
        }

        // Translate word pos to char pos and retrieve content
        // NOTE: this boolean stuff is a bit iffy, but is necessary because
        // getCharacterOffsets doesn't handle -1 to mean start/end of doc.
        // We should probably fix that some time.
        boolean startAtStartOfDoc = startAtWord == -1;
        boolean endAtEndOfDoc = endAtWord == -1;
        int[] starts = new int[] { startAtStartOfDoc ? 0 : startAtWord };
        int[] ends = new int[] { endAtEndOfDoc ? starts[0] : endAtWord };
        getCharacterOffsets(docId, fieldName, starts, ends, true);
        if (startAtStartOfDoc)
            starts[0] = -1;
        if (endAtEndOfDoc)
            ends[0] = -1;
        int[] startEnd = new int[] { starts[0], ends[0] };
        return startEnd;
    }

    /**
     * Get part of the contents of a field from a Lucene Document.
     *
     * This takes into account that some fields are stored externally in content stores
     * instead of in the Lucene index.
     *
     * @param docId
     *            the Lucene Document id
     * @param fieldName
     *            the name of the field
     * @param startAtChar where to start getting the content (-1 for start of document, 0 for first char)
     * @param endAtChar where to end getting the content (-1 for end of document)
     * @return the field content
     */
    public String getContentByCharPos(int docId, String fieldName, int startAtChar, int endAtChar) {
        Document d = document(docId);
        ContentAccessor ca = contentAccessors.get(fieldName);
        if (ca == null) {
            // No special content accessor set; assume a stored field
            return d.get(fieldName).substring(startAtChar, endAtChar);
        }

        return ca.getSubstringFromDocument(d, startAtChar, endAtChar);
    }

    /**
     * Cut a few words from a string.
     *
     * Note, this just splits on whitespace and glues words
     * back with space. Might not work very well in all cases,
     * but it's not likely to be used anyway (we generally don't
     * cut a few words from a metadata field).
     *
     * @param content the string to cut from
     * @param startAtWord first word to include
     * @param endAtWord first word not to include
     * @return the cut string
     */
    private String getWordsFromString(String content, int startAtWord, int endAtWord) {
        if (startAtWord == -1 && endAtWord == -1)
            return content;
        // We want specific words from the field; quick-n-dirty way to do this
        // (will probably never be used, but let's try to be generic)
        String[] words = content.split("\\s+");
        if (startAtWord == -1)
            startAtWord = 0;
        if (endAtWord == -1)
            endAtWord = words.length;
        StringBuilder b = new StringBuilder();
        for (int i = startAtWord; i < endAtWord; i++) {
            if (b.length() > 0)
                b.append(" ");
            b.append(words[i]);
        }
        return b.toString();
    }

    /**
     * Get the contents of a field from a Lucene Document.
     *
     * This takes into account that some fields are stored externally in content stores
     * instead of in the Lucene index.
     *
     * @param d
     *            the Document
     * @param fieldName
     *            the name of the field
     * @return the field content
     * @deprecated use version that takes a docId
     */
    @Deprecated
    public String getContent(Document d, String fieldName) {
        ContentAccessor ca = contentAccessors.get(fieldName);
        String content;
        if (ca == null) {
            // No special content accessor set; assume a stored field
            content = d.get(fieldName);
        } else {
            // Content accessor set. Use it to retrieve the content.
            content = ca.getSubstringFromDocument(d, -1, -1);
        }
        return content;
    }

    /**
     * Get the document contents (original XML).
     *
     * @param d
     *            the Document
     * @return the field content
     * @deprecated use version that takes a docId
     */
    @Deprecated
    public String getContent(Document d) {
        return getContent(d, mainContentsFieldName);
    }

    /**
     * Get the contents of a field from a Lucene Document.
     *
     * This takes into account that some fields are stored externally in content stores
     * instead of in the Lucene index.
     *
     * @param docId
     *            the Document id
     * @param fieldName
     *            the name of the field
     * @return the field content
     */
    public String getContent(int docId, String fieldName) {
        return getContent(docId, fieldName, -1, -1);
    }

    /**
     * Get the document contents (original XML).
     *
     * @param docId
     *            the Document id
     * @return the field content
     */
    public String getContent(int docId) {
        return getContent(docId, mainContentsFieldName, -1, -1);
    }

    /**
     * Get the Lucene index reader we're using.
     *
     * @return the Lucene index reader
     */
    public DirectoryReader getIndexReader() {
        return reader;
    }

    /**
     * Get a number of substrings from a certain field in a certain document.
     *
     * For larger documents, this is faster than retrieving the whole content first and then cutting
     * substrings from that.
     *
     * @param d
     *            the document
     * @param fieldName
     *            the field
     * @param starts
     *            start positions of the substring we want
     * @param ends
     *            end positions of the substring we want; correspond to the starts array.
     * @return the substrings
     */
    private String[] getSubstringsFromDocument(Document d, String fieldName, int[] starts, int[] ends) {
        ContentAccessor ca = contentAccessors.get(fieldName);
        String[] content;
        if (ca == null) {
            // No special content accessor set; assume a non-complex stored field
            // TODO: check with index structure?
            String luceneName = fieldName; // <- non-complex, so this works
            String fieldContent = d.get(luceneName);
            content = new String[starts.length];
            for (int i = 0; i < starts.length; i++) {
                content[i] = fieldContent.substring(starts[i], ends[i]);
            }
        } else {
            // Content accessor set. Use it to retrieve the content.
            content = ca.getSubstringsFromDocument(d, starts, ends);
        }
        return content;
    }

    /**
     * Highlight part of field content with the specified hits,
     * and make sure it's well-formed.
     *
     * Uses &lt;hl&gt;&lt;/hl&gt; tags to highlight the content.
     *
     * @param docId
     *            document to highlight a field from
     * @param fieldName
     *            field to highlight
     * @param hits
     *            the hits
     * @param startAtWord where to start highlighting (first word returned)
     * @param endAtWord where to end highlighting (first word not returned)
     * @return the highlighted content
     */
    public String highlightContent(int docId, String fieldName, Hits hits, int startAtWord, int endAtWord) {
        // Get the field content
        int endAtWordForCharPos = endAtWord < 0 ? endAtWord : endAtWord - 1; // if whole content, don't subtract one
        int[] startEndCharPos = startEndWordToCharPos(docId, fieldName, startAtWord, endAtWordForCharPos);
        int startAtChar = startEndCharPos[0];
        int endAtChar = startEndCharPos[1];
        String content = getContentByCharPos(docId, fieldName, startAtChar, endAtChar);

        if (hits == null && startAtWord == -1 && endAtWord == -1) {
            // No hits to highlight, and we've fetched the whole document, so it is
            // well-formed already. Just return as-is.
            return content;
        }

        // Find the character offsets for the hits and highlight
        List<HitCharSpan> hitspans = null;
        if (hits != null) // if hits == null, we still want the highlighter to make it well-formed
            hitspans = getCharacterOffsets(docId, fieldName, hits);
        XmlHighlighter hl = new XmlHighlighter();
        hl.setUnbalancedTagsStrategy(defaultUnbalancedTagsStrategy);
        if (startAtChar == -1)
            startAtChar = 0;
        return hl.highlight(content, hitspans, startAtChar);
    }

    /**
     * Highlight field content with the specified hits.
     *
     * Uses &lt;hl&gt;&lt;/hl&gt; tags to highlight the content.
     *
     * @param docId
     *            document to highlight a field from
     * @param fieldName
     *            field to highlight
     * @param hits
     *            the hits
     * @return the highlighted content
     */
    public String highlightContent(int docId, String fieldName, Hits hits) {
        return highlightContent(docId, fieldName, hits, -1, -1);
    }

    /**
     * Highlight field content with the specified hits.
     *
     * Uses &lt;hl&gt;&lt;/hl&gt; tags to highlight the content.
     *
     * @param docId
     *            document to highlight a field from
     * @param hits
     *            the hits
     * @return the highlighted content
     */
    public String highlightContent(int docId, Hits hits) {
        return highlightContent(docId, mainContentsFieldName, hits, -1, -1);
    }

    /**
     * Register a content accessor.
     *
     * This tells the Searcher how the content of different fields may be accessed. This is used for
     * making concordances, for example. Some fields are stored in the Lucene index, while others
     * may be stored on the file system, a database, etc.
     *
     * @param contentAccessor
     */
    private void registerContentAccessor(ContentAccessor contentAccessor) {
        contentAccessors.put(contentAccessor.getFieldName(), contentAccessor);
    }

    /**
     * Get the content store for a field name.
     *
     * @param fieldName the field name
     * @return the content store, or null if there is no content store for this field
     */
    public ContentStore getContentStore(String fieldName) {
        ContentAccessor ca = contentAccessors.get(fieldName);
        if (indexMode && ca == null) {
            // Index mode. Create new content store.
            ContentStore contentStore = new ContentStoreDirZip(new File(indexLocation, "cs_" + fieldName),
                    isEmptyIndex);
            registerContentStore(fieldName, contentStore);
            return contentStore;
        }
        if (ca instanceof ContentAccessorContentStore) {
            return ((ContentAccessorContentStore) ca).getContentStore();
        }
        return null;
    }

    /**
     * Register a ContentStore as a content accessor.
     *
     * This tells the Searcher how the content of different fields may be accessed. This is used for
     * making concordances, for example. Some fields are stored in the Lucene index, while others
     * may be stored on the file system, a database, etc.
     *
     * A ContentStore is a filesystem-based way to access the contents.
     *
     * @param fieldName
     *            the field for which this is the content accessor
     * @param contentStore
     *            the ContentStore object by which to access the content
     *
     */
    private void registerContentStore(String fieldName, ContentStore contentStore) {
        registerContentAccessor(new ContentAccessorContentStore(fieldName, contentStore));
    }

    /**
     * Test if a term occurs in the index
     *
     * @param term
     *            the term
     * @return true iff it occurs in the index
     * @deprecated moved to LuceneUtil
     */
    @Deprecated
    public boolean termOccursInIndex(Term term) {
        return LuceneUtil.termOccursInIndex(reader, term);
    }

    /**
     * Set the collator used for sorting.
     *
     * The default collator is for English.
     *
     * @param collator
     *            the collator
     */
    public void setCollator(Collator collator) {
        this.collator = collator;
    }

    /**
     * Get the collator being used for sorting.
     *
     * @return the collator
     */
    public Collator getCollator() {
        return collator;
    }

    /**
     * Opens all the forward indices, to avoid this delay later.
     *
     * NOTE: used to be public; now private because it's done automatically when
     * constructing the Searcher.
     */
    private void openForwardIndices() {
        for (String field : indexStructure.getComplexFields()) {
            ComplexFieldDesc fieldDesc = indexStructure.getComplexFieldDesc(field);
            for (String property : fieldDesc.getProperties()) {
                PropertyDesc propDesc = fieldDesc.getPropertyDesc(property);
                if (propDesc.hasForwardIndex()) {
                    // This property has a forward index. Make sure it is open.
                    getForwardIndex(ComplexFieldUtil.propertyField(field, property));
                }
            }
        }

        if (!indexMode || autoWarmForwardIndices) {
            final boolean callWarmup = autoWarmForwardIndices;
            // Start a background thread to build term indices and/or
            // warm up the forward indices
            autoWarmThread = new Thread(new Runnable() {
                @Override
                public void run() {
                    try {
                        buildAllTermIndices(); // speed up first call to Terms.indexOf()
                        if (callWarmup)
                            warmUpForwardIndices(); // speed up all forward index operations
                    } catch (InterruptedException e) {
                        // OK, just quit
                    }
                }
            });
            autoWarmThread.start();
        }
    }

    /**
     * "Warm up" the forward indices by performing a large number of reads on them,
     * getting them into disk cache.
     *
     * Not that this is done automatically in a background thread at startup, so you
     * shouldn't need to call this unless you've specifically switched this behaviour off.
     * @throws InterruptedException if the thread was interrupted during this operation
     * @deprecated use the external tool vmtouch, described here:
     *   https://github.com/INL/BlackLab/wiki/Improve-search-speed-using-the-disk-cache
     */
    @Deprecated
    public void warmUpForwardIndices() throws InterruptedException {
        logger.debug("Warming up " + forwardIndices.size() + " forward indices...");
        for (Map.Entry<String, ForwardIndex> e : forwardIndices.entrySet()) {
            e.getValue().warmUp();
            logger.debug("Forward index " + e.getKey() + " warmed up.");
        }
    }

    /**
     * Builds index for Terms.indexOf() method.
     *
     * This makes sure the first call to Terms.indexOf() in search mode will be fast.
     * Subsequent calls are always fast. (Terms.indexOf() is only used in search mode
     * by HitPropValue.deserialize(), so if you're not sure if you need to call this
     * method in your application, you probably don't.
     *
     * @deprecated called automatically now in search mode, no need to call it manually. This
     *   method will be made private eventually.
     */
    @Deprecated
    public void buildAllTermIndices() {
        for (Map.Entry<String, ForwardIndex> e : forwardIndices.entrySet()) {
            e.getValue().getTerms().buildTermIndex();
        }
    }

    /**
     * Tries to get the ForwardIndex object for the specified fieldname.
     *
     * Looks for an already-opened forward index first. If none is found, and if we're in
     * "create index" mode, may create a new forward index. Otherwise, looks for an existing forward
     * index and opens that.
     *
     * @param fieldPropName
     *            the field for which we want the forward index
     * @return the ForwardIndex if found/created, or null otherwise
     */
    public ForwardIndex getForwardIndex(String fieldPropName) {
        ForwardIndex forwardIndex = forwardIndices.get(fieldPropName);
        if (forwardIndex == null) {
            File dir = new File(indexLocation, "fi_" + fieldPropName);

            // Special case for old BL index with "forward" as the name of the single forward index
            // (this should be removed eventually)
            if (!isEmptyIndex && fieldPropName.equals(mainContentsFieldName) && !dir.exists()) {
                // Default forward index used to be called "forward". Look for that instead.
                File alt = new File(indexLocation, "forward");
                if (alt.exists())
                    dir = alt;
            }

            if (!isEmptyIndex && !dir.exists()) {
                // Forward index doesn't exist
                return null;
            }
            // Open forward index
            forwardIndex = ForwardIndex.open(dir, indexMode, collator, isEmptyIndex);
            forwardIndex.setIdTranslateInfo(reader, fieldPropName); // how to translate from
            // Lucene
            // doc to fiid
            forwardIndices.put(fieldPropName, forwardIndex);
        }
        return forwardIndex;
    }

    /**
     * Determine the concordance strings for a number of concordances, given the relevant character
     * positions.
     *
     * Every concordance requires four character positions: concordance start and end, and hit start
     * and end. Visualising it ('fox' is the hit word):
     *
     * [A] the quick brown [B] fox [C] jumps over the [D]
     *
     * The startsOfWords array contains the [A] and [B] positions for each concordance. The
     * endsOfWords array contains the [C] and [D] positions for each concordance.
     *
     * @param doc
     *            the Lucene document number
     * @param fieldName
     *            name of the field
     * @param startsOfWords
     *            the array of starts of words ([A] and [B] positions)
     * @param endsOfWords
     *            the array of ends of words ([C] and [D] positions)
     * @param hl
     * @return the list of concordances
     */
    List<Concordance> makeConcordancesFromContentStore(int doc, String fieldName, int[] startsOfWords,
            int[] endsOfWords, XmlHighlighter hl) {
        // Determine starts and ends
        int n = startsOfWords.length / 2;
        int[] starts = new int[n];
        int[] ends = new int[n];
        for (int i = 0, j = 0; i < startsOfWords.length; i += 2, j++) {
            starts[j] = startsOfWords[i];
            ends[j] = endsOfWords[i + 1];
        }

        // Retrieve 'em all
        Document d = document(doc);
        String[] content = getSubstringsFromDocument(d, fieldName, starts, ends);

        // Cut 'em up
        List<Concordance> rv = new ArrayList<Concordance>();
        for (int i = 0, j = 0; i < startsOfWords.length; i += 2, j++) {
            // Put the concordance in the Hit object
            int absLeft = startsOfWords[i];
            int absRight = endsOfWords[i + 1];
            int relHitLeft = startsOfWords[i + 1] - absLeft;
            int relHitRight = endsOfWords[i] - absLeft;
            String currentContent = content[j];

            // Determine context and build concordance.
            // Note that hit text may be empty for hits of length zero,
            // such as a search for open tags (which have a location but zero length,
            // like a search for a word has a length 1)
            String hitText = relHitRight < relHitLeft ? "" : currentContent.substring(relHitLeft, relHitRight);
            String leftContext = currentContent.substring(0, relHitLeft);
            String rightContext = currentContent.substring(relHitRight, absRight - absLeft);

            // Make each fragment well-formed
            hitText = hl.makeWellFormed(hitText);
            leftContext = hl.makeWellFormed(leftContext);
            rightContext = hl.makeWellFormed(rightContext);

            rv.add(new Concordance(new String[] { leftContext, hitText, rightContext }));
        }
        return rv;
    }

    /**
     * Indicate how to use the forward indices to build concordances.
     *
     * Call this method to set the default for hit sets; call the method in Hits
     * to change it for a single hit set.
     *
     * @param wordFI FI to use as the text content of the &lt;w/&gt; tags (default "word"; null for no text content)
     * @param punctFI FI to use as the text content between &lt;w/&gt; tags (default "punct"; null for just a space)
     * @param attrFI FIs to use as the attributes of the &lt;w/&gt; tags (null for all other FIs)
     * @deprecated renamed to setConcordanceXmlProperties
     */
    @Deprecated
    public void setForwardIndexConcordanceParameters(String wordFI, String punctFI, Collection<String> attrFI) {
        setConcordanceXmlProperties(wordFI, punctFI, attrFI);
    }

    /**
     * Indicate how to use the forward indices to build concordances.
     *
     * Only applies if you're building concordances from the forward index.
     *
     * Call this method to set the default for hit sets; call the method in Hits
     * to change it for a single hit set.
     *
     * @param wordFI FI to use as the text content of the &lt;w/&gt; tags (default "word"; null for no text content)
     * @param punctFI FI to use as the text content between &lt;w/&gt; tags (default "punct"; null for just a space)
     * @param attrFI FIs to use as the attributes of the &lt;w/&gt; tags (null for all other FIs)
     */
    public void setConcordanceXmlProperties(String wordFI, String punctFI, Collection<String> attrFI) {
        concWordFI = wordFI;
        concPunctFI = punctFI;
        concAttrFI = attrFI;
    }

    /**
     * Get the default context size used for building concordances
     *
     * @return the context size
     */
    public int getDefaultContextSize() {
        return defaultContextSize;
    }

    /**
     * Set the default context size to use for building concordances
     *
     * @param defaultContextSize
     *            the context size
     */
    public void setDefaultContextSize(int defaultContextSize) {
        this.defaultContextSize = defaultContextSize;
    }

    /**
     * Factory method to create a directory content store.
     *
     * @param indexXmlDir
     *            the content store directory
     * @param create if true, create a new content store even if one exists
     * @return the content store
     * @deprecated renamed to openContentStore()
     */
    @Deprecated
    public ContentStore getContentStoreDir(File indexXmlDir, boolean create) {
        return openContentStore(indexXmlDir, create);
    }

    /**
     * Factory method to create a directory content store.
     *
     * @param indexXmlDir
     *            the content store directory
     * @param create if true, create a new content store even if one exists
     * @return the content store
     */
    public ContentStore openContentStore(File indexXmlDir, boolean create) {
        String type;
        if (create)
            type = "utf8zip";
        else {
            VersionFile vf = ContentStoreDirAbstract.getStoreTypeVersion(indexXmlDir);
            type = vf.getType();
        }
        if (type.equals("utf8zip"))
            return new ContentStoreDirZip(indexXmlDir, create);
        if (type.equals("utf8"))
            return new ContentStoreDirUtf8(indexXmlDir, create);
        if (type.equals("utf16"))
            return new ContentStoreDir(indexXmlDir, create);
        throw new RuntimeException("Unknown content store type " + type);
    }

    /**
     * Factory method to create a directory content store.
     *
     * @param indexXmlDir
     *            the content store directory
     * @return the content store
     */
    public ContentStore openContentStore(File indexXmlDir) {
        return openContentStore(indexXmlDir, false);
    }

    /**
     * Get the Terms object for the specified field.
     *
     * The Terms object is part of the ForwardIndex module and provides a mapping from term id to
     * term String, and between term id and term sort position. It is used while sorting and
     * grouping hits (by mapping the context term ids to term sort position ids), and later used to
     * display the group name (by mapping the sort position ids back to Strings)
     *
     * @param fieldPropName
     *            the field for which we want the Terms object
     * @return the Terms object
     * @throws RuntimeException
     *             if this field does not have a forward index, and hence no Terms object.
     */
    public Terms getTerms(String fieldPropName) {
        ForwardIndex forwardIndex = getForwardIndex(fieldPropName);
        if (forwardIndex == null) {
            throw new RuntimeException("Field " + fieldPropName + " has no forward index!");
        }
        return forwardIndex.getTerms();
    }

    /**
     * Get the Terms object for the main contents field.
     *
     * The Terms object is part of the ForwardIndex module and provides a mapping from term id to
     * term String, and between term id and term sort position. It is used while sorting and
     * grouping hits (by mapping the context term ids to term sort position ids), and later used to
     * display the group name (by mapping the sort position ids back to Strings)
     *
     * @return the Terms object
     * @throws RuntimeException
     *             if this field does not have a forward index, and hence no Terms object.
     */
    public Terms getTerms() {
        return getTerms(ComplexFieldUtil.mainPropertyField(indexStructure, mainContentsFieldName));
    }

    public String getContentsFieldMainPropName() {
        return mainContentsFieldName;
    }

    public boolean isDefaultSearchCaseSensitive() {
        return defaultCaseSensitive;
    }

    public boolean isDefaultSearchDiacriticsSensitive() {
        return defaultDiacriticsSensitive;
    }

    public void setDefaultSearchSensitive(boolean b) {
        defaultCaseSensitive = defaultDiacriticsSensitive = b;
    }

    public void setDefaultSearchSensitive(boolean caseSensitive, boolean diacriticsSensitive) {
        defaultCaseSensitive = caseSensitive;
        defaultDiacriticsSensitive = diacriticsSensitive;
    }

    /**
     * Get the default initial query execution context.
     *
     * @param fieldName
     *            the field to search
     * @return the query execution context
     */
    public QueryExecutionContext getDefaultExecutionContext(String fieldName) {
        ComplexFieldDesc complexFieldDesc = indexStructure.getComplexFieldDesc(fieldName);
        if (complexFieldDesc == null)
            throw new RuntimeException("Unknown complex field " + fieldName);
        PropertyDesc mainProperty = complexFieldDesc.getMainProperty();
        if (mainProperty == null)
            throw new RuntimeException("Main property not found for " + fieldName);
        String mainPropName = mainProperty.getName();
        return new QueryExecutionContext(this, fieldName, mainPropName, defaultCaseSensitive,
                defaultDiacriticsSensitive);
    }

    /**
     * Get the default initial query execution context.
     *
     * Uses the default contents field.
     *
     * @return the query execution context
     */
    public QueryExecutionContext getDefaultExecutionContext() {
        return getDefaultExecutionContext(mainContentsFieldName);
    }

    public String getIndexName() {
        return indexLocation.toString();
    }

    public IndexWriter openIndexWriter(File indexDir, boolean create)
            throws IOException, CorruptIndexException, LockObtainFailedException {
        if (!indexDir.exists() && create) {
            indexDir.mkdir();
        }
        Directory indexLuceneDir = FSDirectory.open(indexDir);
        BLDutchAnalyzer defaultAnalyzer = new BLDutchAnalyzer(); // NOTE: not actually used, we override this
        IndexWriterConfig config = Utilities.getIndexWriterConfig(defaultAnalyzer, create);
        IndexWriter writer = new IndexWriter(indexLuceneDir, config);

        if (create)
            VersionFile.write(indexDir, "blacklab", "2");
        else {
            if (!isIndex(indexDir)) {
                throw new RuntimeException(
                        "BlackLab index has wrong type or version! " + VersionFile.report(indexDir));
            }
        }

        return writer;
    }

    public static Collator getDefaultCollator() {
        return defaultCollator;
    }

    public static void setDefaultCollator(Collator defaultCollator) {
        Searcher.defaultCollator = defaultCollator;
    }

    /** Set whether or not to automatically warm up the forward indices in a background thread in Searcher constructor
     * @param b if true, automatically warm up forward indices in Searcher constructor */
    public static void setAutoWarmForwardIndices(boolean b) {
        autoWarmForwardIndices = b;
    }

    public IndexWriter getWriter() {
        return indexWriter;
    }

    public File getIndexDirectory() {
        return indexLocation;
    }

    /** Deletes documents matching a query from the BlackLab index.
     *
     * This deletes the documents from the Lucene index, the forward indices and the content store(s).
     * @param q the query
     */
    public void delete(Query q) {
        if (!indexMode)
            throw new RuntimeException("Cannot delete documents, not in index mode");
        try {
            // Open a fresh reader to execute the query
            DirectoryReader reader = DirectoryReader.open(indexWriter, false);
            try {
                // Execute the query, iterate over the docs and delete from FI and CS.
                IndexSearcher s = new IndexSearcher(reader);
                Weight w = s.createNormalizedWeight(q);
                AtomicReader scrw = new SlowCompositeReaderWrapper(reader);
                try {
                    Scorer sc = w.scorer(scrw.getContext(), true, false, MultiFields.getLiveDocs(reader));
                    if (sc == null)
                        return; // no matching documents

                    // Iterate over matching docs
                    while (true) {
                        int docId;
                        try {
                            docId = sc.nextDoc();
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        }
                        if (docId == DocIdSetIterator.NO_MORE_DOCS)
                            break;
                        Document d = reader.document(docId);

                        // Delete this document in all forward indices
                        for (Map.Entry<String, ForwardIndex> e : forwardIndices.entrySet()) {
                            String fieldName = e.getKey();
                            ForwardIndex fi = e.getValue();
                            int fiid = Integer.parseInt(d.get(ComplexFieldUtil.forwardIndexIdField(fieldName)));
                            fi.deleteDocument(fiid);
                        }

                        // Delete this document in all content stores
                        for (Map.Entry<String, ContentAccessor> e : contentAccessors.entrySet()) {
                            String fieldName = e.getKey();
                            ContentAccessor ca = e.getValue();
                            if (!(ca instanceof ContentAccessorContentStore))
                                continue; // can only delete from content store
                            ContentStore cs = ((ContentAccessorContentStore) ca).getContentStore();
                            int cid = Integer.parseInt(d.get(ComplexFieldUtil.contentIdField((fieldName))));
                            cs.delete(cid);
                        }
                    }
                } finally {
                    scrw.close();
                }
            } finally {
                reader.close();
            }

            // Finally, delete the documents from the Lucene index
            indexWriter.deleteDocuments(q);

        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Get the analyzer for indexing and searching.
     * @return the analyzer
     */
    public Analyzer getAnalyzer() {
        return analyzer;
    }

    /**
     * Get the analyzer to use for indexing.
     * (strips things like wildcards, etc.)
     * @return the analyzer
     * @deprecated use getAnalyzer() (we can use the same analyzer for indexing and searching after all because wildcard queries are never analyzed)
     */
    @Deprecated
    public Analyzer getIndexAnalyzer() {
        return getAnalyzer();
    }

    /**
     * Get the analyzer to use for parsing document filters while searching.
     * (leaves wildcards alone)
     * @return the analyzer
     * @deprecated use getAnalyzer() (we can use the same analyzer for indexing and searching after all because wildcard queries are never analyzed)
     */
    @Deprecated
    public Analyzer getSearchAnalyzer() {
        return getAnalyzer();
    }

    /**
     * Perform a document query only (no hits)
     * @param documentFilterQuery the document-level query
     * @return the matching documents
     */
    @SuppressWarnings("deprecation") // DocResults constructor will be made package-private eventually
    public DocResults queryDocuments(Query documentFilterQuery) {
        return new DocResults(this, documentFilterQuery);
    }

    /**
     * Determine the term frequencies in a set of documents (defined by the filter query)
     *
     * @param documentFilterQuery what set of documents to get the term frequencies for
     * @param fieldName complex field name, i.e. contents
     * @param propName property name, i.e. word, lemma, pos, etc.
     * @param altName alternative name, i.e. s, i (case-sensitivity)
     * @return the term frequency map
     */
    public Map<String, Integer> termFrequencies(Query documentFilterQuery, String fieldName, String propName,
            String altName) {
        try {
            String luceneField = ComplexFieldUtil.propertyField(fieldName, propName, altName);
            Weight weight = indexSearcher.createNormalizedWeight(documentFilterQuery);
            Map<String, Integer> freq = new HashMap<String, Integer>();
            for (AtomicReaderContext arc : reader.leaves()) {
                if (weight == null)
                    throw new RuntimeException("weight == null");
                if (arc == null)
                    throw new RuntimeException("arc == null");
                if (arc.reader() == null)
                    throw new RuntimeException("arc.reader() == null");
                Scorer scorer = weight.scorer(arc, true, false, arc.reader().getLiveDocs());
                if (scorer != null) {
                    while (scorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        LuceneUtil.getFrequenciesFromTermVector(reader, scorer.docID() + arc.docBase, luceneField,
                                freq);
                    }
                }
            }
            return freq;
        } catch (IOException e) {
            throw ExUtil.wrapRuntimeException(e);
        }
    }

    /**
     * Perform a document query and collect the results through a Collector.
     * @param query query to execute
     * @param collector object that receives each document hit
     */
    public void collectDocuments(Query query, Collector collector) {
        try {
            indexSearcher.search(query, collector);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Return the list of terms that occur in a field.
     *
     * @param fieldName the field
     * @param maxResults maximum number to return (or -1 for no limit)
     * @return the matching terms
     */
    public List<String> getFieldTerms(String fieldName, int maxResults) {
        try {
            AtomicReader srw = new SlowCompositeReaderWrapper(reader);
            return LuceneUtil.getFieldTerms(srw, fieldName, maxResults);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Return a timestamp for when BlackLab was built.
     *
     * @return build timestamp (format: yyyy-MM-dd HH:mm:ss), or UNKNOWN if
     *   the timestamp could not be found for some reason (i.e. not running from a
     *   JAR, or JAR was not created with the Ant buildscript).
     */
    public static String getBlackLabBuildTime() {
        try {
            URL res = Searcher.class.getResource(Searcher.class.getSimpleName() + ".class");
            URLConnection conn = res.openConnection();
            if (!(conn instanceof JarURLConnection)) {
                // Not running from a JAR, no manifest to read
                return "UNKNOWN";
            }
            JarURLConnection jarConn = (JarURLConnection) res.openConnection();
            Manifest mf = jarConn.getManifest();
            Attributes atts = mf.getMainAttributes();
            String value = atts.getValue("Build-Date");
            return value == null ? "UNKNOWN" : value;
        } catch (IOException e) {
            throw new RuntimeException("Could not read build date from manifest", e);
        }
    }

    /**
     * Instantiate analyzer based on an analyzer alias.
     *
     * @param analyzerName the classname, optionally preceded by the package name
     * @return the analyzer, or null if the name wasn't recognized
     */
    static Analyzer getAnalyzerInstance(String analyzerName) {
        analyzerName = analyzerName.toLowerCase();
        if (analyzerName.equals("whitespace")) {
            return whitespaceAnalyzer;
        } else if (analyzerName.equals("default")) {
            return defaultAnalyzer;
        } else if (analyzerName.equals("standard")) {
            return standardAnalyzer;
        } else if (analyzerName.matches("(non|un)tokeniz(ing|ed)")) {
            return nonTokenizingAnalyzer;
        }
        return null;
    }

    public String getMainContentsFieldName() {
        return mainContentsFieldName;
    }

    public String getConcWordFI() {
        return concWordFI;
    }

    public String getConcPunctFI() {
        return concPunctFI;
    }

    public Collection<String> getConcAttrFI() {
        return concAttrFI;
    }

    public Map<String, ForwardIndex> getForwardIndices() {
        return forwardIndices;
    }

    public IndexSearcher getIndexSearcher() {
        return indexSearcher;
    }

}