com.bigdata.search.FullTextIndex.java Source code

Introduction

Here is the source code for com.bigdata.search.FullTextIndex.java

Source

/*
    
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.
    
Contact:
 SYSTAP, LLC DBA Blazegraph
 2501 Calvert ST NW #106
 Washington, DC 20008
 licenses@blazegraph.com
    
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    
*/
/*
 * Created on Jan 23, 2008
 */

package com.bigdata.search;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexTypeEnum;
import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.StrengthEnum;
import com.bigdata.btree.raba.codec.EmptyRabaValueCoder;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.IResourceLock;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.rdf.lexicon.ITextIndexer.FullTextQuery;
import com.bigdata.relation.AbstractRelation;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.concurrent.ExecutionHelper;

/**
 * Full text indexing and search support.
 * <p>
 * The basic data model consists of documents, fields in documents, and tokens
 * extracted by an analyzer from those fields.
 * <p>
 * The frequency distributions may be normalized to account for a variety of
 * effects producing "term weights". For example, normalizing for document
 * length or relative frequency of a term in the overall collection. Therefore
 * the logical model is:
 * 
 * <pre>
 * 
 *             token : {docId, freq?, weight?}+
 * 
 * </pre>
 * 
 * (For RDF, docId is the term identifier as assigned by the term:id index.)
 * <p>
 * The freq and weight are optional values that are representative of the kinds
 * of statistical data that are kept on a per-token-document basis. The freq is
 * the token frequency (the frequency of occurrence of the token in the
 * document). The weight is generally a normalized token frequency weight for
 * the token in that document in the context of the overall collection.
 * <p>
 * In fact, we actually represent the data as follows:
 * 
 * <pre>
 * 
 *             {sortKey(token), weight, docId, fldId} : {freq?, sorted(pos)+}
 * 
 * </pre>
 * 
 * That is, there is a distinct entry in the full text B+Tree for each field in
 * each document in which a given token was recognized. The text of the token is
 * not stored in the key, just the Unicode sort key generated from the token
 * text. The value associated with the B+Tree entry is optional - it is simply
 * not used unless we are storing statistics for the token-document pair. The
 * advantages of this approach are: (a) it reuses the existing B+Tree data
 * structures efficiently; (b) we are never faced with the possibility of overflow
 * when a token is used in a large number of documents. The entries for the
 * token will simply be spread across several leaves in the B+Tree; (c) leading
 * key compression makes the resulting B+Tree very efficient; and (d) in a
 * scale-out range partitioned index we can load balance the resulting index
 * partitions by choosing the partition based on an even token boundary.
 * <p>
 * A field is any pre-identified text container within a document. Field
 * identifiers are integers, so there are <code>2^32</code> distinct possible
 * field identifiers. It is possible to manage the field identifiers through a
 * secondary index, but that has no direct bearing on the structure of the full
 * text index itself. Field identifiers appear after the token in the key so that
 * queries may be expressed that will be matched against any field in the
 * document. Likewise, field identifiers occur before the document identifier in
 * the key since we always search across documents (in a search key, the
 * document identifier is always {@link Long#MIN_VALUE} and the field identifier
 * is always {@link Integer#MIN_VALUE}). There are many applications for fields:
 * for example, distinct fields may be used for the title, abstract, and full
 * text of a document or for the CDATA section of each distinct element in
 * documents corresponding to some DTD. The application is responsible for
 * recognizing the fields in the document and producing the appropriate token
 * stream, each of which must be tagged by the field.
 * <p>
 * A query is tokenized, producing a (possibly normalized) token-frequency
 * vector. The relevance of documents to the query is generally taken as the
 * cosine between the query's and each document's (possibly normalized)
 * token-frequency vectors. The main effort of search is assembling a token
 * frequency vector for just those documents with which there is an overlap with
 * the query. This is done using a key range scan for each token in the query
 * against the full text index.
 * 
 * <pre>
 *             fromKey := token, Long.MIN_VALUE
 *             toKey   := successor(token), Long.MIN_VALUE
 * </pre>
 * 
 * and extracting the appropriate token frequency, normalized token weight, or
 * other statistic. When no value is associated with the entry we follow the
 * convention of assuming a token frequency of ONE (1) for each document in
 * which the token appears.
 * <p>
 * Tokenization is informed by the language code (when declared) and by the
 * configured {@link Locale} for the database otherwise. An appropriate
 * {@link Analyzer} is chosen based on the language code or {@link Locale} and
 * the "document" is broken into a token-frequency distribution (alternatively a
 * set of tokens). The same process is used to tokenize queries, and the API
 * allows the caller to specify the language code used to select the
 * {@link Analyzer} to tokenize the query.
 * <p>
 * Once the tokens are formed the language code / {@link Locale} used to produce
 * the token is discarded (it is not represented in the index). The reason for
 * this is that we never utilize the total ordering of the full text index,
 * merely the manner in which it groups tokens that map onto the same Unicode
 * sort key together. Further, we use only a single Unicode collator
 * configuration regardless of the language family in which the token was
 * originally expressed. Unlike the collator used by the terms index (which
 * often is set at IDENTICAL strength), the collator used by the full text
 * index should be chosen such that it makes relatively few distinctions in
 * order to increase recall (e.g., set at PRIMARY strength). Since a total order
 * over the full text index is not critical from the perspective of its IR
 * application, the {@link Locale} for the collator is likewise not critical and
 * PRIMARY strength will produce significantly shorter Unicode sort keys.
 * <p>
 * The term frequency within that literal is an optional property associated
 * with each term identifier, as is the computed weight for the token in the
 * term.
 * <p>
 * Note: Documents should be tokenized using an {@link Analyzer} appropriate for
 * their declared language code (if any). However, once tokenized, the language
 * code is discarded and we perform search purely on the Unicode sort keys
 * resulting from the extracted tokens.
 * <h2>Scale-out</h2>
 * <p>
 * Because the first component in the key is the token, both updates (when
 * indexing a document) and queries (reading against different tokens) will be
 * scattered across shards. Therefore it is not necessary to register a split
 * handler for the full text index.
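 * <p>
 * A minimal sketch of creating and populating the index (the document
 * identifier type, namespace, timestamp, and sample values below are
 * assumptions for illustration only):
 * 
 * <pre>
 * // given an IIndexManager (e.g., a Journal) and a namespace for the index.
 * final Properties properties = new Properties();
 * 
 * final FullTextIndex&lt;Long&gt; ndx = new FullTextIndex&lt;Long&gt;(indexManager,
 *         "myapp", ITx.UNISOLATED, properties);
 * 
 * ndx.create(); // registers the backing B+Tree.
 * 
 * // Index one field of one document, then flush the buffer to the index.
 * final TokenBuffer&lt;Long&gt; buffer = new TokenBuffer&lt;Long&gt;(1, ndx);
 * 
 * ndx.index(buffer, 1L, 0, "en", new StringReader("The quick brown fox"));
 * 
 * buffer.flush();
 * </pre>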
 * 
 * @todo The key for the terms index is {term,docId,fieldId}. Since the data are
 *       not pre-aggregated by {docId,fieldId} we can not easily remove only
 *       those tuples corresponding to some document (or some field of some
 *       document).
 *       <p>
 *       In order to remove the fields for a document we need to either know
 *       which fields were indexed for the document and the tokens found in
 *       those fields and then scatter the removal request (additional space
 *       requirements), or flood a delete procedure across the terms index
 *       (expensive).
 * 
 * @todo provide M/R alternatives for indexing or computing/updating global
 *       weights.
 * 
 * @todo Consider model in which fields are declared and then a "Document" is
 *       indexed. This lets us encapsulate the "driver" for indexing. The
 *       "field" can be a String or a Reader, etc.
 *       <p>
 *       Note that lucene handles declaration of the data that will be stored
 *       for a field on a per Document basis {none, character offsets, character
 *       offsets + token positions}. There is also an option to store the term
 *       vector itself. Finally, there are options to store, compress+store, or
 *       not store the field value. You can also choose {None, IndexTokenized,
 *       IndexUntokenized} and an option dealing with norms.
 * 
 * @todo lucene {@link Analyzer}s may be problematic. For example, it is
 *       difficult to tokenize numbers. consider replacing the lucene
 *       analyzer/tokenizer with our own stuff. this might help with
 *       tokenization of numbers, etc. and with tokenization of native html or
 *       xml with intact offsets.
 * 
 * @todo lucene analyzers will strip stopwords by default. There should be a
 *       configuration option to strip out stopwords and another to enable
 *       stemming. how we do that should depend on the language family.
 *       Likewise, there should be support for language family specific stopword
 *       lists and language family specific exclusions.
 * 
 * @todo support more term weighting schemes and make them easy to configure.
 * 
 * @param <V>
 *            The generic type of the document identifier.
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class FullTextIndex<V extends Comparable<V>> extends AbstractRelation {

    final private static transient Logger log = Logger.getLogger(FullTextIndex.class);

    /**
     * The backing index.
     */
    volatile private IIndex ndx;

    /**
     * The index used to associate term identifiers with tokens parsed from
     * documents.
     */
    public IIndex getIndex() {

        if (ndx == null) {

            synchronized (this) {

                ndx = getIndex(getNamespace() + "." + NAME_SEARCH);

                if (ndx == null)
                    throw new IllegalStateException();

            }

        }

        return ndx;

    }

    /**
     * Options understood by the {@link FullTextIndex}.
     * 
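     * <p>
     * For example, a caller might configure the index with properties such as
     * the following (the values shown are purely illustrative):
     * 
     * <pre>
     * final Properties properties = new Properties();
     * 
     * // Keep the existing tuple when a {term,doc,field} key is re-written.
     * properties.setProperty(FullTextIndex.Options.OVERWRITE, "false");
     * 
     * // Include the optional fieldId component in the keys.
     * properties.setProperty(FullTextIndex.Options.FIELDS_ENABLED, "true");
     * 
     * // Abandon query term readers after 1000ms and report partial results.
     * properties.setProperty(FullTextIndex.Options.INDEXER_TIMEOUT, "1000");
     * </pre>
     * 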
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    public interface Options {

        /**
         * <code>indexer.overwrite</code> - boolean option (default
         * <code>true</code>) controls the behavior when a write is requested
         * on the index and the {term,doc,field} tuple which forms the key is
         * already present in the index. When <code>true</code>, the new
         * value will be written on the index. When <code>false</code>, the
         * existing value will be retained. This option is an optimization which
         * makes sense when the corpus (a) only grows; and (b) the content of
         * the documents in the corpus never changes. For example, this is true
         * for an RDF database since the set of terms only grows and each term
         * is immutable.
         */
        String OVERWRITE = FullTextIndex.class.getName() + ".overwrite";

        String DEFAULT_OVERWRITE = "true";

        /**
         * Specify the collator {@link StrengthEnum strength} for the full-text
         * index (default {@value StrengthEnum#Primary}).
         * <p>
         * Note: {@link StrengthEnum#Primary} is generally what you want for a
         * full text index as search will consider tokens which differ in case
         * and other subtle features to be the same token (a 'match').
         * 
         * @see KeyBuilder.Options#STRENGTH
         */
        String INDEXER_COLLATOR_STRENGTH = FullTextIndex.class.getName() + ".collator.strength";

        String DEFAULT_INDEXER_COLLATOR_STRENGTH = StrengthEnum.Primary.toString();

        /**
         * The maximum time in milliseconds that the search engine will await
         * completion of the tasks reading on each of the query terms (default
         * {@value #DEFAULT_INDEXER_TIMEOUT}). A value of ZERO (0) means NO
         * timeout and is equivalent to a value of {@link Long#MAX_VALUE}. If
         * the timeout expires before all tasks complete then the search results
         * will only reflect partial information.
         */
        String INDEXER_TIMEOUT = FullTextIndex.class.getName() + ".timeout";

        String DEFAULT_INDEXER_TIMEOUT = "0";

        /**
         * When <code>true</code>, the <code>fieldId</code> is stored as part of
         * the key (default {@value #DEFAULT_FIELDS_ENABLED}). When
         * <code>false</code>, each key will be four bytes shorter. Applications
         * which do not use <code>fieldId</code> should disable it when
         * creating the {@link FullTextIndex}.
         */
        String FIELDS_ENABLED = FullTextIndex.class.getName() + ".fieldsEnabled";

        String DEFAULT_FIELDS_ENABLED = "false";

        //        /**
        //         * When <code>true</code>, the <code>localTermWeight</code> is stored
        //         * using double-precision. When <code>false</code>, it is stored using
        //         * single-precision.
        //         */
        //        String DOUBLE_PRECISION = FullTextIndex.class.getName()
        //                + ".doublePrecision";
        //
        //        String DEFAULT_DOUBLE_PRECISION = "false";

        /**
         * The name of the {@link IAnalyzerFactory} class which will be used to
         * obtain analyzers when tokenizing documents and queries (default
         * {@value #DEFAULT_ANALYZER_FACTORY_CLASS}).  The specified class MUST
         * implement {@link IAnalyzerFactory} and MUST have a constructor with
         * the following signature:
         * <pre>
         * public MyAnalyzerFactory(FullTextIndex indexer)
         * </pre>
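         * <p>
         * A hypothetical factory implementation might look like the following
         * (the {@link IAnalyzerFactory} method signature is taken from its
         * usage in this class; the choice of Lucene analyzer is merely an
         * example):
         * 
         * <pre>
         * public class MyAnalyzerFactory implements IAnalyzerFactory {
         * 
         *     public MyAnalyzerFactory(final FullTextIndex indexer) {
         *     }
         * 
         *     public Analyzer getAnalyzer(final String languageCode,
         *             final boolean filterStopwords) {
         * 
         *         // Use the same analyzer regardless of the language code
         *         // (this sketch ignores filterStopwords).
         *         return new org.apache.lucene.analysis.standard.StandardAnalyzer(
         *                 org.apache.lucene.util.Version.LUCENE_CURRENT);
         * 
         *     }
         * 
         * }
         * </pre>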
         */
        String ANALYZER_FACTORY_CLASS = FullTextIndex.class.getName() + ".analyzerFactoryClass";

        String DEFAULT_ANALYZER_FACTORY_CLASS = DefaultAnalyzerFactory.class.getName();

        /**
         * We keep a small hit cache based on search parameters: search string +
         * prefixMatch + matchAllTerms. This defines the size of that cache.
         * The value should remain small.
         */
        String HIT_CACHE_SIZE = FullTextIndex.class.getName() + ".hitCacheSize";

        String DEFAULT_HIT_CACHE_SIZE = "10";

        /**
         * We keep a small hit cache based on search parameters: search string +
         * prefixMatch + matchAllTerms. This defines the timeout for values in
         * that cache (in milliseconds). The value should remain small.
         */
        String HIT_CACHE_TIMEOUT_MILLIS = FullTextIndex.class.getName() + ".hitCacheTimeoutMillis";

        /**
         * Default is 1 minute.
         */
        String DEFAULT_HIT_CACHE_TIMEOUT_MILLIS = String.valueOf(TimeUnit.MINUTES.toMillis(1));

    }

    /**
     * @see Options#OVERWRITE
     */
    private final boolean overwrite;

    /**
     * Return the value configured by the {@link Options#OVERWRITE} property.
     */
    public boolean isOverwrite() {

        return overwrite;

    }

    /**
     * @see Options#INDEXER_TIMEOUT
     */
    private final long timeout;

    //    /**
    //     * @see Options#FIELDS_ENABLED
    //     */
    //    private final boolean fieldsEnabled;
    //
    //    /**
    //     * @see Options#DOUBLE_PRECISION
    //     */
    //    private final boolean doublePrecision;
    //
    //    /**
    //     * Return the value configured by the {@link Options#FIELDS_ENABLED}
    //     * property.
    //     */
    //    public boolean isFieldsEnabled() {
    //        
    //        return fieldsEnabled;
    //        
    //    }

    /**
     * @see Options#ANALYZER_FACTORY_CLASS
     */
    private final IAnalyzerFactory analyzerFactory;

    /**
     * See {@link Options#HIT_CACHE_SIZE}.
     */
    private final int hitCacheSize;

    /**
     * See {@link Options#HIT_CACHE_TIMEOUT_MILLIS}.
     */
    private final long hitCacheTimeoutMillis;

    /**
     * See {@link Options#HIT_CACHE_SIZE}.
     */
    private final ConcurrentWeakValueCacheWithTimeout<FullTextQuery, Hit<V>[]> cache;

    //    /**
    //     * @see Options#DOCID_FACTORY_CLASS
    //     */
    //    private final IKeyBuilderExtension<V> docIdFactory;

    //    /**
    //     * The concrete {@link IRecordBuilder} instance.
    //     */
    //    private final IRecordBuilder<V> recordBuilder;
    //    
    //    /**
    //     * Return the object responsible for encoding and decoding the tuples
    //     * in the full text index.
    //     */
    //    public final IRecordBuilder<V> getRecordBuilder() {
    //        
    //        return recordBuilder;
    //        
    //    }

    /**
     * The basename of the search index.
     */
    public static final transient String NAME_SEARCH = "search";

    /**
     * <code>true</code> unless {@link #getTimestamp()} is {@link ITx#UNISOLATED}.
     */
    final public boolean isReadOnly() {

        return TimestampUtility.isReadOnly(getTimestamp());

    }

    //    protected void assertWritable() {
    //        
    //        if(isReadOnly()) {
    //            
    //            throw new IllegalStateException("READ_ONLY");
    //            
    //        }
    //        
    //    }

    /**
     * Ctor specified by {@link DefaultResourceLocator}.
     * 
     * @param indexManager
     *            The index manager.
     * @param namespace
     *            The namespace of the full text index.
     * @param timestamp
     *            The timestamp of the view.
     * @param properties
     *            Configuration properties. See {@link Options}.
     * 
     * @see Options
     */
    public FullTextIndex(final IIndexManager indexManager, final String namespace, final Long timestamp,
            final Properties properties) {

        super(indexManager, namespace, timestamp, properties);

        {

            overwrite = Boolean.parseBoolean(properties.getProperty(Options.OVERWRITE, Options.DEFAULT_OVERWRITE));

            if (log.isInfoEnabled())
                log.info(Options.OVERWRITE + "=" + overwrite);

        }

        {

            timeout = Long
                    .parseLong(properties.getProperty(Options.INDEXER_TIMEOUT, Options.DEFAULT_INDEXER_TIMEOUT));

            if (log.isInfoEnabled())
                log.info(Options.INDEXER_TIMEOUT + "=" + timeout);

        }

        //        {
        //
        //            fieldsEnabled = Boolean.parseBoolean(properties.getProperty(
        //                    Options.FIELDS_ENABLED, Options.DEFAULT_FIELDS_ENABLED));
        //
        //            if (log.isInfoEnabled())
        //                log.info(Options.FIELDS_ENABLED + "=" + fieldsEnabled);
        //
        //        }
        //
        //        {
        //
        //            doublePrecision = Boolean
        //                    .parseBoolean(properties.getProperty(
        //                            Options.DOUBLE_PRECISION,
        //                            Options.DEFAULT_DOUBLE_PRECISION));
        //
        //            if (log.isInfoEnabled())
        //                log.info(Options.DOUBLE_PRECISION + "=" + doublePrecision);
        //
        //        }

        {

            hitCacheSize = Integer
                    .parseInt(properties.getProperty(Options.HIT_CACHE_SIZE, Options.DEFAULT_HIT_CACHE_SIZE));

            if (log.isInfoEnabled())
                log.info(Options.HIT_CACHE_SIZE + "=" + hitCacheSize);

        }

        {

            hitCacheTimeoutMillis = Long.parseLong(properties.getProperty(Options.HIT_CACHE_TIMEOUT_MILLIS,
                    Options.DEFAULT_HIT_CACHE_TIMEOUT_MILLIS));

            if (log.isInfoEnabled())
                log.info(Options.HIT_CACHE_TIMEOUT_MILLIS + "=" + hitCacheTimeoutMillis);

        }

        this.cache = new ConcurrentWeakValueCacheWithTimeout<FullTextQuery, Hit<V>[]>(hitCacheSize,
                hitCacheTimeoutMillis);

        {

            final String className = getProperty(Options.ANALYZER_FACTORY_CLASS,
                    Options.DEFAULT_ANALYZER_FACTORY_CLASS);

            if (log.isInfoEnabled())
                log.info(Options.ANALYZER_FACTORY_CLASS + "=" + className);

            final Class<IAnalyzerFactory> cls;
            try {
                cls = (Class<IAnalyzerFactory>) Class.forName(className);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Bad option: " + Options.ANALYZER_FACTORY_CLASS, e);
            }

            if (!IAnalyzerFactory.class.isAssignableFrom(cls)) {
                throw new RuntimeException(
                        Options.ANALYZER_FACTORY_CLASS + ": Must extend: " + IAnalyzerFactory.class.getName());
            }

            try {

                final Constructor<? extends IAnalyzerFactory> ctor = cls
                        .getConstructor(new Class[] { FullTextIndex.class });

                // save reference.
                analyzerFactory = ctor.newInstance(new Object[] { this });

            } catch (Exception ex) {

                throw new RuntimeException(ex);

            }

        }

        /*
         * Note: defer resolution of the index.
         */
        //        // resolve index (might not exist, in which case this will be null).
        //        ndx = getIndex(getNamespace()+"."+NAME_SEARCH);

    }

    /**
     * Conditionally registers the necessary index(s).
     * 
     * @throws IllegalStateException
     *             if the client does not have write access.
     * 
     * @todo this is not using {@link #acquireExclusiveLock()} since I generally
     *       allocate the text index inside of another relation and
     *       {@link #acquireExclusiveLock()} is not reentrant for zookeeper.
     */
    /*
     * Note: BigdataRDFFullTextIndex overrides this method to setup IV support.
     */
    @Override
    public void create() {

        assertWritable();

        final String name = getNamespace() + "." + NAME_SEARCH;

        final IIndexManager indexManager = getIndexManager();

        //        final IResourceLock resourceLock = acquireExclusiveLock();
        //
        //        try {

        /*
         * Register a tuple serializer that knows how to unpack the values and
         * how to extract the bytes corresponding to the encoded text (they can
         * not be decoded) from the key and how to extract the document and field
         * identifiers from the key.
         */
        final Properties p = getProperties();

        final IndexMetadata indexMetadata = new IndexMetadata(indexManager, p, name, UUID.randomUUID(),
                IndexTypeEnum.BTree);

        /*
         * Override the collator strength property to use the configured
         * value or the default for the text indexer rather than the
         * standard default. This is done because you typically want to
         * recognize only Primary differences for text search while you
         * often want to recognize more differences when generating keys for
         * a B+Tree.
         * 
         * Note: The choice of the language and country for the collator
         * should not matter much for this purpose since the total ordering
         * is not used except to scan all entries for a given term, so the
         * relative ordering between terms does not matter.
         */
        final IKeyBuilderFactory keyBuilderFactory;
        {

            final Properties tmp = new Properties(p);

            tmp.setProperty(KeyBuilder.Options.STRENGTH,
                    p.getProperty(Options.INDEXER_COLLATOR_STRENGTH, Options.DEFAULT_INDEXER_COLLATOR_STRENGTH));

            keyBuilderFactory = new DefaultKeyBuilderFactory(tmp);

        }

        final boolean fieldsEnabled = Boolean
                .parseBoolean(p.getProperty(Options.FIELDS_ENABLED, Options.DEFAULT_FIELDS_ENABLED));

        if (log.isInfoEnabled())
            log.info(Options.FIELDS_ENABLED + "=" + fieldsEnabled);

        //            final boolean doublePrecision = Boolean.parseBoolean(p
        //                    .getProperty(Options.DOUBLE_PRECISION,
        //                            Options.DEFAULT_DOUBLE_PRECISION));
        //    
        //            if (log.isInfoEnabled())
        //                log.info(Options.DOUBLE_PRECISION + "=" + doublePrecision);

        indexMetadata.setTupleSerializer(new FullTextIndexTupleSerializer<V>(keyBuilderFactory, //
                DefaultTupleSerializer.getDefaultLeafKeysCoder(), //
                EmptyRabaValueCoder.INSTANCE, //
                fieldsEnabled//
        ));

        indexManager.registerIndex(indexMetadata);

        if (log.isInfoEnabled())
            log.info("Registered new text index: name=" + name);

        /*
         * Note: defer resolution of the index.
         */
        //            ndx = getIndex(name);

        //        } finally {
        //
        //            unlock(resourceLock);
        //
        //        }

    }

    public void destroy() {

        if (log.isInfoEnabled())
            log.info("");

        assertWritable();

        final IIndexManager indexManager = getIndexManager();

        final IResourceLock resourceLock = acquireExclusiveLock();

        try {

            indexManager.dropIndex(getNamespace() + "." + NAME_SEARCH);

        } finally {

            unlock(resourceLock);

        }

    }

    /**
     * Return the token analyzer to be used for the given language code.
     * 
     * @param languageCode
     *            The language code or <code>null</code> to use the default
     *            {@link Locale}.
     * 
     * @return The token analyzer best suited to the indicated language family.
     */
    protected Analyzer getAnalyzer(final String languageCode, final boolean filterStopwords) {

        return analyzerFactory.getAnalyzer(languageCode, filterStopwords);

    }

    /**
     * Return a {@link ThreadLocal} {@link IKeyBuilder} instance configured to
     * support full text indexing and search.
     * 
     * @see Options#INDEXER_COLLATOR_STRENGTH
     */
    protected final IKeyBuilder getKeyBuilder() {

        return getIndex().getIndexMetadata().getKeyBuilder();

    }

    /**
     * See {@link #index(TokenBuffer, long, int, String, Reader, boolean)}.
     * <p>
     * Uses a default filterStopwords value of <code>true</code>.
     */
    public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId, final String languageCode,
            final Reader r) {

        index(buffer, docId, fieldId, languageCode, r, true/* filterStopwords */);

    }

    /**
     * Index a field in a document.
     * <p>
     * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
     * overflows, then there will be an index write. Once the caller is done
     * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
     * remaining in their <i>buffer</i> to the indices.
     * <p>
     * Note: If a document is pre-existing, then the existing data for that
     * document MUST be removed unless you know that the fields to be found in
     * the document will not have changed (they may have different contents, but
     * the same fields exist in the old and new versions of the document).
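     * <p>
     * For example (a sketch; the docId, field identifiers, buffer capacity, and
     * text variables below are illustrative assumptions):
     * 
     * <pre>
     * final TokenBuffer&lt;Long&gt; buffer = new TokenBuffer&lt;Long&gt;(2, ndx);
     * 
     * ndx.index(buffer, 12L, 0, "en", new StringReader(title), true);
     * ndx.index(buffer, 12L, 1, "en", new StringReader(abstractText), true);
     * 
     * // Force any buffered tuples onto the index.
     * buffer.flush();
     * </pre>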
     * 
     * @param buffer
     *            Used to buffer writes onto the text index.
     * @param docId
     *            The document identifier.
     * @param fieldId
     *            The field identifier.
     * @param languageCode
     *            The language code -or- <code>null</code> to use the default
     *            {@link Locale}.
     * @param r
     *            A reader on the text to be indexed.
     * @param filterStopwords
     *            if true, filter stopwords from the token stream            
     * 
     * @see TokenBuffer#flush()
     */
    public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId, final String languageCode,
            final Reader r, final boolean filterStopwords) {

        /*
         * Note: You can invoke this on a read-only index. It is only overflow
         * of the TokenBuffer that requires a writable index. Overflow itself
         * will only occur on {document,field} tuple boundaries, so it will
         * never overflow when indexing a search query.
         */
        //        assertWritable();

        int n = 0;

        // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
        final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);

        try {

            while (tokenStream.incrementToken()) {

                final TermAttribute term = tokenStream.getAttribute(TermAttribute.class);

                buffer.add(docId, fieldId, term.term());

                n++;

            }

        } catch (IOException ioe) {

            throw new RuntimeException(ioe);

        }

        if (log.isInfoEnabled())
            log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId=" + fieldId);

    }

    /**
     * Tokenize text using an {@link Analyzer} that is appropriate to the
     * specified language family.
     * 
     * @param languageCode
     *            The language code -or- <code>null</code> to use the default
     *            {@link Locale}.
     * 
     * @param r
     *            A reader on the text to be indexed.
     *            
     * @param filterStopwords
     *            if true, filter stopwords from the token stream            
     * 
     * @return The extracted token stream.
     */
    protected TokenStream getTokenStream(final String languageCode, final Reader r, final boolean filterStopwords) {

        /*
         * Note: This is stripping out stopwords by default.
         * 
         * @todo is it using a language family specific stopword list?
         */
        final Analyzer a = getAnalyzer(languageCode, filterStopwords);

        TokenStream tokenStream = a.tokenStream(null/* @todo field? */, r);

        // force to lower case.
        tokenStream = new LowerCaseFilter(tokenStream);

        return tokenStream;

    }

    /**
     * Performs a full text search against indexed documents returning a hit
     * list.
     * <p>
     * The basic algorithm computes cosine between the term-frequency vector of
     * the query and the indexed "documents". The cosine may be directly
     * interpreted as the "relevance" of a "document" to the query. The query
     * and document term-frequency vectors are normalized, so the cosine values
     * are bounded in [0.0:1.0]. The higher the cosine the more relevant the
     * document is to the query. A cosine of less than .4 is rarely of any
     * interest.
     * <p>
     * The implementation creates and runs a set of parallel tasks, one for each
     * distinct token found in the query, and waits for those tasks to complete
     * or for a timeout to occur. Each task uses a key-range scan on the terms
     * index, collecting metadata for the matching "documents" and aggregating
     * it on a "hit" for that document. Since the tasks run concurrently, there
     * are concurrent writers on the "hits". On a timeout, the remaining tasks
     * are interrupted.
     * <p>
     * The collection of hits is scored and hits that fail a threshold are
     * discarded. The remaining hits are placed into a total order and the
     * caller is returned an iterator which can read from that order. If the
     * operation is interrupted, then only those {@link IHit}s that have already
     * been computed will be returned.
     * 
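     * <p>
     * A minimal sketch of issuing a query (the single-argument
     * {@link FullTextQuery} constructor, the iteration style, and the hit
     * accessor shown are assumptions for illustration):
     * 
     * <pre>
     * final Hiterator&lt;Hit&lt;Long&gt;&gt; itr = ndx.search(new FullTextQuery(
     *         "brown fox"));
     * 
     * while (itr.hasNext()) {
     * 
     *     final Hit&lt;Long&gt; hit = itr.next();
     * 
     *     // e.g., hit.getCosine() reports the relevance of the document.
     * 
     * }
     * </pre>
     * 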
     * @param query
     *            The query (it will be parsed into tokens).
     * @param languageCode
     *            The language code that should be used when tokenizing the
     *            query -or- <code>null</code> to use the default
     *            {@link Locale}.
     * @param minCosine
     *            The minimum cosine that will be returned.
     * @param maxCosine
     *            The maximum cosine that will be returned.
     * @param minRank
     *            The min rank of the search result.
     * @param maxRank
     *            The max rank of the search result.
     * @param prefixMatch
     *            When <code>true</code>, the matches will be on tokens which
     *            include the query tokens as a prefix. This includes exact
     *            matches as a special case when the prefix is the entire token,
     *            but it also allows longer matches. For example,
     *            <code>free</code> will be an exact match on <code>free</code>
     *            but a partial match on <code>freedom</code>. When
     *            <code>false</code>, only exact matches will be made.
     * @param matchAllTerms
     *            if true, return only hits that match all search terms
     * @param timeout
     *            The timeout -or- ZERO (0) for NO timeout (this is equivalent
     *            to using {@link Long#MAX_VALUE}).
     * @param unit
     *            The unit in which the timeout is expressed.
     * 
     * @return The hit list.
     * 
     * @todo Allow search within field(s). This will be a filter on the range
     *       iterator that is sent to the data service such that the search
     *       terms are visited only when they occur in the matching field(s).
     */
    public Hiterator<Hit<V>> search(final FullTextQuery query) {

        final Hit<V>[] a = _search(query);

        return new Hiterator<Hit<V>>(a);

    }

    /**
     * Perform a range count on a full text query.
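     * <p>
     * For example (the single-argument {@link FullTextQuery} constructor is an
     * assumption for illustration):
     * 
     * <pre>
     * final int nhits = ndx.count(new FullTextQuery("brown fox"));
     * </pre>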
     */
    public int count(final FullTextQuery query) {

        if (cache.containsKey(query)) {

            if (log.isInfoEnabled())
                log.info("found hits in cache");

            return cache.get(query).length;

        } else {

            if (log.isInfoEnabled())
                log.info("did not find hits in cache");

        }

        // tokenize the query.
        final TermFrequencyData<V> qdata = tokenize(query);

        // No terms after stopword extraction
        if (qdata == null) {

            cache.put(query, new Hit[] {});

            return 0;

        }

        /*
         * We can run an optimized version of this (just a quick range count)
         * but only if the caller does not care about exact match and has
         * not specified a regex.
         */
        if (qdata.distinctTermCount() == 1 && !query.isMatchExact() && query.getMatchRegex() == null) {

            final boolean prefixMatch = query.isPrefixMatch();

            final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();

            final String termText = e.getKey();

            final ITermMetadata md = e.getValue();

            final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, prefixMatch,
                    md.getLocalTermWeight(), this);

            return (int) task1.getRangeCount();

        } else {

            final Hit<V>[] a = _search(query);

            return a.length;

        }

    }

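    /**
     * Tokenize the query, returning a normalized token-frequency vector for
     * the query -or- <code>null</code> if no tokens remain after stopword
     * filtering.
     */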
    protected TermFrequencyData<V> tokenize(final FullTextQuery query) {

        final String q = query.getQuery();
        final String languageCode = query.getLanguageCode();
        final boolean prefixMatch = query.isPrefixMatch();

        // tokenize the query.
        final TermFrequencyData<V> qdata;
        {

            final TokenBuffer<V> buffer = new TokenBuffer<V>(1, this);

            /*
             * If we are using prefix match ('*' operator) then we don't want to
             * filter stopwords from the search query.
             */
            final boolean filterStopwords = !prefixMatch;

            index(buffer, //
                    null, // docId // was Long.MIN_VALUE
                    Integer.MIN_VALUE, // fieldId
                    languageCode, //
                    new StringReader(q), //
                    filterStopwords//
            );

            if (buffer.size() == 0) {

                /*
                 * There were no terms after stopword extraction.
                 */

                log.warn("No terms after stopword extraction: query=" + query);

                return null;

            }

            qdata = buffer.get(0);

            qdata.normalize();

        }

        return qdata;

    }

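    /**
     * Core search implementation. Consults the hit cache, tokenizes the query,
     * reads against the index for each distinct query token, applies the
     * matchAllTerms / matchExact / regex pruning, rank orders the hits by
     * relevance, and finally returns the slice selected by the min/max cosine
     * and min/max rank constraints.
     */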
    public Hit<V>[] _search(final FullTextQuery query) {

        final String queryStr = query.getQuery();
        final String languageCode = query.getLanguageCode();
        final boolean prefixMatch = query.isPrefixMatch();
        final double minCosine = query.getMinCosine();
        final double maxCosine = query.getMaxCosine();
        final int minRank = query.getMinRank();
        final int maxRank = query.getMaxRank();
        final boolean matchAllTerms = query.isMatchAllTerms();
        final boolean matchExact = query.isMatchExact();
        final String regex = query.getMatchRegex();
        long timeout = query.getTimeout();
        final TimeUnit unit = query.getTimeUnit();

        final long begin = System.currentTimeMillis();

        //        if (languageCode == null)
        //            throw new IllegalArgumentException();

        if (queryStr == null)
            throw new IllegalArgumentException();

        if (minCosine < 0d || minCosine > 1d)
            throw new IllegalArgumentException();

        if (minRank <= 0 || maxRank <= 0)
            throw new IllegalArgumentException();

        if (minRank > maxRank)
            throw new IllegalArgumentException();

        if (timeout < 0L)
            throw new IllegalArgumentException();

        if (unit == null)
            throw new IllegalArgumentException();

        if (log.isInfoEnabled())
            log.info("languageCode=[" + languageCode + "], text=[" + queryStr + "], minCosine=" + minCosine
                    + ", maxCosine=" + maxCosine + ", minRank=" + minRank + ", maxRank=" + maxRank
                    + ", matchAllTerms=" + matchAllTerms + ", prefixMatch=" + prefixMatch + ", timeout=" + timeout
                    + ", unit=" + unit);

        if (timeout == 0L) {

            // treat ZERO as equivalent to MAX_LONG.
            timeout = Long.MAX_VALUE;

        }

        final FullTextQuery cacheKey = query;

        Hit<V>[] a;

        if (cache.containsKey(cacheKey)) {

            if (log.isInfoEnabled())
                log.info("found hits in cache");

            a = cache.get(cacheKey);

        } else {

            if (log.isInfoEnabled())
                log.info("did not find hits in cache");

            // tokenize the query.
            final TermFrequencyData<V> qdata = tokenize(query);

            // No terms after stopword extraction
            if (qdata == null) {

                cache.put(cacheKey, a = new Hit[] {});

                return a;

            }

            a = executeQuery(qdata, prefixMatch, timeout, unit);

            if (a.length == 0) {

                log.info("No hits: languageCode=[" + languageCode + "], query=[" + queryStr + "]");

                cache.put(cacheKey, a);

                return a;

            }

            /*
             * If match all is specified, remove any hits with a term count less
             * than the number of search tokens.  It's also an optimization to
             * run the pruning if we're going to do matchExact.
             */
            if ((matchAllTerms || matchExact) && qdata.distinctTermCount() > 1) {

                final int nterms = qdata.terms.size();

                if (log.isInfoEnabled()) {
                    log.info("matchAll=true, nterms=" + nterms);
                    log.info("size before: " + a.length);
                }

                final Hit<V>[] tmp = new Hit[a.length];

                int i = 0;
                for (Hit<V> hit : a) {

                    if (hit.getTermCount() == nterms) {
                        tmp[i++] = hit;
                    }

                }

                if (log.isDebugEnabled()) {
                    log.debug(i);
                }

                if (i < a.length) {

                    a = new Hit[i];
                    System.arraycopy(tmp, 0, a, 0, i);

                }

            }

            /*
             * Delegate match exact to subclasses.
             */
            if (matchExact) {

                a = matchExact(a, queryStr);

            }

            if (a.length == 0) {

                log.warn("No hits after matchAllTerms pruning: languageCode=[" + languageCode + "], query=["
                        + queryStr + "]");

                cache.put(cacheKey, a);

                return a;

            }

            /*
             * Do regex matching.
             */
            if (regex != null) {

                final Pattern pattern = Pattern.compile(regex);//, Pattern.CASE_INSENSITIVE);

                if (log.isDebugEnabled()) {
                    log.debug("hits before regex: " + a.length);
                }

                a = applyRegex(a, pattern);

                if (log.isDebugEnabled()) {
                    log.debug("hits after regex: " + a.length);
                }

            }

            if (a.length == 0) {

                log.warn("No hits after regex pruning: languageCode=[" + languageCode + "], query=[" + queryStr
                        + "], regex=[" + regex + "]");

                cache.put(cacheKey, a);

                return a;

            }

            /*
             * Rank order the hits by relevance.
             * 
             * @todo consider moving documents through a succession of N pools where
             * N is the #of distinct terms in the query. The read tasks would halt
             * if the size of the pool for N terms reached maxRank. This might (or
             * might not) help with triage since we could process hits by pool and
             * only compute the cosines for one pool at a time until we had enough
             * hits.
             */

            if (log.isInfoEnabled())
                log.info("Rank ordering " + a.length + " hits by relevance");

            final long start = System.currentTimeMillis();

            Arrays.sort(a);

            if (log.isInfoEnabled()) {
                final long sortTime = System.currentTimeMillis() - start;
                log.info("sort time: " + sortTime);
            }

            for (int i = 0; i < a.length; i++) {
                a[i].setRank(i + 1);
            }

            cache.put(cacheKey, a);

        }

        /*
         * Take a slice of the hits based on min/max cosine and min/max rank.
         */
        a = slice(query, a);

        final long elapsed = System.currentTimeMillis() - begin;

        if (log.isInfoEnabled())
            log.info("Done: " + a.length + " hits in " + elapsed + "ms");

        return a;

    }

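    /**
     * Take a slice of the rank-ordered hits based on the min/max cosine and
     * min/max rank constraints of the query.
     */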
    protected Hit<V>[] slice(final FullTextQuery query, Hit<V>[] a) {

        final double minCosine = query.getMinCosine();
        final double maxCosine = query.getMaxCosine();
        final int minRank = query.getMinRank();
        final int maxRank = query.getMaxRank();

        //        if (log.isDebugEnabled()) {
        //           log.debug("before min/max cosine/rank pruning:");
        //           for (Hit<V> h : a)
        //              log.debug(h);
        //        }

        /*
         * If maxCosine is specified, prune the hits that are above the max
         */
        if (maxCosine < 1.0d) {

            // find the first occurrence of a hit that is <= maxCosine
            int i = 0;
            for (Hit<V> h : a) {
                if (h.getCosine() <= maxCosine)
                    break;
                i++;
            }

            // no hits with relevance at or below maxCosine
            if (i == a.length) {

                return new Hit[] {};

            } else {

                // copy the hits from that first occurrence to the end
                final Hit<V>[] tmp = new Hit[a.length - i];
                System.arraycopy(a, i, tmp, 0, tmp.length);

                a = tmp;

            }

        }

        /*
         * If minCosine is specified, prune the hits that are below the min
         */
        if (minCosine > 0.0d) {

            // find the first occurrence of a hit that is < minCosine
            int i = 0;
            for (Hit<V> h : a) {
                if (h.getCosine() < minCosine)
                    break;
                i++;
            }

            // no hits with relevance at or above minCosine
            if (i == 0) {

                return new Hit[] {};

            } else if (i < a.length) {

                // copy the hits from 0 up to that first occurrence
                final Hit<V>[] tmp = new Hit[i];
                System.arraycopy(a, 0, tmp, 0, tmp.length);

                a = tmp;

            }

        }

        // exactly one hit
        if (minRank > 0 && minRank == maxRank) {

            if (minRank > a.length) {

                // out of range
                return new Hit[] {};

            } else {

                // in range
                return new Hit[] { a[minRank - 1] };

            }

        }

        /*
         * If minRank is specified, prune the hits that rank higher than the min
         */
        if (minRank > 1) {

            // no hits above minRank
            if (minRank > a.length) {

                return new Hit[] {};

            } else {

                // copy the hits from the minRank to the end
                final Hit<V>[] tmp = new Hit[a.length - (minRank - 1)];
                System.arraycopy(a, minRank - 1, tmp, 0, tmp.length);

                a = tmp;

            }

        }

        final int newMax = maxRank - minRank + 1;

        if (log.isDebugEnabled())
            log.debug("new max rank: " + newMax);

        /*
         * If maxRank is specified, prune the hits that rank lower than the max
         */
        if (newMax < a.length) {

            // copy the hits from the start up to the new max rank
            final Hit<V>[] tmp = new Hit[newMax];
            System.arraycopy(a, 0, tmp, 0, tmp.length);

            a = tmp;

        }

        return a;

    }

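    /**
     * Run a {@link ReadIndexTask} for each distinct query token, aggregating
     * the metadata for the matching documents into an {@link IHitCollector},
     * and return the collected hits. The tasks are run in parallel, subject to
     * the caller's timeout.
     */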
    protected Hit<V>[] executeQuery(final TermFrequencyData<V> qdata, final boolean prefixMatch, final long timeout,
            final TimeUnit unit) {

        final IHitCollector<V> hits;

        if (qdata.distinctTermCount() == 1) {

            final Map.Entry<String, ITermMetadata> e = qdata.getSingletonEntry();

            final String termText = e.getKey();

            final ITermMetadata md = e.getValue();

            final CountIndexTask<V> task1 = new CountIndexTask<V>(termText, 0, 1, prefixMatch,
                    md.getLocalTermWeight(), this);

            hits = new SingleTokenHitCollector<V>(task1);

        } else {

            final List<CountIndexTask<V>> tasks = new ArrayList<CountIndexTask<V>>(qdata.distinctTermCount());

            int i = 0;
            for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {

                final String termText = e.getKey();

                final ITermMetadata md = e.getValue();

                tasks.add(new CountIndexTask<V>(termText, i++, qdata.terms.size(), prefixMatch,
                        md.getLocalTermWeight(), this));

            }

            hits = new MultiTokenHitCollector<V>(tasks);

        }

        // run the queries.
        {

            final List<Callable<Object>> tasks = new ArrayList<Callable<Object>>(qdata.distinctTermCount());

            int i = 0;
            for (Map.Entry<String, ITermMetadata> e : qdata.terms.entrySet()) {

                final String termText = e.getKey();

                final ITermMetadata md = e.getValue();

                tasks.add(new ReadIndexTask<V>(termText, i++, qdata.terms.size(), prefixMatch,
                        md.getLocalTermWeight(), this, hits));

            }

            final ExecutionHelper<Object> executionHelper = new ExecutionHelper<Object>(getExecutorService(),
                    timeout, unit);

            try {

                final long start = System.currentTimeMillis();

                executionHelper.submitTasks(tasks);

                if (log.isInfoEnabled()) {
                    final long readTime = System.currentTimeMillis() - start;
                    log.info("read time: " + readTime);
                }

            } catch (InterruptedException ex) {

                if (log.isInfoEnabled()) {
                    log.info("Interrupted - aborting the search.");
                }

                /*
                 * Wrap and re-throw the interrupt. Otherwise we would go on
                 * through the heavy-weight filtering operations below
                 * (matchExact or matchRegex) even though the ExecutionHelper
                 * above had already received an interrupt.
                 */
                throw new RuntimeException(ex);

            } catch (ExecutionException ex) {

                throw new RuntimeException(ex);

            }

        }

        return hits.getHits();

    }

    /**
     * Subclasses can override this method to do exact match processing.  This
     * involves materializing the hits into their original text values and
     * checking for the query string in the materialized value.  Not possible
     * from the base class.  The value-centric RDF version can use the
     * lexicon to materialize the hits and check them for exact match.
     */
    protected Hit<V>[] matchExact(final Hit<V>[] hits, final String query) {

        throw new UnsupportedOperationException();

    }

    /**
     * Subclasses can override this method to do regex post-processing. This
     * involves materializing the hits into their original text values and
     * checking against the regex string in the materialized value. Not possible
     * from the base class. The value-centric RDF version can use the lexicon to
     * materialize the hits and check them against the regex.
     */
    protected Hit<V>[] applyRegex(final Hit<V>[] hits, final Pattern regex) {

        throw new UnsupportedOperationException();

    }

    /*
     * @todo implement the relevant methods.
     */

    @SuppressWarnings("unchecked")
    public long delete(IChunkedOrderedIterator itr) {
        throw new UnsupportedOperationException();
    }

    @SuppressWarnings("unchecked")
    public long insert(IChunkedOrderedIterator itr) {
        throw new UnsupportedOperationException();
    }

    public Set<String> getIndexNames() {
        throw new UnsupportedOperationException();
    }

    public Iterator getKeyOrders() {
        throw new UnsupportedOperationException();
    }

    @SuppressWarnings("unchecked")
    public Object newElement(List a, IBindingSet bindingSet) {
        throw new UnsupportedOperationException();
    }

    public Class<?> getElementClass() {
        throw new UnsupportedOperationException();
    }

    public IKeyOrder getPrimaryKeyOrder() {
        throw new UnsupportedOperationException();
    }

    public IKeyOrder getKeyOrder(IPredicate p) {
        throw new UnsupportedOperationException();
    }

}