lux.index.analysis.XmlTokenStreamBase.java Source code

Introduction

Here is the source code for lux.index.analysis.XmlTokenStreamBase.java
Source

package lux.index.analysis;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;

import lux.index.IndexConfiguration;
import lux.index.XmlIndexer;
import lux.index.attribute.QNameAttribute;

import net.sf.saxon.om.NamePool;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.XdmNode;
import net.sf.saxon.s9api.XdmSequenceIterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * <p>
 * This is the root of a set of xml-aware TokenStream classes that work by selecting text
 * a node at a time from an XML document, and then
 * passing that text to the wrapped TokenStream.  The wrapped TokenStream is re-used for each text node.
 * The outermost link in the chain will be a TokenFilter that applies a sequence of structure-related
 * Attributes to each text token (ie a list of QNames, but can be any kind of structural attribute
 * that should be composed with each text token).
 * </p>
 * <p>
 * The token stream topology is: this( this.wrapped (this.tokenizer ))
 * For example, for the element-text field we have ElementTokenStream (a subclass of this class):
 * </p>
 * <blockquote>
 * <code>ElementTokenStream (QNameTokenFilter (LowerCaseFilter (StandardTokenizer)))</code>
 * </blockquote>
 * <p>
 * We can't follow the standard Lucene pattern of Analyzer as a factory for a TokenStream
 * since we want to be able to extend any arbitrary textual Analyzer, but the constraints
 * of the Analyzer class design prevent it from being extended in a straightforward manner.
 * Thus we have essentially an outer (XML) stream wrapping an inner (Text) stream.
 * </p>
 * <p>
 * Element visibility overrides are stored in a map keyed by integer namecodes allocated from a
 * Saxon {@link NamePool}; names given as Clark names are translated to namecodes before lookup.
 * </p>
 *
 * FIXME: make the constructor protected; allow construction only through static builders
 * defined on each derived class.  This will enable us to hide the complexity of wrapping the
 * token stream, which is the same pattern for each of these; only the classes vary.  But we
 * can't do the work in the constructor due to Java structural issues.
 */
public abstract class XmlTokenStreamBase extends TokenStream {

    private final String fieldName;
    // The analyzer creates the wrapped TokenStream/Tokenizer that does the text analysis
    private final Analyzer analyzer;
    private TokenStream wrapped;
    // The pool used to turn Clark names into the namecodes that key eltVis; null until one is known
    private NamePool namePool;
    protected XdmNode curNode;
    protected Iterator<XdmNode> contentIter; // retrieves the nodes with text to index
    protected CharTermAttribute termAtt;
    protected Reader charStream = new OffsetCharFilter(new StringReader(""));
    protected ElementVisibility defVis;
    protected HashMap<Integer, ElementVisibility> eltVis; // namecode -> explicitly-specified visibility
    protected final QNameAttribute qnameAtt;
    protected final QNameTokenFilter qnameTokenFilter;
    protected static final XdmSequenceIterator EMPTY = new EmptyXdmIterator(null);

    /**
     * @param fieldName the name of the field being analyzed; passed to the analyzer whenever the
     * text stream is reset
     * @param analyzer the analyzer from which the wrapped token stream is re-obtained on each reset
     * @param wrapped the text token stream; its attributes are shared with this stream
     * @param processor supplies the NamePool used to allocate namecodes for element names. It is
     * required when wrapped is a QNameTokenFilter, and otherwise only consulted if it is not null.
     */
    XmlTokenStreamBase(String fieldName, Analyzer analyzer, TokenStream wrapped, Processor processor) {
        super(wrapped);
        this.wrapped = wrapped;
        this.fieldName = fieldName;
        this.analyzer = analyzer;
        termAtt = addAttribute(CharTermAttribute.class);
        eltVis = new HashMap<Integer, ElementVisibility>();
        // FIXME - don't use QNameTokenFilter for this -- that handles prefixing tokens
        // use instead an XmlVisibilityFilter that encapsulates the logic currently in ElementTokenStream
        if (wrapped instanceof QNameTokenFilter) {
            qnameTokenFilter = (QNameTokenFilter) wrapped;
            defVis = qnameTokenFilter.getDefaultVisibility();
            namePool = processor.getUnderlyingConfiguration().getNamePool();
            // copy the filter's name-keyed settings into the namecode-keyed map
            for (Entry<String, ElementVisibility> entry : qnameTokenFilter.getElementVisibility().entrySet()) {
                int namecode = namePool.allocateClarkName(entry.getKey());
                eltVis.put(namecode, entry.getValue());
            }
        } else {
            defVis = ElementVisibility.OPAQUE;
            // use the parameter directly rather than the overridable getter: a subclass
            // override would run before the subclass has been initialized
            qnameTokenFilter = new QNameTokenFilter(wrapped);
            if (processor != null) {
                namePool = processor.getUnderlyingConfiguration().getNamePool();
            }
        }
        qnameAtt = qnameTokenFilter.addAttribute(QNameAttribute.class);
    }

    /**
     * Re-attaches the analyzer's token stream to the current character stream, and then resets
     * the wrapped token stream.
     */
    @Override
    public void reset() throws IOException {
        reset(charStream);
        wrapped.reset();
    }

    /** Closes the wrapped token stream. */
    @Override
    public void close() throws IOException {
        wrapped.close();
    }

    /**
     * Closes the wrapped stream and asks the analyzer for a token stream reading from the given reader.
     * @param reader the source of the text to be tokenized next
     * @throws IOException if closing the wrapped stream or obtaining the token stream fails
     */
    public void reset(Reader reader) throws IOException {
        close();
        // The call is made for its effect on the analyzer's re-usable stream; the returned
        // object is only inspected by the assertion below.
        TokenStream reused = analyzer.tokenStream(fieldName, reader);
        // This must be the same token stream: ie the Analyzer must be re-usable, and the
        // original token stream must have arisen from it.  We don't check for actual
        // identity with wrapped since that might get wrapped again (eg w/QNameTokenFilter).
        assert (reused.getAttribute(CharTermAttribute.class) == wrapped.getAttribute(CharTermAttribute.class));
    }

    /**
     * Advance the iteration by looping through the following:
     * 1) next text node
     * 2) next token in text
     * 3) next ancestor element node
     * @return true if a token is available, false when the tokens of every node have been consumed
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        // next token in current node, or failing that, the next node with a token
        return incrementWrappedTokenStream() || advanceToTokenNode();
    }

    /**
     * @return the underlying stream of text tokens to which additional xml-related attributes are added by this.
     */
    public TokenStream getWrappedTokenStream() {
        return wrapped;
    }

    protected void setWrappedTokenStream(TokenStream wrapped) {
        this.wrapped = wrapped;
    }

    /**
     * Advances the wrapped stream to its next token with a non-empty term, skipping empty ones.
     * @return true if such a token was found, false if the wrapped stream is exhausted
     */
    protected boolean incrementWrappedTokenStream() throws IOException {
        while (wrapped.incrementToken()) {
            if (termAtt.length() > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Moves to the next content node that is not hidden and for which resetTokenizer succeeds.
     * @return true if such a node was found; false if there are no more nodes, or if no content
     * iterator has been set
     */
    private boolean advanceToTokenNode() {
        if (contentIter == null) {
            // nothing to iterate over: report end of stream instead of throwing NullPointerException
            return false;
        }
        while (contentIter.hasNext()) {
            curNode = contentIter.next();
            if (!updateNodeAtts()) {
                continue; // hidden node
            }
            // hand the node's text content to the tokenizer
            NodeInfo nodeInfo = curNode.getUnderlyingNode();
            if (resetTokenizer(nodeInfo.getStringValueCS())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Called with the string value of each node that is not hidden.
     * @param cs the text content of the current node
     * @return true if the current node should be reported as having a token; false to move on
     * to the next node
     */
    abstract boolean resetTokenizer(CharSequence cs);

    /** @return false if the node is hidden */
    abstract boolean updateNodeAtts();

    /**
     * @param clarkName the name of an element as a clarkName ({namespace}name)
     * @return the explicitly-specified visibility of the element name, or null if the element has the default
     * visibility. Also null if clarkName is null, or if no NamePool is known with which to translate the
     * name into a namecode.
     */
    public ElementVisibility getElementVisibility(String clarkName) {
        if (clarkName == null || namePool == null) {
            return null;
        }
        // eltVis is keyed by namecode, so looking the String up directly could never match
        return eltVis.get(namePool.allocateClarkName(clarkName));
    }

    /**
     * @param namecode the name of an element as a namecode from a {@link net.sf.saxon.om.NamePool}
     * @param visibility the explicitly-specified visibility of the element name, or null to give the element the default
     * visibility.
     */
    public void setElementVisibility(int namecode, ElementVisibility visibility) {
        if (visibility == null) {
            eltVis.remove(namecode);
        } else {
            eltVis.put(namecode, visibility);
        }
    }

    /** @return the visibility of elements not explicitly specified using setElementVisibility.
     * Initially this is taken from the wrapped QNameTokenFilter, if there is one, and is
     * {@link ElementVisibility#OPAQUE} otherwise; it may be changed using setDefaultVisibility
     * or configureElementVisibility.
     */
    public ElementVisibility getDefaultVisibility() {
        return defVis;
    }

    public void setDefaultVisibility(ElementVisibility vis) {
        this.defVis = vis;
    }

    /**
     * Applies the indexer's configuration: sets namespace-awareness on the QName filter, fills in the
     * default visibility if none is set, and adds the configured element visibilities. Settings already
     * present in this stream take precedence over those from the configuration.
     * @param indexer supplies the index configuration and the processor whose NamePool allocates namecodes
     */
    public void configureElementVisibility(XmlIndexer indexer) {
        IndexConfiguration config = indexer.getConfiguration();
        // qnameTokenFilter is final and assigned on every constructor path, so it is never null here
        qnameTokenFilter.setNamespaceAware(config.isOption(IndexConfiguration.NAMESPACE_AWARE));
        NamePool pool = indexer.getProcessor().getUnderlyingConfiguration().getNamePool();
        if (namePool == null) {
            // no pool was captured at construction; remember this one for lookups by name
            namePool = pool;
        }
        if (defVis == null) {
            defVis = config.getDefaultVisibility();
        }
        for (Entry<String, ElementVisibility> e : config.getVisibilityMap().entrySet()) {
            int namecode = pool.allocateClarkName(e.getKey());
            if (!eltVis.containsKey(namecode)) {
                eltVis.put(namecode, e.getValue());
            }
        }
    }
}

/*
 * This Source Code Form is subject to the terms of the Mozilla Public License,
 * v. 2.0. If a copy of the MPL was not distributed with this file, You can
 * obtain one at http://mozilla.org/MPL/2.0/.
 */