net.bible.service.format.osistohtml.osishandlers.OsisToCanonicalTextSaxHandler.java Source code

Java tutorial

Introduction

Here is the source code for net.bible.service.format.osistohtml.osishandlers.OsisToCanonicalTextSaxHandler.java

Source

package net.bible.service.format.osistohtml.osishandlers;

import net.bible.service.common.Logger;
import net.bible.service.format.osistohtml.taghandler.TagHandlerHelper;

import org.apache.commons.lang.StringUtils;
import org.crosswire.jsword.book.OSISUtil;
import org.xml.sax.Attributes;

import java.util.Stack;

/**
 * Convert OSIS input into Canonical text (used when creating search index)
 * 
 * @author Martin Denham [mjdenham at gmail dot com]
 * @see gnu.lgpl.License for license details.<br>
 *      The copyright to this program is held by its author.
 */
public class OsisToCanonicalTextSaxHandler extends OsisSaxHandler {

    @SuppressWarnings("unused")
    private int currentVerseNo;

    private Stack<CONTENT_STATE> writeContentStack = new Stack<>();

    private enum CONTENT_STATE {
        WRITE, IGNORE
    }

    // Avoid space at the start and, extra space between words
    private boolean spaceJustWritten = true;

    private static final Logger log = new Logger("OsisToCanonicalTextSaxHandler");

    public OsisToCanonicalTextSaxHandler() {
        super();
    }

    @Override
    public void startDocument() {
        reset();
        // default mode is to write
        writeContentStack.push(CONTENT_STATE.WRITE);
    }

    /*
     *Called when the Parser Completes parsing the Current XML File.
    */
    @Override
    public void endDocument() {
        // pop initial value
        writeContentStack.pop();

        // assert
        if (!writeContentStack.isEmpty()) {
            log.warn("OsisToCanonicalTextSaxHandler context stack should now be empty");
        }
    }

    /*
     * Called when the starting of the Element is reached. For Example if we have Tag
     * called <Title> ... </Title>, then this method is called when <Title> tag is
     * Encountered while parsing the Current XML File. The AttributeList Parameter has
     * the list of all Attributes declared for the Current Element in the XML File.
    */
    @Override
    public void startElement(String namespaceURI, String sName, // simple name
            String qName, // qualified name
            Attributes attrs) {
        String name = getName(sName, qName); // element name

        debug(name, attrs, true);

        // if encountering either a verse tag or if the current tag is marked as being canonical then turn on writing
        if (isAttrValue(attrs, "canonical", "true")) {
            writeContentStack.push(CONTENT_STATE.WRITE);
        } else if (name.equals(OSISUtil.OSIS_ELEMENT_VERSE)) {
            if (attrs != null) {
                currentVerseNo = TagHandlerHelper.osisIdToVerseNum(attrs.getValue("", OSISUtil.OSIS_ATTR_OSISID));
            }
            writeContentStack.push(CONTENT_STATE.WRITE);
        } else if (name.equals(OSISUtil.OSIS_ELEMENT_NOTE)) {
            writeContentStack.push(CONTENT_STATE.IGNORE);
        } else if (name.equals(OSISUtil.OSIS_ELEMENT_TITLE)) {
            writeContentStack.push(CONTENT_STATE.IGNORE);
        } else if (name.equals(OSISUtil.OSIS_ELEMENT_REFERENCE)) {
            // text content of top level references should be output but in notes it should not
            writeContentStack.push(writeContentStack.peek());
        } else if (name.equals(OSISUtil.OSIS_ELEMENT_L) || name.equals(OSISUtil.OSIS_ELEMENT_LB)
                || name.equals(OSISUtil.OSIS_ELEMENT_P)) {
            // these occur in Psalms to separate different paragraphs.  
            // A space is needed for TTS not to be confused by punctuation with a missing space like 'toward us,and the'
            write(" ");
            //if writing then continue.  Also if ignoring then continue
            writeContentStack.push(writeContentStack.peek());
        } else {
            // unknown tags rely on parent tag to determine if content is canonical e.g. the italic tag in the middle of canonical text
            writeContentStack.push(writeContentStack.peek());
        }
    }

    /*
     * Called when the Ending of the current Element is reached. For example in the
     * above explanation, this method is called when </Title> tag is reached
    */
    @Override
    public void endElement(String namespaceURI, String sName, // simple name
            String qName // qualified name
    ) {
        String name = getName(sName, qName);
        debug(name, null, false);
        if (name.equals(OSISUtil.OSIS_ELEMENT_VERSE)) {
            // A space is needed to separate one verse from the next, otherwise the 2 verses butt up against each other
            // which looks bad and confuses TTS
            write(" ");
        }

        // now this tag has ended pop the write/ignore state for the parent tag
        writeContentStack.pop();
    }

    /*
     * Handle characters encountered in tags
    */
    @Override
    public void characters(char buf[], int offset, int len) {
        if (CONTENT_STATE.WRITE.equals(writeContentStack.peek())) {
            String s = new String(buf, offset, len);
            write(s);
        }
    }

    @Override
    protected void write(String s) {
        // reduce amount of whitespace becasue a lot of space was occurring between verses in ESVS and several other books
        if (!StringUtils.isWhitespace(s)) {
            super.write(s);
            spaceJustWritten = false;
        } else if (!spaceJustWritten) {
            super.write(" ");
            spaceJustWritten = true;
        }
    }

    protected void writeContent(boolean writeContent) {
        if (writeContent) {
            writeContentStack.push(CONTENT_STATE.WRITE);
        } else {
            writeContentStack.push(CONTENT_STATE.IGNORE);
        }
    }
}