// HtmlConverter.java :  » Portal » Open-Portal » com » sun » portal » wireless » htmlconversion » Java Open Source
//
// Java Open Source » Portal » Open Portal
// Open Portal » com » sun » portal » wireless » htmlconversion » HtmlConverter.java
/*
 * Created on Feb 9, 2005
 *
 */
package com.sun.portal.wireless.htmlconversion;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;

import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.swing.text.html.parser.ParserDelegator;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

import com.sun.portal.wireless.htmlconversion.servlet.URLTranscoder;
import com.sun.portal.log.common.PortalLogger;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Public API for this package that converts HTML input to AML output.
 *
 * <p>
 * Typical use: construct a converter around the HTML (or XHTML) text,
 * optionally call {@link #setEncoder} and then call {@link #toAML()} to
 * obtain the serialized AML. The parse is performed at most once per
 * successfully transformed instance; later calls to {@link #toAML()} only
 * re-serialize the already built document.
 *
 * @author ashwin.mathew@sun.com
 */
public class HtmlConverter {

    /**
     * Unknown whether the document is HTML or XHTML, the API will try to
     * determine the document type.
     */
    public static final int DOCUMENT_TYPE_UNKNOWN = 0;

    /**
     * Force transformation of the document as HTML.
     */
    public static final int DOCUMENT_TYPE_HTML = 1;

    /**
     * Force transformation of the document as XHTML.
     */
    public static final int DOCUMENT_TYPE_XHTML = 2;

    // The input HTML
    private final String input;

    // The output AML
    private Document output;

    // Handed to the ParserState; stays null unless setEncoder() is called.
    private URLTranscoder encoder;

    // When true, toAML() returns only the content between the AmlPage tags.
    private final boolean isFragment;

    // One of the DOCUMENT_TYPE_* constants; resolved in the constructor when
    // the caller passes DOCUMENT_TYPE_UNKNOWN.
    private int documentType;

    // Set once transform() has completed successfully.
    private boolean isTransformed = false;

    private static final String AML_PAGE_START_TAG = "<AmlPage>";

    private static final String AML_PAGE_END_TAG = "</AmlPage>";

    private static final int AML_PAGE_START_TAG_LENGTH = AML_PAGE_START_TAG.length();

    private static final String DTD_START = "<!DOCTYPE";

    private static final char DTD_END = '>';

    private static final String XHTML_UPPER = "XHTML";

    private static final String XHTML_LOWER = "xhtml";

    private static final String FEATURE_VALIDATION = "http://xml.org/sax/features/validation";

    private static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd";

    private static final Logger logger = PortalLogger.getLogger("com.sun.portal.wireless.htmlconversion");

    /**
     * Constructs a new HtmlConverter which tries to determine the document type
     * itself.
     *
     * @param input
     *            The input HTML content to be transformed.
     * @param isFragment
     *            If true, {@link #toAML()} returns only the content found
     *            between the AmlPage tags, suitable for embedding on a larger
     *            AML page; if false the whole serialized document is returned.
     * @throws HtmlConversionException
     *             if the empty output document cannot be created.
     */
    public HtmlConverter(String input, boolean isFragment) throws HtmlConversionException {
        this(input, isFragment, DOCUMENT_TYPE_UNKNOWN);
    }

    /**
     * Constructs a new HtmlConverter for the specified document type.
     *
     * @param input
     *            The input HTML content to be transformed.
     * @param isFragment
     *            If true, {@link #toAML()} returns only the content found
     *            between the AmlPage tags, suitable for embedding on a larger
     *            AML page; if false the whole serialized document is returned.
     * @param documentType
     *            The type of the document, HTML, XHTML or unknown, must be one
     *            of the DOCUMENT_TYPE_* constants defined on this class. Any
     *            value other than DOCUMENT_TYPE_UNKNOWN and DOCUMENT_TYPE_HTML
     *            is processed as XHTML.
     * @throws HtmlConversionException
     *             with code XML_ERROR if the empty output document cannot be
     *             created.
     */
    public HtmlConverter(String input, boolean isFragment, int documentType) throws HtmlConversionException {
        this.input = input;
        this.isFragment = isFragment;
        this.documentType = documentType;

        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("Transforming HTML [" + input + "]");
        }

        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            output = builder.newDocument();
        } catch (Exception ex) {
            logger.log(Level.SEVERE, "Error converting HTML", ex);
            throw new HtmlConversionException(HtmlConversionException.XML_ERROR, ex);
        }

        if (documentType == DOCUMENT_TYPE_UNKNOWN) {
            checkDocumentType();
        }
    }

    /**
     * Creates the URLTranscoder for the given request/response pair and stores
     * it; it is passed to the parser state when the document is transformed,
     * so it only has an effect when called before the first {@link #toAML()}.
     *
     * @param request
     *            the current servlet request
     * @param response
     *            the current servlet response
     */
    public void setEncoder(HttpServletRequest request, HttpServletResponse response) {
        encoder = new URLTranscoder(request, response);
    }

    // Determines whether the input document is HTML or XHTML.
    // The current mechanism only checks for the presence of
    // the string "XHTML" or "xhtml" in the opening DTD specification,
    // and only when the input starts with "<!DOCTYPE" at offset 0.
    // This may have to be reimplemented later to be a little more
    // sophisticated, for example, by checking whether or not image
    // and input tags in the document have a closing "/>" instead of
    // just ">".
    private void checkDocumentType() {
        documentType = DOCUMENT_TYPE_HTML;

        if (!input.startsWith(DTD_START)) {
            return;
        }

        int endIndex = input.indexOf(DTD_END);
        if (endIndex == -1) {
            // Unterminated DOCTYPE declaration: there is nothing reliable to
            // inspect, so keep the HTML default instead of failing in
            // substring() with a negative end index.
            return;
        }

        String dtd = input.substring(0, endIndex);
        if (dtd.indexOf(XHTML_UPPER) != -1 || dtd.indexOf(XHTML_LOWER) != -1) {
            documentType = DOCUMENT_TYPE_XHTML;
        }
    }

    /**
     * Returns the transformed AML output.
     *
     * <p>
     * The input is parsed on the first successful call; the resulting document
     * is then serialized with an identity {@link Transformer}. For fragments,
     * everything up to and including the first AmlPage start tag and
     * everything from the last AmlPage end tag onwards is removed. If the
     * serialized output contains no AmlPage start tag it is returned as is.
     *
     * @return the serialized AML document or fragment
     * @throws HtmlConversionException
     *             with code TRANSFORMATION_ERROR if parsing the input fails,
     *             or XML_ERROR if the parser or serializer cannot be set up
     *             or the serialization fails.
     */
    public String toAML() throws HtmlConversionException {
        transform();

        TransformerFactory tFactory = TransformerFactory.newInstance();

        Transformer transformer = null;
        try {
            transformer = tFactory.newTransformer();
        } catch (TransformerConfigurationException tce) {
            logger.log(Level.SEVERE, "Error converting HTML", tce);
            throw new HtmlConversionException(HtmlConversionException.XML_ERROR, tce);
        }

        DOMSource source = new DOMSource(output);

        StringWriter amlDoc = new StringWriter();
        StreamResult result = new StreamResult(amlDoc);

        try {
            transformer.transform(source, result);
        } catch (TransformerException te) {
            logger.log(Level.SEVERE, "Error converting HTML", te);
            throw new HtmlConversionException(HtmlConversionException.XML_ERROR, te);
        }

        String amlOutput = amlDoc.toString();

        if (isFragment) {
            // Rip off the AmlDocument and AmlPage tags
            int amlPageStartIndex = amlOutput.indexOf(AML_PAGE_START_TAG);
            if (amlPageStartIndex != -1) {
                int contentStart = amlPageStartIndex + AML_PAGE_START_TAG_LENGTH;
                int amlPageEndIndex = amlOutput.lastIndexOf(AML_PAGE_END_TAG);

                if (amlPageEndIndex >= contentStart) {
                    amlOutput = amlOutput.substring(contentStart, amlPageEndIndex);
                } else {
                    // No end tag after the start tag: keep everything that
                    // follows the start tag rather than calling substring()
                    // with a negative end index.
                    amlOutput = amlOutput.substring(contentStart);
                }
            }
        }

        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("Transformed HTML to AML [" + amlOutput + "]");
        }

        return amlOutput;
    }

    // Parses the input into the output document, once. HTML goes through the
    // Swing ParserDelegator; every other document type is treated as XHTML
    // and goes through a SAX XMLReader. Both paths feed the same generic
    // callback, after which the layout manager reforms the document.
    private void transform() throws HtmlConversionException {
        if (isTransformed) {
            return;
        }

        ParserState state = new ParserState(output, encoder);
        GenericHtmlParserCallback genericCallback = new GenericHtmlParserCallback(state);

        // Check documentType and proceed.
        if (documentType == DOCUMENT_TYPE_HTML) {
            StringReader inputReader = new StringReader(input);

            HtmlParserCallback callback = new HtmlParserCallback(genericCallback);

            try {
                new ParserDelegator().parse(inputReader, callback, true);
            } catch (Exception e) {
                logger.log(Level.SEVERE, "Error converting HTML", e);
                throw new HtmlConversionException(HtmlConversionException.TRANSFORMATION_ERROR, e);
            }
        } else // documentType == DOCUMENT_TYPE_XHTML
        {
            XhtmlParserCallback callback = new XhtmlParserCallback(genericCallback);

            XMLReader parser = null;

            try {
                parser = XMLReaderFactory.createXMLReader();
            } catch (SAXException saxEx) {
                logger.log(Level.SEVERE, "Error converting HTML", saxEx);
                throw new HtmlConversionException(HtmlConversionException.XML_ERROR, saxEx);
            }

            parser.setContentHandler(callback);
            parser.setDTDHandler(callback);
            parser.setEntityResolver(callback);
            parser.setErrorHandler(callback);

            // Each feature is switched off on its own so that a parser which
            // rejects one of them still gets the other one applied.
            // NOTE(review): entity resolution is delegated to the callback;
            // confirm it does not fetch external entities for untrusted input.
            disableFeature(parser, FEATURE_VALIDATION);
            disableFeature(parser, FEATURE_LOAD_EXTERNAL_DTD);

            InputSource inputSource = new InputSource(new StringReader(input));

            try {
                parser.parse(inputSource);
            } catch (SAXException saxEx) {
                logger.log(Level.SEVERE, "Error converting HTML", saxEx);
                throw new HtmlConversionException(HtmlConversionException.TRANSFORMATION_ERROR, saxEx);
            } catch (IOException ioEx) {
                logger.log(Level.SEVERE, "Error converting HTML", ioEx);
                throw new HtmlConversionException(HtmlConversionException.TRANSFORMATION_ERROR, ioEx);
            }
        }

        // Now flatten the tables and reform document structure
        state.getLayoutManager().reformLayout();

        isTransformed = true;
    }

    // Turns the given SAX feature off. A parser that does not recognize or
    // support the feature is deliberately tolerated: the problem is logged
    // as a warning and processing continues.
    private static void disableFeature(XMLReader parser, String feature) {
        try {
            parser.setFeature(feature, false);
        } catch (SAXNotRecognizedException saxEx) {
            logger.log(Level.WARNING, "Error converting HTML", saxEx);
        } catch (SAXNotSupportedException saxEx) {
            logger.log(Level.WARNING, "Error converting HTML", saxEx);
        }
    }

}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.