/*
* Created on Feb 9, 2005
*
*/
package com.sun.portal.wireless.htmlconversion;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.swing.text.html.parser.ParserDelegator;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import com.sun.portal.wireless.htmlconversion.servlet.URLTranscoder;
import com.sun.portal.log.common.PortalLogger;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Public API for this package that converts HTML input to AML output.
 * <p>
 * An instance wraps a single input document. The conversion itself is
 * performed lazily, at most once per instance, the first time
 * {@link #toAML()} is called.
 *
 * @author ashwin.mathew@sun.com
 */
public class HtmlConverter {
    /**
     * Unknown whether the document is HTML or XHTML, the API will try to
     * determine the document type.
     */
    public static final int DOCUMENT_TYPE_UNKNOWN = 0;
    /**
     * Force transformation of the document as HTML.
     */
    public static final int DOCUMENT_TYPE_HTML = 1;
    /**
     * Force transformation of the document as XHTML.
     */
    public static final int DOCUMENT_TYPE_XHTML = 2;

    private static final String AML_PAGE_START_TAG = "<AmlPage>";
    private static final String AML_PAGE_END_TAG = "</AmlPage>";
    private static final int AML_PAGE_START_TAG_LENGTH = AML_PAGE_START_TAG.length();
    private static final String DTD_START = "<!DOCTYPE";
    private static final char DTD_END = '>';
    private static final String XHTML_UPPER = "XHTML";
    private static final String XHTML_LOWER = "xhtml";
    private static final String FEATURE_VALIDATION = "http://xml.org/sax/features/validation";
    private static final String FEATURE_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
    private static final Logger logger = PortalLogger.getLogger("com.sun.portal.wireless.htmlconversion");

    // The input HTML
    private final String input;
    // The DOM document that the parser callbacks populate with the AML output
    private final Document output;
    // Whether toAML() strips the enclosing AmlPage wrapper from the result
    private final boolean isFragment;
    // Resolved document type; never DOCUMENT_TYPE_UNKNOWN after construction
    private final int documentType;
    // Optional; stays null unless setEncoder() is called before toAML()
    private URLTranscoder encoder;
    // Set once transform() has completed successfully, so it only runs once
    private boolean isTransformed = false;

    /**
     * Constructs a new HtmlConverter which tries to determine the document type
     * itself.
     *
     * @param input
     *            The input HTML content to be transformed. Must not be null.
     * @param isFragment
     *            Whether or not the output AML content is a whole AML page
     *            (with AmlDocument and AmlPage tags) or is just a fragment of
     *            AML to be embedded on a larger AML page.
     * @throws HtmlConversionException
     *             if the empty output document cannot be created.
     */
    public HtmlConverter(String input, boolean isFragment) throws HtmlConversionException {
        this(input, isFragment, DOCUMENT_TYPE_UNKNOWN);
    }

    /**
     * Constructs a new HtmlConverter for the specified document type.
     *
     * @param input
     *            The input HTML content to be transformed. Must not be null.
     * @param isFragment
     *            Whether or not the output AML content is a whole AML page
     *            (with AmlDocument and AmlPage tags) or is just a fragment of
     *            AML to be embedded on a larger AML page.
     * @param documentType
     *            The type of the document, HTML, XHTML or unknown, must be one
     *            of the DOCUMENT_TYPE_* constants defined on this class. Any
     *            value other than DOCUMENT_TYPE_UNKNOWN and DOCUMENT_TYPE_HTML
     *            is processed as XHTML.
     * @throws HtmlConversionException
     *             if the empty output document cannot be created.
     */
    public HtmlConverter(String input, boolean isFragment, int documentType) throws HtmlConversionException {
        this.input = input;
        this.isFragment = isFragment;
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("Transforming HTML [" + input + "]");
        }
        try {
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            output = builder.newDocument();
        } catch (Exception ex) {
            logger.log(Level.SEVERE, "Error converting HTML", ex);
            throw new HtmlConversionException(HtmlConversionException.XML_ERROR, ex);
        }
        this.documentType = (documentType == DOCUMENT_TYPE_UNKNOWN)
                ? detectDocumentType(input)
                : documentType;
    }

    /**
     * Creates and sets the URLTranscoder that is handed to the parser state
     * when the document is transformed. To have any effect this must be
     * called before the first call to {@link #toAML()}.
     *
     * @param request
     *            the servlet request passed to the new URLTranscoder.
     * @param response
     *            the servlet response passed to the new URLTranscoder.
     */
    public void setEncoder(HttpServletRequest request, HttpServletResponse response) {
        encoder = new URLTranscoder(request, response);
    }

    // Determines whether the input document is HTML or XHTML.
    // The current mechanism only checks for the presence of
    // the string "XHTML" or "xhtml" in the opening DTD specification.
    // This may have to be reimplemented later to be a little more
    // sophisticated, for example, by checking whether or not image
    // and input tags in the document have a closing "/>" instead of
    // just ">".
    private static int detectDocumentType(String html) {
        if (!html.startsWith(DTD_START)) {
            return DOCUMENT_TYPE_HTML;
        }
        int endIndex = html.indexOf(DTD_END);
        if (endIndex == -1) {
            // Unterminated DOCTYPE: substring(0, -1) would throw, so fall
            // back to the lenient HTML parser instead.
            return DOCUMENT_TYPE_HTML;
        }
        String dtd = html.substring(0, endIndex);
        boolean mentionsXhtml = dtd.indexOf(XHTML_UPPER) != -1 || dtd.indexOf(XHTML_LOWER) != -1;
        return mentionsXhtml ? DOCUMENT_TYPE_XHTML : DOCUMENT_TYPE_HTML;
    }

    /**
     * Returns the transformed AML output.
     * <p>
     * The input is parsed on the first call only; every call re-serializes
     * the resulting document. When this converter was created with
     * {@code isFragment} set, everything up to and including the first
     * {@code <AmlPage>} tag and everything from the last {@code </AmlPage>}
     * tag onwards is removed from the serialized text.
     *
     * @return the serialized AML.
     * @throws HtmlConversionException
     *             if parsing the input or serializing the output fails.
     */
    public String toAML() throws HtmlConversionException {
        transform();
        String amlOutput = serializeOutput();
        if (isFragment) {
            // Rip off the AmlDocument and AmlPage tags
            amlOutput = stripPageWrapper(amlOutput);
        }
        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("Transformed HTML to AML [" + amlOutput + "]");
        }
        return amlOutput;
    }

    // Serializes the output DOM to a string using an identity transformer.
    private String serializeOutput() throws HtmlConversionException {
        TransformerFactory tFactory = TransformerFactory.newInstance();
        Transformer transformer;
        try {
            transformer = tFactory.newTransformer();
        } catch (TransformerConfigurationException tce) {
            logger.log(Level.SEVERE, "Error converting HTML", tce);
            throw new HtmlConversionException(HtmlConversionException.XML_ERROR, tce);
        }
        StringWriter amlDoc = new StringWriter();
        try {
            transformer.transform(new DOMSource(output), new StreamResult(amlDoc));
        } catch (TransformerException te) {
            logger.log(Level.SEVERE, "Error converting HTML", te);
            throw new HtmlConversionException(HtmlConversionException.XML_ERROR, te);
        }
        return amlDoc.toString();
    }

    // Returns the text between the first <AmlPage> and the last </AmlPage>.
    // The text is returned untouched unless both tags are present, so a
    // missing end tag can no longer trigger substring(0, -1).
    private static String stripPageWrapper(String aml) {
        int startIndex = aml.indexOf(AML_PAGE_START_TAG);
        if (startIndex == -1) {
            return aml;
        }
        String afterStart = aml.substring(startIndex + AML_PAGE_START_TAG_LENGTH);
        int endIndex = afterStart.lastIndexOf(AML_PAGE_END_TAG);
        if (endIndex == -1) {
            return aml;
        }
        return afterStart.substring(0, endIndex);
    }

    // Parses the input with the parser matching documentType, then asks the
    // layout manager to reform the layout. Runs at most once per instance;
    // a failed run leaves isTransformed false.
    private void transform() throws HtmlConversionException {
        if (isTransformed) {
            return;
        }
        ParserState state = new ParserState(output, encoder);
        GenericHtmlParserCallback genericCallback = new GenericHtmlParserCallback(state);
        if (documentType == DOCUMENT_TYPE_HTML) {
            parseAsHtml(genericCallback);
        } else {
            // documentType == DOCUMENT_TYPE_XHTML
            parseAsXhtml(genericCallback);
        }
        // Now flatten the tables and reform document structure
        state.getLayoutManager().reformLayout();
        isTransformed = true;
    }

    // Feeds the input through the Swing HTML parser.
    private void parseAsHtml(GenericHtmlParserCallback genericCallback) throws HtmlConversionException {
        StringReader inputReader = new StringReader(input);
        HtmlParserCallback callback = new HtmlParserCallback(genericCallback);
        try {
            // Third argument 'true' tells the parser to ignore any charset
            // directive in the document.
            new ParserDelegator().parse(inputReader, callback, true);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Error converting HTML", e);
            throw new HtmlConversionException(HtmlConversionException.TRANSFORMATION_ERROR, e);
        }
    }

    // Feeds the input through a non-validating SAX parser.
    private void parseAsXhtml(GenericHtmlParserCallback genericCallback) throws HtmlConversionException {
        XhtmlParserCallback callback = new XhtmlParserCallback(genericCallback);
        XMLReader parser;
        try {
            parser = XMLReaderFactory.createXMLReader();
        } catch (SAXException saxEx) {
            logger.log(Level.SEVERE, "Error converting HTML", saxEx);
            throw new HtmlConversionException(HtmlConversionException.XML_ERROR, saxEx);
        }
        // NOTE(review): entity resolution is delegated to the callback, whose
        // implementation is not visible here - confirm it does not fetch
        // arbitrary external entities when the input is untrusted.
        parser.setContentHandler(callback);
        parser.setDTDHandler(callback);
        parser.setEntityResolver(callback);
        parser.setErrorHandler(callback);
        // Each feature is set independently so that a parser rejecting one
        // of them still gets the other applied.
        setFeatureIfSupported(parser, FEATURE_VALIDATION, false);
        setFeatureIfSupported(parser, FEATURE_LOAD_EXTERNAL_DTD, false);
        InputSource inputSource = new InputSource(new StringReader(input));
        try {
            parser.parse(inputSource);
        } catch (SAXException saxEx) {
            logger.log(Level.SEVERE, "Error converting HTML", saxEx);
            throw new HtmlConversionException(HtmlConversionException.TRANSFORMATION_ERROR, saxEx);
        } catch (IOException ioEx) {
            logger.log(Level.SEVERE, "Error converting HTML", ioEx);
            throw new HtmlConversionException(HtmlConversionException.TRANSFORMATION_ERROR, ioEx);
        }
    }

    // Best-effort feature setter: a parser that does not recognize or
    // support the feature is logged at WARNING and processing continues.
    private static void setFeatureIfSupported(XMLReader parser, String feature, boolean value) {
        try {
            parser.setFeature(feature, value);
        } catch (SAXNotRecognizedException saxEx) {
            logger.log(Level.WARNING, "Error converting HTML", saxEx);
        } catch (SAXNotSupportedException saxEx) {
            logger.log(Level.WARNING, "Error converting HTML", saxEx);
        }
    }
}
|