groovyx.net.http.ParserRegistry.java Source code

Introduction

Here is the source code for groovyx.net.http.ParserRegistry.java
Source

/*
 * Copyright 2008-2011 Thomas Nichols.  http://blog.thomnichols.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * You are receiving this code free of charge, which represents many hours of
 * effort from other individuals and corporations.  As a responsible member
 * of the community, you are encouraged (but not required) to donate any
 * enhancements or improvements back to the community under a similar open
 * source license.  Thank you. -TMN
 */
package groovyx.net.http;

import groovy.json.JsonSlurper;
import groovy.lang.Closure;
import groovy.util.XmlSlurper;
import groovy.util.slurpersupport.GPathResult;
import groovyx.net.http.HTTPBuilder.RequestConfigDelegate;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.message.BasicHeader;
import org.apache.xml.resolver.Catalog;
import org.apache.xml.resolver.CatalogManager;
import org.apache.xml.resolver.tools.CatalogResolver;
import org.codehaus.groovy.runtime.MethodClosure;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * <p>Keeps track of response parsers for each content type.  Each parser
 * should should be a closure that accepts an {@link HttpResponse} instance,
 * and returns whatever handler is appropriate for reading the response
 * data for that content-type.  For example, a plain-text response should
 * probably be parsed with a <code>Reader</code>, while an XML response
 * might be parsed by an XmlSlurper, which would then be passed to the
 * response closure. </p>
 *
 * <p>Note that all methods in this class assume {@link HttpResponse#getEntity()}
 * return a non-null value.  It is the job of the HTTPBuilder instance to ensure
 * a NullPointerException is not thrown by passing a response that contains no
 * entity.</p>
 *
 * <p>You can see the list of content-type parsers that are built-in to the
 * ParserRegistry class in {@link #buildDefaultParserMap()}.</p>
 *
 * @see ContentType
 * @author <a href='mailto:tomstrummer+httpbuilder@gmail.com'>Tom Nichols</a>
 */
public class ParserRegistry {

    /**
     * The default parser used for unregistered content-types.  This is a copy
     * of {@link #parseStream(HttpResponse)}, which is like a no-op that just
     * returns the unaltered response stream.
     */
    protected final Closure DEFAULT_PARSER = new MethodClosure(this, "parseStream");
    /**
     * The default charset to use when no charset is given in the Content-Type
     * header of a response.  This can be modifid via {@link #setDefaultCharset(String)}.
     */
    public static final String DEFAULT_CHARSET = "UTF-8";

    private Closure defaultParser = DEFAULT_PARSER;
    private Map<String, Closure> registeredParsers = buildDefaultParserMap();
    private static String defaultCharset = DEFAULT_CHARSET;

    protected static final Log log = LogFactory.getLog(ParserRegistry.class);

    /**
     * This CatalogResolver is static to avoid the overhead of re-parsing
     * the catalog definition file every time.  Unfortunately, there's no
     * way to share a single Catalog instance between resolvers.  The
     * {@link Catalog} class is technically not thread-safe, but as long as you
     * do not parse catalog files while using the resolver, it should be fine.
     */
    protected static CatalogResolver catalogResolver;

    static {
        CatalogManager catalogManager = new CatalogManager();
        catalogManager.setIgnoreMissingProperties(true);
        catalogManager.setUseStaticCatalog(false);
        catalogManager.setRelativeCatalogs(true);
        try {
            catalogResolver = new CatalogResolver(catalogManager);
            catalogResolver.getCatalog().parseCatalog(ParserRegistry.class.getResource("/catalog/html.xml"));
        } catch (IOException ex) {
            LogFactory.getLog(ParserRegistry.class).warn("Could not resolve default XML catalog", ex);
        }
    }

    /**
     * Set the charset to use for parsing character streams when no charset
     * is given in the Content-Type header.
     * @param charset the charset to use, or <code>null</code> to use
     *     {@link #DEFAULT_CHARSET}
     */
    public static void setDefaultCharset(String charset) {
        defaultCharset = charset == null ? DEFAULT_CHARSET : charset;
    }

    /**
     * Helper method to get the charset from the response.  This should be done
     * when manually parsing any text response to ensure it is decoded using the
     * correct charset. For instance:<pre>
     * Reader reader = new InputStreamReader( resp.getEntity().getContent(),
     *   ParserRegistry.getCharset( resp ) );</pre>
     * @param resp
     */
    public static String getCharset(HttpResponse resp) {
        try {
            NameValuePair charset = resp.getEntity().getContentType().getElements()[0]
                    .getParameterByName("charset");

            if (charset == null || charset.getValue().trim().equals("")) {
                log.debug("Could not find charset in response; using " + defaultCharset);
                return defaultCharset;
            }

            return charset.getValue();
        } catch (RuntimeException ex) { // NPE or OOB Exceptions
            log.warn("Could not parse charset from content-type header in response");
            return Charset.defaultCharset().name();
        }
    }

    /**
     * Helper method to get the content-type string from the response
     * (no charset).
     * @param resp
     */
    public static String getContentType(HttpResponse resp) {
        if (resp.getEntity() == null)
            throw new IllegalArgumentException("Response does not contain data");
        if (resp.getEntity().getContentType() == null)
            throw new IllegalArgumentException("Response does not have a content-type header");
        try {
            return resp.getEntity().getContentType().getElements()[0].getName();
        } catch (RuntimeException ex) { // NPE or OOB Exceptions
            throw new IllegalArgumentException("Could not parse content-type from response");
        }
    }

    /**
     * Default parser used for binary data.  This simply returns the underlying
     * response InputStream.
     * @see ContentType#BINARY
     * @see HttpEntity#getContent()
     * @param resp
     * @return an InputStream the binary response stream
     * @throws IllegalStateException
     * @throws IOException
     */
    public InputStream parseStream(HttpResponse resp) throws IOException {
        return resp.getEntity().getContent();
    }

    /**
     * Default parser used to handle plain text data.  The response text
     * is decoded using the charset passed in the response content-type
     * header.
     * @see ContentType#TEXT
     * @param resp
     * @return
     * @throws UnsupportedEncodingException
     * @throws IllegalStateException
     * @throws IOException
     */
    public Reader parseText(HttpResponse resp) throws IOException {
        return new InputStreamReader(resp.getEntity().getContent(), ParserRegistry.getCharset(resp));
    }

    /**
     * Default parser used to decode a URL-encoded response.
     * @see ContentType#URLENC
     * @param resp
     * @return
     * @throws IOException
     */
    public Map<String, String> parseForm(final HttpResponse resp) throws IOException {
        HttpEntity entity = resp.getEntity();
        /* URLEncodedUtils won't parse the content unless the content-type is
           application/x-www-form-urlencoded.  Since we want to be able to force
           parsing regardless of what the content-type header says, we need to
           'spoof' the content-type if it's not already acceptable. */
        if (!ContentType.URLENC.toString().equals(ParserRegistry.getContentType(resp))) {
            entity = new HttpEntityWrapper(entity) {
                @Override
                public org.apache.http.Header getContentType() {
                    String value = ContentType.URLENC.toString();
                    String charset = ParserRegistry.getCharset(resp);
                    if (charset != null)
                        value += "; charset=" + charset;
                    return new BasicHeader("Content-Type", value);
                };
            };
        }
        List<NameValuePair> params = URLEncodedUtils.parse(entity);
        Map<String, String> paramMap = new HashMap<String, String>(params.size());
        for (NameValuePair param : params)
            paramMap.put(param.getName(), param.getValue());
        return paramMap;
    }

    /**
     * Parse an HTML document by passing it through the NekoHTML parser.
     * @see ContentType#HTML
     * @see org.cyberneko.html.parsers.SAXParser
     * @see XmlSlurper#parse(Reader)
     * @param resp HTTP response from which to parse content
     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parseHTML(HttpResponse resp) throws IOException, SAXException {
        XMLReader p = new org.cyberneko.html.parsers.SAXParser();
        p.setEntityResolver(catalogResolver);
        return new XmlSlurper(p).parse(parseText(resp));
    }

    /**
     * Default parser used to decode an XML response.
     * @see ContentType#XML
     * @see XmlSlurper#parse(Reader)
     * @param resp HTTP response from which to parse content
     * @return the {@link GPathResult} from calling {@link XmlSlurper#parse(Reader)}
     * @throws IOException
     * @throws SAXException
     * @throws ParserConfigurationException
     */
    public GPathResult parseXML(HttpResponse resp) throws IOException, SAXException, ParserConfigurationException {
        XmlSlurper xml = new XmlSlurper();
        xml.setEntityResolver(catalogResolver);
        return xml.parse(parseText(resp));
    }

    /**
     * Default parser used to decode a JSON response.
     * @see ContentType#JSON
     * @param resp
     * @return
     * @throws IOException
     */
    public Object parseJSON(HttpResponse resp) throws IOException {
        // there is a bug in the JsonSlurper.parse method...
        //String jsonTxt = DefaultGroovyMethods.getText( parseText( resp ) );
        return new JsonSlurper().parse(parseText(resp));
    }

    /**
     * <p>Returns a map of default parsers.  Override this method to change
     * what parsers are registered by default.  A 'parser' is really just a
     * closure that acceipts an {@link HttpResponse} instance and returns
     * some parsed data.  You can of course call
     * <code>super.buildDefaultParserMap()</code> and then add or remove
     * from that result as well.</p>
     *
     * <p>Default registered parsers are:
     * <ul>
     * <li>{@link ContentType#BINARY} :  {@link #parseStream(HttpResponse) parseStream()}</li>
     * <li>{@link ContentType#TEXT} :  {@link #parseText(HttpResponse) parseText()}</li>
     * <li>{@link ContentType#URLENC} :  {@link #parseForm(HttpResponse) parseForm()}</li>
     * <li>{@link ContentType#XML} :  {@link #parseXML(HttpResponse) parseXML()}</li>
     * <li>{@link ContentType#JSON} :  {@link #parseJSON(HttpResponse) parseJSON()}</li>
     * </ul>
     */
    protected Map<String, Closure> buildDefaultParserMap() {
        Map<String, Closure> parsers = new HashMap<String, Closure>();

        parsers.put(ContentType.BINARY.toString(), new MethodClosure(this, "parseStream"));
        parsers.put(ContentType.TEXT.toString(), new MethodClosure(this, "parseText"));
        parsers.put(ContentType.URLENC.toString(), new MethodClosure(this, "parseForm"));
        parsers.put(ContentType.HTML.toString(), new MethodClosure(this, "parseHTML"));

        Closure pClosure = new MethodClosure(this, "parseXML");
        for (String ct : ContentType.XML.getContentTypeStrings())
            parsers.put(ct, pClosure);

        pClosure = new MethodClosure(this, "parseJSON");
        for (String ct : ContentType.JSON.getContentTypeStrings())
            parsers.put(ct, pClosure);

        return parsers;
    }

    /**
     * Add a new XML catalog definiton to the static XML resolver catalog.
     * See the <a href='http://fisheye.codehaus.org/browse/gmod/httpbuilder/trunk/src/main/resources/catalog/html.xml?r=root:'>
     * HTTPBuilder source catalog</a> for an example.
     *
     * @param catalogLocation URL of a catalog definition file
     * @throws IOException if the given URL cannot be parsed or accessed for whatever reason.
     */
    public static void addCatalog(URL catalogLocation) throws IOException {
        catalogResolver.getCatalog().parseCatalog(catalogLocation);
    }

    /**
     * Access the default catalog used by all HTTPBuilder instances.
     * @return the static {@link CatalogResolver} instance
     */
    public static CatalogResolver getCatalogResolver() {
        return catalogResolver;
    }

    /**
     * Get the default parser used for unregistered content-types.
     * @return
     */
    public Closure getDefaultParser() {
        return this.defaultParser;
    }

    /**
     * Set the default parser used for unregistered content-types.
     * @param defaultParser if
     */
    public void setDefaultParser(Closure defaultParser) {
        if (defaultParser == null)
            this.defaultParser = DEFAULT_PARSER;
        this.defaultParser = defaultParser;
    }

    /**
     * Retrieve a parser for the given response content-type string.  This
     * is called by HTTPBuildre to retrieve the correct parser for a given
     * content-type.  The parser is then used to decode the response data prior
     * to passing it to a response handler.
     * @param contentType
     * @return parser that can interpret the given response content type,
     *   or the default parser if no parser is registered for the given
     *   content-type.  It should NOT return a null value.
     */
    public Closure getAt(Object contentType) {
        String ct = contentType.toString();
        int idx = ct.indexOf(';');
        if (idx > 0)
            ct = ct.substring(0, idx);

        Closure parser = registeredParsers.get(ct);
        if (parser != null)
            return parser;

        log.warn("Cannot find parser for content-type: " + ct + " -- using default parser.");
        return defaultParser;
    }

    /**
     * Register a new parser for the given content-type.  The parser closure
     * should accept an {@link HttpResponse} argument and return a type suitable
     * to be passed as the 'parsed data' argument of a
     * {@link RequestConfigDelegate#getResponse() response handler} closure.
     * @param contentType  <code>content-type</code> string
     * @param value code that will parse the HttpResponse and return parsed
     *   data to the response handler.
     */
    public void putAt(Object contentType, Closure value) {
        if (contentType instanceof ContentType) {
            for (String ct : ((ContentType) contentType).getContentTypeStrings())
                this.registeredParsers.put(ct, value);
        } else
            this.registeredParsers.put(contentType.toString(), value);
    }

    /**
     * Alias for {@link #getAt(Object)} to allow property-style access.
     * @param key content-type string
     * @return
     */
    public Closure propertyMissing(Object key) {
        return this.getAt(key);
    }

    /**
     * Alias for {@link #putAt(Object, Closure)} to allow property-style access.
     * @param key content-type string
     * @param value parser closure
     */
    public void propertyMissing(Object key, Closure value) {
        this.putAt(key, value);
    }

    /**
     * Iterate over the entire parser map
     * @return
     */
    public Iterator<Map.Entry<String, Closure>> iterator() {
        return this.registeredParsers.entrySet().iterator();
    }
}