org.zaproxy.zap.spider.URLCanonicalizer.java Source code

Java tutorial

Introduction

Here is the source code for org.zaproxy.zap.spider.URLCanonicalizer.java

Source

/*
 * Zed Attack Proxy (ZAP) and its related class files.
 * 
 * ZAP is an HTTP/HTTPS proxy for assessing web application security.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0 
 *   
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and 
 * limitations under the License. 
 * 
 * ZAP: Based on work by Yasser Ganjisaffar <lastname at gmail dot com> 
 * from project http://code.google.com/p/crawler4j/
 */

package org.zaproxy.zap.spider;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.log4j.Logger;
import org.zaproxy.zap.spider.SpiderParam.HandleParametersOption;

/**
 * The URLCanonicalizer is used for the process of converting a URL into a canonical (normalized) form. See
 * <a href="http://en.wikipedia.org/wiki/URL_normalization">URL Normalization</a> for a reference. <br/>
 * <br/>
 * 
 * Note: some parts of the code are adapted from: <a
 * href="http://stackoverflow.com/a/4057470/405418">stackoverflow</a>
 * 
 * Added support for OData URLs
 */
public final class URLCanonicalizer {

    /** Logger used to report the resolved URLs and the problems found while canonicalizing them. */
    private static final Logger log = Logger.getLogger(URLCanonicalizer.class);

    /**
     * The names (in lowercase) of the query parameters that are ignored when building the canonical URL and
     * the cleaned URI representation. The set is filled once, in the static initializer below.
     */
    private static final Set<String> IRRELEVANT_PARAMETERS = new HashSet<>(3);
    static {
        IRRELEVANT_PARAMETERS.add("jsessionid");
        IRRELEVANT_PARAMETERS.add("phpsessid");
        IRRELEVANT_PARAMETERS.add("aspsessionid");
    }

    /**
     * OData support: matches a path segment holding a single, unnamed resource identifier, e.g.
     * {@code /Products(1)} or {@code /Products('abc')}.
     * <ul>
     * <li>Group 1 is the resource name (word characters and {@code %});</li>
     * <li>Group 2 is the identifier (word characters and single quotes, so any surrounding quotes are part
     * of the captured value).</li>
     * </ul>
     */
    private static final Pattern patternResourceIdentifierUnquoted = Pattern.compile("/([\\w%]*)\\(([\\w']*)\\)");

    /**
     * OData support: detects a path segment containing a composite identifier, e.g.
     * {@code /Resource(field1=a,field2=3)}. Group 1 is everything found between the parentheses.
     * <p>
     * NOTE(review): the {@code .*} is greedy, so when the path has several parenthesised segments group 1
     * spans from the first opening parenthesis to the last closing one - confirm that this is intended.
     * </p>
     */
    private static final Pattern patternResourceMultipleIdentifier = Pattern.compile("/[\\w%]*\\((.*)\\)");

    /**
     * OData support: extracts each {@code name=value} pair of a composite identifier. Group 1 is the name
     * (word characters and {@code %}), group 2 is the value (word characters and single quotes).
     */
    private static final Pattern patternResourceMultipleIdentifierDetail = Pattern.compile("([\\w%]*)=([\\w']*)");

    /**
     * Private constructor: the class only exposes static members and is not meant to be instantiated.
     */
    private URLCanonicalizer() {
    }

    /**
     * Gets the canonical form of a URL that is not tied to any context, i.e. it is processed without a
     * base URL.
     * 
     * @param url the url to canonicalize
     * @return the canonical url, as returned by {@link #getCanonicalURL(String, String)} when called with a
     *         {@code null} base URL
     */
    public static String getCanonicalURL(String url) {
        final String noBaseURL = null;
        return getCanonicalURL(url, noBaseURL);
    }

    /**
     * Gets the canonical url, starting from a relative or absolute url found in a given context (baseURL).
     * <p>
     * The reference is resolved against the base URL and then normalized: the path is cleaned (dot segments
     * and empty segments removed), the query parameters are sorted by name (the irrelevant ones are
     * dropped), the scheme and the host are lowercased and the port is dropped when it is the default port
     * of the scheme. Only the scheme, host, port, path and query are kept in the result.
     * </p>
     * 
     * @param url the url string defining the reference
     * @param baseURL the context in which this url was found, or {@code null} if there is none
     * @return the canonical url, or {@code null} if the URI has no authority or if it could not be
     *         processed (the failure is logged)
     */
    public static String getCanonicalURL(String url, String baseURL) {

        try {
            /* Build the absolute URL, from the url and the baseURL */
            String resolvedURL = URLResolver.resolveUrl(baseURL == null ? "" : baseURL, url);
            log.debug("Resolved URL: " + resolvedURL);
            URI canonicalURI;
            try {
                canonicalURI = new URI(resolvedURL);
            } catch (Exception e) {
                /* The resolved URL could not be parsed as is: retry with its encoded form. */
                canonicalURI = new URI(URIUtil.encodeQuery(resolvedURL));
            }

            /* Some checking. */
            if (canonicalURI.getScheme() == null) {
                throw new MalformedURLException("Protocol could not be reliably evaluated from uri: " + canonicalURI
                        + " and base url: " + baseURL);
            }

            if (canonicalURI.getRawAuthority() == null) {
                log.debug("Ignoring URI with no authority (host[\":\"port]): " + canonicalURI);
                return null;
            }

            if (canonicalURI.getHost() == null) {
                throw new MalformedURLException("Host could not be reliably evaluated from: " + canonicalURI);
            }

            /*
             * Normalize: no empty segments (i.e., "//"), no segments equal to ".", and no segments equal to
             * ".." that are preceded by a segment not equal to "..".
             */
            String path = canonicalURI.normalize().getRawPath();

            /* Convert '//' -> '/' (repeated, as one pass over "///" still leaves "//") */
            while (path.indexOf("//") >= 0) {
                path = path.replace("//", "/");
            }

            /* Drop starting '/../' */
            while (path.startsWith("/../")) {
                path = path.substring(3);
            }

            /* A path made only of '/..' cannot go above the root either */
            if ("/..".equals(path)) {
                path = "/";
            }

            /* Trim */
            path = path.trim();

            /* Add starting slash if needed */
            if (path.isEmpty()) {
                path = "/";
            }

            /* Process parameters and sort them. */
            final SortedMap<String, String> params = createParameterMap(canonicalURI.getRawQuery());
            final String canonicalParams = canonicalize(params);
            final String queryString = canonicalParams.isEmpty() ? "" : "?" + canonicalParams;

            /* Lowercasing protocol and host, independently of the default locale of the JVM */
            String protocol = canonicalURI.getScheme().toLowerCase(Locale.ROOT);
            String host = canonicalURI.getHost().toLowerCase(Locale.ROOT);

            /*
             * Drop default port: http://example.com:80 -> http://example.com. The port is kept if it is not
             * the default one of the scheme (e.g. https://example.com:80), as dropping it would lead to a
             * different server.
             */
            int port = canonicalURI.getPort();
            if (isDefaultPort(protocol, port)) {
                port = -1;
            }

            String pathAndQueryString = normalizePath(path) + queryString;

            URL result = new URL(protocol, host, port, pathAndQueryString);
            return result.toExternalForm();

        } catch (Exception ex) {
            log.warn("Error while Processing URL in the spidering process (on base " + baseURL + "): "
                    + ex.getMessage());
            return null;
        }
    }

    /**
     * Tells whether or not the given port is the default port of the given scheme.
     * 
     * @param scheme the (lowercased) scheme of the URI
     * @param port the port of the URI
     * @return {@code true} if the scheme is {@code http} and the port is 80 or if the scheme is
     *         {@code https} and the port is 443, {@code false} otherwise
     */
    private static boolean isDefaultPort(String scheme, int port) {
        return ("http".equals(scheme) && port == 80) || ("https".equals(scheme) && port == 443);
    }

    /**
     * Builds a String representation of the URI with cleaned parameters, that can be used when checking if
     * a URI was already visited. The URI provided as a parameter should be already cleaned and
     * canonicalized, so it should be built with a result from {@link #getCanonicalURL(String)}.
     * 
     * <p>
     * When building the URI representation, the same format should be used for all the cases, as it may
     * affect the number of times the pages are visited and reported if the option HandleParametersOption is
     * changed while the spider is running.
     * </p>
     * 
     * <p>
     * The representation depends on the option:
     * </p>
     * <ul>
     * <li>{@code USE_ALL} - the default string representation of the URI;</li>
     * <li>{@code IGNORE_COMPLETELY} - scheme, host, port and (cleaned) path, without the query;</li>
     * <li>{@code IGNORE_VALUE} - same as above, followed by the names of the relevant parameters.</li>
     * </ul>
     * 
     * @param uri the uri
     * @param handleParameters the handle parameters option
     * @param handleODataParametersVisited Should we handle specific OData parameters
     * @return the string representation of the URI
     * @throws URIException the URI exception
     */
    public static String buildCleanedParametersURIRepresentation(org.apache.commons.httpclient.URI uri,
            SpiderParam.HandleParametersOption handleParameters, boolean handleODataParametersVisited)
            throws URIException {
        // All the information is used: the default string representation is enough
        if (handleParameters.equals(HandleParametersOption.USE_ALL)) {
            return uri.toString();
        }

        final boolean queryIgnored = handleParameters.equals(HandleParametersOption.IGNORE_COMPLETELY);
        final boolean valuesIgnored = handleParameters.equals(HandleParametersOption.IGNORE_VALUE);
        if (!queryIgnored && !valuesIgnored) {
            // Should not be reached
            return uri.toString();
        }

        final String withoutQuery = createBaseUriWithCleanedPath(uri, handleParameters, handleODataParametersVisited);
        if (queryIgnored) {
            // The query is left out completely
            return withoutQuery;
        }

        // Only the values are ignored: the names of the parameters are still part of the representation
        final String parameterNames = getCleanedQuery(uri.getEscapedQuery());
        if (parameterNames.isEmpty()) {
            return withoutQuery;
        }
        return withoutQuery + '?' + parameterNames;
    }

    /**
     * Builds the representation of the URI up to (and including) its path, which is cleaned as requested.
     * 
     * @param uri the uri
     * @param handleParameters the handle parameters option, used when cleaning the path
     * @param handleODataParametersVisited whether or not the OData identifiers of the path are handled
     * @return the scheme, host and port (if set) of the URI, followed by the cleaned path
     * @throws URIException if thrown while reading the URI
     */
    private static String createBaseUriWithCleanedPath(org.apache.commons.httpclient.URI uri,
            HandleParametersOption handleParameters, boolean handleODataParametersVisited) throws URIException {
        final String baseUri = createBaseUri(uri);
        return baseUri + getCleanedPath(uri.getEscapedPath(), handleParameters, handleODataParametersVisited);
    }

    /**
     * Builds the {@code scheme://host[:port]} part of the given URI. The port is added only if it is set,
     * that is, if it is not {@code -1}.
     * 
     * @param uri the uri
     * @return the scheme, host and, if set, port of the URI
     * @throws URIException if thrown while reading the URI
     */
    private static String createBaseUri(org.apache.commons.httpclient.URI uri) throws URIException {
        final String schemeAndHost = uri.getScheme() + "://" + uri.getHost();
        final int port = uri.getPort();
        if (port == -1) {
            return schemeAndHost;
        }
        return schemeAndHost + ':' + port;
    }

    /**
     * Gets the path to use in the URI representation.
     * 
     * @param escapedPath the escaped path of the URI, might be {@code null}
     * @param handleParameters the handle parameters option, used when cleaning an OData path
     * @param handleODataParametersVisited whether or not the OData identifiers of the path are handled
     * @return an empty string if there is no path, the path cleaned of its OData identifiers if they should
     *         be handled, the unchanged path otherwise
     */
    private static String getCleanedPath(String escapedPath, HandleParametersOption handleParameters,
            boolean handleODataParametersVisited) {
        if (escapedPath == null) {
            return "";
        }
        return handleODataParametersVisited ? cleanODataPath(escapedPath, handleParameters) : escapedPath;
    }

    /**
     * Builds a query made only of the names of the relevant parameters, sorted and separated by
     * {@code &}.
     * <p>
     * The irrelevant parameters (session identifiers and {@code utm_*} parameters) are detected ignoring
     * the case of the name, the same way it is done when the query of the canonical URL is built.
     * </p>
     * 
     * @param escapedQuery the escaped query of the URI, might be {@code null} or empty
     * @return the names of the relevant parameters, or an empty string if there are none
     */
    private static String getCleanedQuery(String escapedQuery) {
        // Get the parameters' names
        SortedMap<String, String> params = createParameterMap(escapedQuery);
        if (params == null || params.isEmpty()) {
            return "";
        }

        StringBuilder cleanedQueryBuilder = new StringBuilder();
        for (String key : params.keySet()) {
            // Ignore irrelevant parameters, whatever the case used in their names
            String lowerCaseKey = key.toLowerCase(Locale.ROOT);
            if (IRRELEVANT_PARAMETERS.contains(lowerCaseKey) || lowerCaseKey.startsWith("utm_")) {
                continue;
            }
            if (cleanedQueryBuilder.length() > 0) {
                cleanedQueryBuilder.append('&');
            }
            // The name is added as found in the query
            cleanedQueryBuilder.append(key);
        }

        return cleanedQueryBuilder.toString();
    }

    /**
     * Clean the path in the case of an OData Uri containing a resource identifier (simple or multiple).
     * <p>
     * Only the first resource identifier found in the path is cleaned:
     * </p>
     * <ul>
     * <li>a simple identifier, e.g. {@code /Resource(1)}, is replaced with {@code /Resource()};</li>
     * <li>a composite identifier, e.g. {@code /Resource(field1=a,field2=3)}, is replaced with
     * {@code /Resource()} if the parameters are ignored completely, otherwise with the names of its fields,
     * {@code /Resource(field1,field2)}.</li>
     * </ul>
     * <p>
     * The section to replace is located with the offsets of the match, not by searching for the matched
     * text, so that an identical text found earlier in the path is never replaced by mistake.
     * </p>
     * 
     * @param path The path to clean
     * @param handleParameters The cleaning mode
     * @return A cleaned path, or the path unchanged if the mode is {@code USE_ALL} or if no resource
     *         identifier was found
     */
    private static String cleanODataPath(String path, HandleParametersOption handleParameters) {
        if (HandleParametersOption.USE_ALL.equals(handleParameters)) {
            return path;
        }

        // check for single ID (unnamed)
        Matcher matcher = patternResourceIdentifierUnquoted.matcher(path);
        if (matcher.find()) {
            if (!HandleParametersOption.IGNORE_COMPLETELY.equals(handleParameters)
                    && !HandleParametersOption.IGNORE_VALUE.equals(handleParameters)) {
                return path;
            }

            // Keep everything up to the end of the resource name, drop the ID and keep what follows the
            // closing parenthesis
            StringBuilder sb = new StringBuilder(path.substring(0, matcher.end(1)));
            sb.append("()").append(path.substring(matcher.end()));
            return sb.toString();
        }

        matcher = patternResourceMultipleIdentifier.matcher(path);
        if (!matcher.find()) {
            return path;
        }

        // We've found a composite identifier. i.e: /Resource(field1=a,field2=3)
        String multipleIdentifierSection = matcher.group(1);
        String beforeSection = path.substring(0, matcher.start(1));
        String afterSection = path.substring(matcher.end(1));

        if (HandleParametersOption.IGNORE_COMPLETELY.equals(handleParameters)) {
            return beforeSection + afterSection;
        }

        // Keep only the names of the fields, separated by commas
        StringBuilder sb = new StringBuilder(beforeSection);
        Matcher detailMatcher = patternResourceMultipleIdentifierDetail.matcher(multipleIdentifierSection);
        boolean firstField = true;
        while (detailMatcher.find()) {
            if (!firstField) {
                sb.append(',');
            }
            sb.append(detailMatcher.group(1));
            firstField = false;
        }
        sb.append(afterSection);

        return sb.toString();
    }

    /**
     * Splits a query string into its name-value pairs and stores them in a SortedMap, ordered by the
     * lexicographical order of the names.
     * <p>
     * The pairs are separated by {@code &} and empty pairs are skipped. The name is the text found before
     * the first {@code =} of the pair and the value is the text found after it; a pair without {@code =}
     * is a name with an empty value. If a name is present more than once the last value is kept.
     * </p>
     * 
     * @param queryString the query string
     * @return the parameters, or {@code null} if there is no query string (it is {@code null} or empty).
     */
    private static SortedMap<String, String> createParameterMap(final String queryString) {
        if (queryString == null || queryString.isEmpty()) {
            return null;
        }

        final SortedMap<String, String> sortedParams = new TreeMap<>();
        for (final String nameValue : queryString.split("&")) {
            if (nameValue.isEmpty()) {
                continue;
            }

            final int separatorIdx = nameValue.indexOf('=');
            if (separatorIdx < 0) {
                // Just the name, no value
                sortedParams.put(nameValue, "");
            } else {
                sortedParams.put(nameValue.substring(0, separatorIdx), nameValue.substring(separatorIdx + 1));
            }
        }
        return sortedParams;
    }

    /**
     * Canonicalize the query string.
     * <p>
     * The parameters are written in the order of the map, separated by {@code &}. The irrelevant
     * parameters (session identifiers and {@code utm_*} parameters, whatever the case of their names) are
     * left out and a parameter with an empty value is written with just its name.
     * </p>
     * 
     * @param sortedParamMap Parameter name-value pairs in lexicographical order, might be {@code null}.
     * @return Canonical form of query string, an empty string if there are no parameters.
     */
    private static String canonicalize(final SortedMap<String, String> sortedParamMap) {
        if (sortedParamMap == null || sortedParamMap.isEmpty()) {
            return "";
        }

        final StringBuilder sb = new StringBuilder(100);
        for (Map.Entry<String, String> pair : sortedParamMap.entrySet()) {
            // Lowercased independently of the default locale, otherwise names like "JSESSIONID" would not
            // match the (lowercase) irrelevant names with, for example, a Turkish locale
            final String key = pair.getKey().toLowerCase(Locale.ROOT);
            // Ignore irrelevant parameters
            if (IRRELEVANT_PARAMETERS.contains(key) || key.startsWith("utm_")) {
                continue;
            }
            if (sb.length() > 0) {
                sb.append('&');
            }
            // The name is written as found in the query, not lowercased
            sb.append(pair.getKey());
            if (!pair.getValue().isEmpty()) {
                sb.append('=');
                sb.append(pair.getValue());
            }
        }
        return sb.toString();
    }

    /**
     * Normalize path: the escaped tilde is replaced with {@code ~}, as the tilde does not need to be
     * escaped, and the spaces are replaced with {@code %20}.
     * <p>
     * The hexadecimal digits of an escape sequence are case-insensitive, so both {@code %7E} and
     * {@code %7e} are replaced.
     * </p>
     * 
     * @param path the path
     * @return the normalized path
     */
    private static String normalizePath(final String path) {
        return path.replace("%7E", "~").replace("%7e", "~").replace(" ", "%20");
    }

}