org.archive.wayback.util.url.UrlOperations.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.wayback.util.url.UrlOperations.java

Source

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.util.url;

import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURI;
import org.archive.url.UsableURIFactory;
import org.archive.wayback.archivalurl.ArchivalUrl;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.webapp.AccessPoint;

/**
 * Static helper methods for working with URL strings: resolving relative
 * URLs, extracting the scheme, host, userinfo and path components, and
 * normalizing malformed schemes and default ports.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class UrlOperations {
    /** Logger used to report URLs that fail to parse. */
    private static final Logger LOGGER = Logger.getLogger(UrlOperations.class.getName());

    /**
     * ARC/WARC specific DNS resolution record prefix. Unlike the other
     * scheme constants it has no "//" and it is not part of
     * {@link #ALL_SCHEMES}.
     */
    public final static String DNS_SCHEME = "dns:";
    /**
     * HTTP scheme prefix, including the trailing "://".
     */
    public final static String HTTP_SCHEME = "http://";
    /**
     * HTTPS scheme prefix, including the trailing "://".
     */
    public final static String HTTPS_SCHEME = "https://";
    /**
     * FTP scheme prefix, including the trailing "://".
     */
    public final static String FTP_SCHEME = "ftp://";
    /**
     * MMS scheme prefix, including the trailing "://".
     */
    public final static String MMS_SCHEME = "mms://";
    /**
     * RTSP scheme prefix, including the trailing "://".
     */
    public final static String RTSP_SCHEME = "rtsp://";

    /**
     * Default scheme to assume if unspecified. No context implied...
     */
    public final static String DEFAULT_SCHEME = HTTP_SCHEME;

    /**
     * WAIS scheme prefix, including the trailing "://".
     */
    public final static String WAIS_SCHEME = "wais://";

    /**
     * Array of all "known" scheme prefixes. Every entry is lower-case and
     * ends with "://". Note that {@link #DNS_SCHEME} is deliberately absent.
     */
    public final static String ALL_SCHEMES[] = { HTTP_SCHEME, HTTPS_SCHEME, FTP_SCHEME, MMS_SCHEME, RTSP_SCHEME,
            WAIS_SCHEME };

    /**
     * Character separating host from port within a URL authority.
     */
    public final static char PORT_SEPARATOR = ':';
    /**
     * Character which delimits the path from the authority in URLs that
     * have a path.
     */
    public final static char PATH_START = '/';

    /**
     * Regex alternation of the top-level domains accepted by
     * {@link #AUTHORITY_REGEX}. This is a hard-coded list: a TLD that is not
     * listed here is not recognized.
     */
    private static final String ALL_TLDS = "ac|ad|ae|aero|af|ag|ai|al|am|an"
            + "|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi"
            + "|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci"
            + "|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec"
            + "|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh"
            + "|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id"
            + "|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh"
            + "|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc"
            + "|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum"
            + "|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz"
            + "|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro"
            + "|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv"
            + "|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv"
            + "|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xn--0zwm56d"
            + "|xn--11b5bs3a9aj6g|xn--80akhbyknj4f|xn--9t4b11yi5a|xn--deba0ad"
            + "|xn--g6w251d|xn--hgbk6aj7f53bba|xn--hlcj6aya9esc7a|xn--jxalpdlp"
            + "|xn--kgbechtv|xn--mgbaam7a8h|xn--mgberp4a5d4ar|xn--p1ai" + "|xn--wgbh1c|xn--zckzah|ye|yt|za|zm|zw";

    /**
     * Four dot-separated runs of decimal digits. The individual numbers are
     * not range-checked, so "999.1.1.1" also matches.
     */
    private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+";

    /**
     * Matches either a name built from the characters 0-9, a-z, '_', '.', '-'
     * followed by a dot and one of {@link #ALL_TLDS}, or an
     * {@link #IP_PATTERN} address. Only lower-case letters are accepted.
     */
    private static final Pattern AUTHORITY_REGEX = Pattern
            .compile("(([0-9a-z_.-]+)\\.(" + ALL_TLDS + "))|" + "(" + IP_PATTERN + ")");

    //    private static final Pattern AUTHORITY_REGEX_SIMPLE =
    //        Pattern.compile("([0-9a-z_.-]++)");
    /**
     * Skips an optional "userinfo@" prefix and captures the following run of
     * host characters in group 1. Only lower-case letters are accepted, so
     * input is expected to be lower-cased before matching.
     */
    private static final Pattern HOST_REGEX_SIMPLE = Pattern.compile("(?:[0-9a-z_.:-]+@)?([0-9a-z_.-]++)");
    /**
     * Anchored at the start of the input: captures the userinfo (group 1)
     * that precedes an '@' followed by at least one host character. Only
     * lower-case letters are accepted.
     */
    private static final Pattern USERINFO_REGEX_SIMPLE = Pattern.compile("^([0-9a-z_.:-]+)(?:@[0-9a-z_.-]++)");

    /**
     * Tests whether the argument looks like a legitimate authority component
     * of a URL: either a dotted-quad IP address, or a name made only of the
     * characters {@code 0-9 a-z _ . -} that ends with a dot and a known TLD.
     * <p>
     * The whole string must match and matching is case-sensitive: upper-case
     * input is rejected, so callers should lower-case the value first.
     * </p>
     *
     * @param authString String representation of a fragment of a URL; may be
     *        {@code null}
     * @return {@code true} if authString might be an authority, {@code false}
     *         otherwise (including when authString is {@code null}).
     */
    public static boolean isAuthority(String authString) {
        if (authString == null) {
            return false;
        }
        // Pattern.matcher() never returns null, so the Matcher needs no null check.
        return AUTHORITY_REGEX.matcher(authString).matches();
    }

    /**
     * Resolves {@code url} against {@code baseUrl}, falling back to a
     * minimally escaped copy of {@code url} when resolution fails.
     *
     * @param baseUrl the base URL against which the url should be resolved
     * @param url the URL, possibly relative, to make absolute.
     * @return url resolved against baseUrl (unless it is absolute already)
     * and escaped as a UsableURI would escape it. If resolution fails, the
     * original url with every space replaced by "%20" and every carriage
     * return replaced by "%0D".
     */
    public static String resolveUrl(String baseUrl, String url) {
        final String resolved = resolveUrl(baseUrl, url, null);
        if (resolved != null) {
            return resolved;
        }
        // Resolution failed: hand back the input with only the two escapes applied.
        return url.replace(" ", "%20").replace("\r", "%0D");
    }

    /**
     * Resolves a possibly relative url argument against a base URL.
     * <p>
     * A url that starts (case-sensitively) with one of {@link #ALL_SCHEMES}
     * is treated as absolute and parsed on its own; baseUrl is not consulted
     * in that case. Any other url is resolved relative to baseUrl.
     * </p>
     *
     * @param baseUrl the base URL against which the url should be resolved
     * @param url the URL, possibly relative, to make absolute.
     * @param defaultValue the value to return if the supplied values can't
     * be resolved.
     * @return the escaped form of the resolved URI, or defaultValue if
     * either URL could not be parsed (the failure is logged as a warning).
     */
    public static String resolveUrl(String baseUrl, String url, String defaultValue) {
        boolean startsWithKnownScheme = false;
        for (int i = 0; i < ALL_SCHEMES.length; i++) {
            if (url.startsWith(ALL_SCHEMES[i])) {
                startsWithKnownScheme = true;
                break;
            }
        }

        try {
            final UsableURI resolved;
            if (startsWithKnownScheme) {
                // already absolute: parse it directly
                resolved = UsableURIFactory.getInstance(url);
            } else {
                final UsableURI base = UsableURIFactory.getInstance(baseUrl);
                resolved = UsableURIFactory.getInstance(base, url);
            }
            return resolved.getEscapedURI();
        } catch (URIException e) {
            LOGGER.warning(e.getLocalizedMessage() + ": " + url);
            return defaultValue;
        }
    }

    /**
     * Finds which of the known schemes ({@link #ALL_SCHEMES}) a URL begins
     * with. The comparison is case-sensitive, so "HTTP://..." is not
     * recognized.
     *
     * @param url URL String to parse for a scheme.
     * @return the matching scheme, including its trailing "://", or null if
     * the URL starts with none of the known schemes.
     */
    public static String urlToScheme(final String url) {
        String found = null;
        for (int i = 0; i < ALL_SCHEMES.length && found == null; i++) {
            if (url.startsWith(ALL_SCHEMES[i])) {
                found = ALL_SCHEMES[i];
            }
        }
        return found;
    }

    /**
     * Looks up the default port of a scheme.
     *
     * @param scheme String scheme, including '://', as in, "http://", "ftp://"
     * @return 80 for http, 443 for https, 21 for ftp, 554 for rtsp, 1755 for
     * mms, or -1 if the scheme is none of these.
     */
    public static int schemeToDefaultPort(final String scheme) {
        // parallel tables: schemesWithPort[i] uses defaultPorts[i]
        final String[] schemesWithPort = { HTTP_SCHEME, HTTPS_SCHEME, FTP_SCHEME, RTSP_SCHEME, MMS_SCHEME };
        final int[] defaultPorts = { 80, 443, 21, 554, 1755 };

        for (int i = 0; i < schemesWithPort.length; i++) {
            if (scheme.equals(schemesWithPort[i])) {
                return defaultPorts[i];
            }
        }
        return -1;
    }

    /**
     * Extracts everything from the first '/' after the scheme onwards, i.e.
     * the path together with whatever follows it in the string.
     *
     * @param url the URL which may contain a path; a leading known scheme is
     * stripped before searching.
     * @return the substring starting at the first '/' following the scheme,
     * or "/" if there is no such slash.
     */
    public static String getURLPath(String url) {
        final String schemeless = stripURLScheme(url);
        final int firstSlash = schemeless.indexOf(PATH_START);
        return (firstSlash < 0) ? "/" : schemeless.substring(firstSlash);
    }

    /**
     * Removes a leading known scheme ({@link #ALL_SCHEMES}) from a URL. The
     * scheme is matched case-insensitively; the remainder of the URL keeps
     * its original case.
     *
     * @param url the URL which may start with a scheme such as "http://"
     * @return the URL without its leading scheme, or the unchanged url if it
     * does not start with a known scheme.
     */
    public static String stripURLScheme(String url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default
        // locale; under e.g. a Turkish locale "WAIS://" would otherwise become
        // "wa\u0131s://" and fail to match.
        final String lcUrl = url.toLowerCase(Locale.ROOT);
        for (String scheme : ALL_SCHEMES) {
            if (lcUrl.startsWith(scheme)) {
                return url.substring(scheme.length());
            }
        }
        return url;
    }

    /**
     * Attempts to strip the scheme's default port out of a URL string.
     * <p>
     * The port is removed only when it sits at the end of the authority:
     * either the URL has no '/' after the scheme and ends with
     * ":&lt;port&gt;", or ":&lt;port&gt;" immediately precedes the first '/'
     * after the scheme. A ":&lt;port&gt;/" sequence appearing later in the
     * path is left alone.
     * </p>
     *
     * @param url the original URL possibly including a port
     * @return the URL sans port, if the scheme was recognized and the default
     * port was supplied, otherwise, the original URL.
     */
    public static String stripDefaultPortFromUrl(String url) {
        String scheme = urlToScheme(url);
        if (scheme == null) {
            return url;
        }
        int defaultPort = schemeToDefaultPort(scheme);
        if (defaultPort == -1) {
            return url;
        }

        // Plain concatenation rather than String.format("%d"): the formatter
        // renders digits using the default locale, which would produce
        // non-ASCII numerals (and never match) under some locales.
        final String portSuffix = ":" + defaultPort;

        // is there a slash after the scheme?
        int slashIdx = url.indexOf(PATH_START, scheme.length());
        if (slashIdx == -1) {
            // no path at all: the port, if present, is the tail of the URL
            if (url.endsWith(portSuffix)) {
                return url.substring(0, url.length() - portSuffix.length());
            }
            return url;
        }

        // Strip only if the port directly precedes the first slash after the
        // scheme; startsWith() returns false for a negative offset.
        int portIdx = slashIdx - portSuffix.length();
        if (url.startsWith(portSuffix, portIdx)) {
            return url.substring(0, portIdx) + url.substring(slashIdx);
        }
        return url;
    }

    /**
     * Repairs an HTTP URL whose scheme was written with a single slash
     * ("http:/host" instead of "http://host"), as some browsers produce.
     *
     * @param orig String containing a URL, possibly beginning with "http:/".
     * @return orig itself if it begins with "http://" or does not begin with
     * "http:/" at all; otherwise a new String with the missing slash
     * inserted.
     * @see #fixupScheme
     */
    public static String fixupHTTPUrlWithOneSlash(String orig) {
        final String singleSlashPrefix = "http:/";
        final boolean missingSlash = orig.startsWith(singleSlashPrefix) && !orig.startsWith(HTTP_SCHEME);
        if (!missingSlash) {
            return orig;
        }
        // swap the one-slash prefix for the proper two-slash scheme
        return HTTP_SCHEME + orig.substring(singleSlashPrefix.length());
    }

    /**
     * Fixes up a malformed scheme part.
     * <p>Currently supports inserting the missing second slash for the
     * protocols {@code http}, {@code https}, {@code ftp}, {@code rtsp} and
     * {@code mms}; for example {@code http:/} becomes {@code http://}.
     * Matching is case-sensitive.</p>
     * @param url URL to be checked and fixed
     * @return new String, or {@code url} if no fix is required.
     * @since 1.8.1
     */
    public static String fixupScheme(String url) {
        final String[] singleSlashPrefixes = { "http:/", "https:/", "ftp:/", "rtsp:/", "mms:/" };
        for (String prefix : singleSlashPrefixes) {
            if (!url.startsWith(prefix)) {
                continue;
            }
            // A slash right after the prefix means the scheme is already well-formed.
            // startsWith() is false when the offset equals the URL length.
            if (url.startsWith("/", prefix.length())) {
                continue;
            }
            return prefix + "/" + url.substring(prefix.length());
        }
        return url;
    }

    /**
     * Attempts to extract the hostname component of an absolute URL
     * argument. The URL is lower-cased before matching, so a host that is
     * found is returned in lower case.
     * <p>
     * For a {@code dns:} record everything after the "dns:" prefix is
     * returned. For the schemes in {@link #ALL_SCHEMES}, an optional
     * "userinfo@" prefix is skipped and the following run of host characters
     * is returned.
     * </p>
     *
     * @param url the url String from which to extract the hostname
     * @return the lower-cased hostname within the URL, or the unmodified url
     * argument if the host cannot be found.
     */
    public static String urlToHost(String url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default
        // locale; under e.g. a Turkish locale 'I' would become a dotless i,
        // which the host pattern does not accept.
        final String lcUrl = url.toLowerCase(Locale.ROOT);
        if (lcUrl.startsWith(DNS_SCHEME)) {
            return lcUrl.substring(DNS_SCHEME.length());
        }
        for (String scheme : ALL_SCHEMES) {
            if (lcUrl.startsWith(scheme)) {
                int authorityIdx = scheme.length();

                Matcher m = HOST_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx));
                if (m.find()) {
                    return m.group(1);
                }
            }
        }
        return url;
    }

    /**
     * Extracts userinfo from the absolute URL argument, that is, "username@",
     * or "username:password@" if present. The URL is lower-cased before
     * matching, so the userinfo is returned in lower case.
     *
     * @param url the URL from which to extract the userinfo
     * @return the lower-cased userinfo found, not including the "@", or null
     * if no userinfo is found, the URL is a {@code dns:} record, or the URL
     * does not start with one of {@link #ALL_SCHEMES}.
     */
    public static String urlToUserInfo(String url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default
        // locale, so scheme and userinfo matching behave the same everywhere.
        final String lcUrl = url.toLowerCase(Locale.ROOT);
        if (lcUrl.startsWith(DNS_SCHEME)) {
            return null;
        }
        for (String scheme : ALL_SCHEMES) {
            if (lcUrl.startsWith(scheme)) {
                int authorityIdx = scheme.length();

                Matcher m = USERINFO_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx));
                if (m.find()) {
                    return m.group(1);
                }
            }
        }
        return null;
    }

    /**
     * Finds the parent directory of the URL argument. The result keeps the
     * trailing '/' of the parent path and has its query removed.
     *
     * @param url to find the parent directory of
     * @return parent directory of URL, or null, if either the url argument is
     * invalid (the failure is logged as a warning), or if the url is the
     * root of the authority.
     */
    public static String getUrlParentDir(String url) {
        try {
            final UsableURI parsed = UsableURIFactory.getInstance(url);
            final String fullPath = parsed.getPath();
            final int pathLen = fullPath.length();
            if (pathLen <= 1) {
                // root of the authority: there is no parent
                return null;
            }

            // ignore a trailing slash so "/a/b/" and "/a/b" share the parent "/a/"
            final boolean endsWithSlash = fullPath.charAt(pathLen - 1) == '/';
            final int searchFrom = endsWithSlash ? pathLen - 2 : pathLen - 1;

            final int lastSlash = fullPath.lastIndexOf('/', searchFrom);
            if (lastSlash < 0) {
                return null;
            }

            parsed.setPath(fullPath.substring(0, lastSlash + 1));
            parsed.setQuery(null);
            return parsed.toString();
        } catch (URIException e) {
            LOGGER.warning(e.getLocalizedMessage() + ": " + url);
            return null;
        }
    }

    /**
     * Builds the replay Archival-URL for the same capture as request
     * {@code wbRequest}, with the identity-context ({@code id_}) flag on.
     * <p>
     * The request's identity-context flag is switched on only while the URL
     * is being built and is always restored to its previous value, even if
     * building the URL throws.
     * </p>
     * <p>
     * REFACTOR: move this method to {@link ArchivalUrl}.
     * </p>
     * @param wbRequest requested capture and URL scheme info.
     * @return the access point's replay prefix followed by the Archival-URL
     * string
     */
    public static String computeIdentityUrl(WaybackRequest wbRequest) {
        AccessPoint accessPoint = wbRequest.getAccessPoint();

        final boolean origIdentity = wbRequest.isIdentityContext();
        wbRequest.setIdentityContext(true);
        try {
            String archivalPath = new ArchivalUrl(wbRequest).toString();
            return accessPoint.getReplayPrefix() + archivalPath;
        } finally {
            // restore the caller's flag no matter how we leave this block
            wbRequest.setIdentityContext(origIdentity);
        }
    }
}