org.archive.url.LaxURLCodec.java Source code

Introduction

Here is the source code for org.archive.url.LaxURLCodec.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.url;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;
import java.util.BitSet;

import org.apache.commons.codec.net.URLCodec;

import com.google.common.base.Charsets;

/**
 * @author gojomo
 */
public class LaxURLCodec extends URLCodec {
    public static LaxURLCodec DEFAULT = new LaxURLCodec("UTF-8");

    // passthrough constructor
    public LaxURLCodec(String encoding) {
        super(encoding);
    }

    /**
     * Decodes an array of URL safe 7-bit characters into an array of 
     * original bytes. Escaped characters are converted back to their 
     * original representation.
     * 
     * Differs from URLCodec.decodeUrl() in that it throws no 
     * exceptions; bad or incomplete escape sequences are ignored
     * and passed into result undecoded. This matches the behavior
     * of browsers, which will use inconsistently-encoded URIs
     * in HTTP request-lines. 
     *
     * @param bytes array of URL safe characters
     * @return array of original bytes 
     */
    public static final byte[] decodeUrlLoose(byte[] bytes) {
        if (bytes == null) {
            return null;
        }
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        for (int i = 0; i < bytes.length; i++) {
            int b = bytes[i];
            if (b == '+') {
                buffer.write(' ');
                continue;
            }
            if (b == '%') {
                if (i + 2 < bytes.length) {
                    int u = Character.digit((char) bytes[i + 1], 16);
                    int l = Character.digit((char) bytes[i + 2], 16);
                    if (u > -1 && l > -1) {
                        // good encoding
                        int c = ((u << 4) + l);
                        buffer.write((char) c);
                        i += 2;
                        continue;
                    } // else: bad encoding digits, leave '%' in place
                } // else: insufficient encoding digits, leave '%' in place
            }
            buffer.write(b);
        }
        return buffer.toByteArray();
    }

    /**
     * A more expansive set of ASCII URI characters to consider as 'safe' to
     * leave unencoded, based on actual browser behavior.
     */
    public static BitSet EXPANDED_URI_SAFE = new BitSet(256);
    static {
        // alpha characters
        for (int i = 'a'; i <= 'z'; i++) {
            EXPANDED_URI_SAFE.set(i);
        }
        for (int i = 'A'; i <= 'Z'; i++) {
            EXPANDED_URI_SAFE.set(i);
        }
        // numeric characters
        for (int i = '0'; i <= '9'; i++) {
            EXPANDED_URI_SAFE.set(i);
        }
        // special chars
        EXPANDED_URI_SAFE.set('-');
        EXPANDED_URI_SAFE.set('~');
        EXPANDED_URI_SAFE.set('_');
        EXPANDED_URI_SAFE.set('.');
        EXPANDED_URI_SAFE.set('*');
        EXPANDED_URI_SAFE.set('/');
        EXPANDED_URI_SAFE.set('=');
        EXPANDED_URI_SAFE.set('&');
        EXPANDED_URI_SAFE.set('+');
        EXPANDED_URI_SAFE.set(',');
        EXPANDED_URI_SAFE.set(':');
        EXPANDED_URI_SAFE.set(';');
        EXPANDED_URI_SAFE.set('@');
        EXPANDED_URI_SAFE.set('$');
        EXPANDED_URI_SAFE.set('!');
        EXPANDED_URI_SAFE.set(')');
        EXPANDED_URI_SAFE.set('(');
        // experiments indicate: Firefox (1.0.6) never escapes '%'
        EXPANDED_URI_SAFE.set('%');
        // experiments indicate: Firefox (1.0.6) does not escape '|' or '''
        EXPANDED_URI_SAFE.set('|');
        EXPANDED_URI_SAFE.set('\'');
    }

    public static BitSet QUERY_SAFE = new BitSet(256);
    static {
        QUERY_SAFE.or(EXPANDED_URI_SAFE);
        // Tests indicate Firefox (1.0.7-1) doesn't escape curlies in query str.
        QUERY_SAFE.set('{');
        QUERY_SAFE.set('}');
        // nor any of these: [ ] ^ ? 
        QUERY_SAFE.set('[');
        QUERY_SAFE.set(']');
        QUERY_SAFE.set('^');
        QUERY_SAFE.set('?');
    }

    /**
     * Encodes a string into its URL safe form using the specified
     * string charset. Unsafe characters are escaped.
     * 
     * This method is analogous to superclass encode() methods,
     * additionally offering the ability to specify a different
     * 'safe' character set (such as EXPANDED_URI_SAFE). 
     * 
     * @param safe BitSet of characters that don't need to be encoded
     * @param pString String to encode
     * @param cs Name of character set to use
     * @return Encoded version of <code>pString</code>.
     * @throws UnsupportedEncodingException
     */
    public String encode(BitSet safe, String pString, String cs) throws UnsupportedEncodingException {
        if (pString == null) {
            return null;
        }
        return new String(encodeUrl(safe, pString.getBytes(cs)), Charsets.US_ASCII);
    }
}