com.linkedin.urls.HostNormalizer.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.urls.HostNormalizer.java

Source

/**
 * Copyright 2015 LinkedIn Corp. All rights reserved.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License athttp://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *
 */
package com.linkedin.urls;

import com.linkedin.urls.detection.CharUtils;
import java.net.IDN;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.StringUtils;

/**
 * Normalizes the host by converting hex characters to the actual textual representation, changes ip addresses
 * to a formal format. Then re-encodes the final host name.
 */
public class HostNormalizer {
    private static final long MAX_NUMERIC_DOMAIN_VALUE = 4294967295L;
    private static final int MAX_IPV4_PART = 255;
    private static final int MIN_IP_PART = 0;
    private static final int MAX_IPV6_PART = 0xFFFF;
    private static final int IPV4_MAPPED_IPV6_START_OFFSET = 12;
    private static final int NUMBER_BYTES_IN_IPV4 = 4;

    private byte[] _bytes;
    private String _host;
    private String _normalizedHost;

    public HostNormalizer(String host) {
        _host = host;
        _bytes = null;

        normalizeHost();
    }

    private void normalizeHost() {
        if (StringUtils.isEmpty(_host)) {
            return;
        }

        String host;
        try {
            //replace high unicode characters
            host = IDN.toASCII(_host);
        } catch (IllegalArgumentException ex) {
            //occurs when the url is invalid. Just return
            return;
        }

        host = host.toLowerCase();
        host = UrlUtil.decode(host);

        _bytes = tryDecodeHostToIp(host);

        if (_bytes != null) {
            InetAddress address;
            try {
                address = InetAddress.getByAddress(_bytes);
                String ipAddress = address.getHostAddress();
                if (address instanceof Inet6Address) {
                    host = "[" + ipAddress + "]";
                } else {
                    host = ipAddress;
                }
            } catch (UnknownHostException e) {
                return;
            }
        }

        if (StringUtils.isEmpty(host)) {
            return;
        }

        host = UrlUtil.removeExtraDots(host);

        _normalizedHost = UrlUtil.encode(host).replace("\\x", "%");
    }

    /**
     * Checks if the host is an ip address. Returns the byte representation of it
     */
    private byte[] tryDecodeHostToIp(String host) {
        if (host.startsWith("[") && host.endsWith("]")) {
            return tryDecodeHostToIPv6(host);
        }
        return tryDecodeHostToIPv4(host);
    }

    /**
     * This covers cases like:
     * Hexadecimal:        0x1283983
     * Decimal:            12839273
     * Octal:              037362273110
     * Dotted Decimal:     192.168.1.1
     * Dotted Hexadecimal: 0xfe.0x83.0x18.0x1
     * Dotted Octal:       0301.00.046.00
     * Dotted Mixed:       0x38.168.077.1
     *
     * if ipv4 was found, _bytes is set to the byte representation of the ipv4 address
     */
    private byte[] tryDecodeHostToIPv4(String host) {
        String[] parts = CharUtils.splitByDot(host);
        int numParts = parts.length;

        if (numParts != 4 && numParts != 1) {
            return null;
        }

        byte[] bytes = new byte[16];

        //An ipv4 mapped ipv6 bytes will have the 11th and 12th byte as 0xff
        bytes[10] = (byte) 0xff;
        bytes[11] = (byte) 0xff;
        for (int i = 0; i < parts.length; i++) {
            String parsedNum;
            int base;
            if (parts[i].startsWith("0x")) { //hex
                parsedNum = parts[i].substring(2);
                base = 16;
            } else if (parts[i].startsWith("0")) { //octal
                parsedNum = parts[i].substring(1);
                base = 8;
            } else { //decimal
                parsedNum = parts[i];
                base = 10;
            }

            Long section;
            try {
                section = parsedNum.isEmpty() ? 0 : Long.parseLong(parsedNum, base);
            } catch (NumberFormatException e) {
                return null;
            }

            if (numParts == 4 && section > MAX_IPV4_PART || //This would look like 288.1.2.4
                    numParts == 1 && section > MAX_NUMERIC_DOMAIN_VALUE || //This would look like 4294967299
                    section < MIN_IP_PART) {
                return null;
            }
            //bytes 13->16 is where the ipv4 address of an ipv4-mapped-ipv6-address is stored.
            if (numParts == 4) {
                bytes[IPV4_MAPPED_IPV6_START_OFFSET + i] = section.byteValue();
            } else { //numParts == 1
                int index = IPV4_MAPPED_IPV6_START_OFFSET;
                bytes[index++] = (byte) ((section >> 24) & 0xFF);
                bytes[index++] = (byte) ((section >> 16) & 0xFF);
                bytes[index++] = (byte) ((section >> 8) & 0xFF);
                bytes[index] = (byte) (section & 0xFF);
                return bytes;
            }
        }

        return bytes;
    }

    /**
     * Recommendation for IPv6 Address Text Representation
     * http://tools.ietf.org/html/rfc5952
     *
     * if ipv6 was found, _bytes is set to the byte representation of the ipv6 address
     */
    private byte[] tryDecodeHostToIPv6(String host) {
        String ip = host.substring(1, host.length() - 1);
        List<String> parts = new ArrayList<String>(Arrays.asList(ip.split(":", -1)));
        if (parts.size() < 3) {
            return null;
        }

        //Check for embedded ipv4 address
        String lastPart = parts.get(parts.size() - 1);
        int zoneIndexStart = lastPart.lastIndexOf("%");
        String lastPartWithoutZoneIndex = zoneIndexStart == -1 ? lastPart : lastPart.substring(0, zoneIndexStart);
        byte[] ipv4Address = null;
        if (!isHexSection(lastPartWithoutZoneIndex)) {
            ipv4Address = tryDecodeHostToIPv4(lastPartWithoutZoneIndex);
        }

        byte[] bytes = new byte[16];
        //How many parts do we need to fill by the end of this for loop?
        int totalSize = ipv4Address == null ? 8 : 6;
        //How many zeroes did we fill in the case of double colons? Ex: [::1] will have numberOfFilledZeroes = 7
        int numberOfFilledZeroes = 0;
        //How many sections do we have to parse through? Ex: [fe80:ff::192.168.1.1] size = 3, another ex: [a:a::] size = 4
        int size = ipv4Address == null ? parts.size() : parts.size() - 1;
        for (int i = 0; i < size; i++) {
            int lenPart = parts.get(i).length();
            if (lenPart == 0 && i != 0 && i != parts.size() - 1) {
                numberOfFilledZeroes = totalSize - size;
                for (int k = i; k < numberOfFilledZeroes + i; k++) {
                    System.arraycopy(sectionToTwoBytes(0), 0, bytes, k * 2, 2);
                }
            }
            Integer section;
            try {
                section = lenPart == 0 ? 0 : Integer.parseInt(parts.get(i), 16);
            } catch (NumberFormatException e) {
                return null;
            }
            if (section > MAX_IPV6_PART || section < MIN_IP_PART) {
                return null;
            }
            System.arraycopy(sectionToTwoBytes(section), 0, bytes, (numberOfFilledZeroes + i) * 2, 2);
        }

        if (ipv4Address != null) {
            System.arraycopy(ipv4Address, IPV4_MAPPED_IPV6_START_OFFSET, bytes, IPV4_MAPPED_IPV6_START_OFFSET,
                    NUMBER_BYTES_IN_IPV4);
        }
        return bytes;
    }

    private static boolean isHexSection(String section) {
        for (int i = 0; i < section.length(); i++) {
            if (!CharUtils.isHex(section.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    private static byte[] sectionToTwoBytes(int section) {
        byte[] bytes = new byte[2];
        bytes[0] = (byte) ((section >> 8) & 0xff);
        bytes[1] = (byte) (section & 0xff);
        return bytes;
    }

    protected byte[] getBytes() {
        return _bytes;
    }

    protected String getNormalizedHost() {
        return _normalizedHost;
    }
}