Source code

Java tutorial


Here is the source code for ml.shifu.shifu.core.Normalizer.


/*
 * Copyright [2012-2014] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.shifu.core;

import ml.shifu.shifu.container.obj.ColumnConfig;
import ml.shifu.shifu.container.obj.ModelNormalizeConf;
import ml.shifu.shifu.udf.NormalizeUDF.CategoryMissingNormType;
import ml.shifu.shifu.util.BinUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Utility normalization class which is used for any kind of transformation.
 */
public class Normalizer {

    /** Class-wide logger; never reassigned, hence final. */
    private static final Logger log = LoggerFactory.getLogger(Normalizer.class);

    /** Default standard deviation cutoff, used whenever a caller supplies no (or an invalid) cutoff. */
    public static final double STD_DEV_CUTOFF = 4.0d;

    /**
     * Normalize methods.
     */
    public enum NormalizeMethod {
        /** Dispatched to the z-score normalization. */
        ZScore,
        /** Dispatched to the max-min scaling. */
        MaxMin
    }

    /** Column configuration this normalizer instance is bound to. */
    private ColumnConfig config;

    /** Standard deviation cutoff passed on by {@code normalize(Object)}; starts at {@link #STD_DEV_CUTOFF}. */
    private Double stdDevCutOff = STD_DEV_CUTOFF;

    /** Normalization method passed on by {@code normalize(Object)}. */
    private NormalizeMethod method;

     * Create @Normalizer, according ColumnConfig
     * NormalizeMethod method will be NormalizeMethod.ZScore
     * stdDevCutOff will be STD_DEV_CUTOFF
     * @param config
     *            ColumnConfig to create normalizer
    public Normalizer(ColumnConfig config) {
        this(config, NormalizeMethod.ZScore, STD_DEV_CUTOFF);

     * Create @Normalizer, according ColumnConfig and NormalizeMethod
     * stdDevCutOff will be STD_DEV_CUTOFF
     * @param config
     *            ColumnConfig to create normalizer
     * @param method
     *            NormalizMethod to use
    public Normalizer(ColumnConfig config, NormalizeMethod method) {
        this(config, method, STD_DEV_CUTOFF);

     * Create @Normalizer, according ColumnConfig and NormalizeMethod
     * NormalizeMethod method will be NormalizeMethod.ZScore
     * @param config
     *            ColumnConfig to create normalizer
     * @param cutoff
     *            stand_dev_cutoff to use
    public Normalizer(ColumnConfig config, Double cutoff) {
        this(config, NormalizeMethod.ZScore, STD_DEV_CUTOFF);

     * Create @Normalizer, according ColumnConfig and NormalizeMethod
     * NormalizeMethod method will be NormalizeMethod.ZScore
     * @param config
     *            ColumnConfig to create normalizer
     * @param method
     *            NormalizMethod to use
     * @param cutoff
     *            stand_dev_cutoff to use
    public Normalizer(ColumnConfig config, NormalizeMethod method, Double cutoff) {
        this.config = config;
        this.method = method;
        this.stdDevCutOff = cutoff;

     * Normalize the input data for column
     * @param raw
     *            the raw value
     * @return normalized value
    public List<Double> normalize(Object raw) {
        return normalize(config, raw, method, stdDevCutOff);

     * Normalize the raw file, according the ColumnConfig info
     * @param config
     *            ColumnConfig to normalize data
     * @param raw
     *            raw input data
     * @return normalized value
    public static List<Double> normalize(ColumnConfig config, Object raw) {
        return normalize(config, raw, NormalizeMethod.ZScore);

     * Normalize the raw file, according the ColumnConfig info and normalized method
     * @param config
     *            ColumnConfig to normalize data
     * @param raw
     *            raw input data
     * @param method
     *            the method used to do normalization
     * @return normalized value
    public static List<Double> normalize(ColumnConfig config, Object raw, NormalizeMethod method) {
        return normalize(config, raw, method, STD_DEV_CUTOFF);

     * Normalize the raw file, according the ColumnConfig info and standard deviation cutoff
     * @param config
     *            ColumnConfig to normalize data
     * @param raw
     *            raw input data
     * @param stdDevCutoff
     *            the standard deviation cutoff to use
     * @return normalized value
    public static List<Double> normalize(ColumnConfig config, Object raw, double stdDevCutoff) {
        return normalize(config, raw, NormalizeMethod.ZScore, stdDevCutoff);

     * Normalize the raw file, according the ColumnConfig info, normalized method and standard deviation cutoff
     * @param config
     *            ColumnConfig to normalize data
     * @param raw
     *            raw input data
     * @param method
     *            the method used to do normalization
     * @param stdDevCutoff
     *            the standard deviation cutoff to use
     * @return normalized value
    public static List<Double> normalize(ColumnConfig config, Object raw, NormalizeMethod method,
            double stdDevCutoff) {
        if (method == null) {
            method = NormalizeMethod.ZScore;

        switch (method) {
        case ZScore:
            return zScoreNormalize(config, raw, stdDevCutoff);
        case MaxMin:
            return Arrays.asList(getMaxMinScore(config, raw));
            return Arrays.asList(new Double[] { 0.0 });

     * Compute the normalized data for @NormalizeMethod.MaxMin
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @return normalized value for MaxMin method
    private static Double[] getMaxMinScore(ColumnConfig config, Object raw) {
        if (config.isCategorical()) {
            // TODO, doesn't support
        } else {
            Double value = null;
            if (raw instanceof Double) {
                value = (Double) raw;
            } else if (raw instanceof Integer) {
                value = ((Integer) raw).doubleValue();
            } else {
                value = Double.parseDouble((String) raw);
            return new Double[] { (value - config.getColumnStats().getMin())
                    / (config.getColumnStats().getMax() - config.getColumnStats().getMin()) };
        return null;

     * Normalize the raw data, according the ColumnConfig information and normalization type.
     * Currently, the cutoff value doesn't affect the computation of WOE or WEIGHT_WOE type.
     * <p>
     * Noticed: currently OLD_ZSCALE and ZSCALE is implemented with the same process method.
     * </p>
     * @param config
     *            ColumnConfig to normalize data
     * @param raw
     *            raw input data
     * @param cutoff
     *            standard deviation cut off
     * @param type
     *            normalization type of ModelNormalizeConf.NormType
     * @param categoryMissingNormType
     *            missing categorical value norm type
     * @return normalized value. If normType parameter is invalid, then the ZSCALE will be used as default.
    public static List<Double> normalize(ColumnConfig config, Object raw, Double cutoff,
            ModelNormalizeConf.NormType type, CategoryMissingNormType categoryMissingNormType) {
        switch (type) {
        case ASIS_WOE:
            return asIsNormalize(config, raw, true);
        case ASIS_PR:
            return asIsNormalize(config, raw, false);
        case WOE:
            return woeNormalize(config, raw, false);
        case WEIGHT_WOE:
            return woeNormalize(config, raw, true);
        case HYBRID:
            return hybridNormalize(config, raw, cutoff, false);
        case WEIGHT_HYBRID:
            return hybridNormalize(config, raw, cutoff, true);
        case WOE_ZSCORE:
        case WOE_ZSCALE:
            return woeZScoreNormalize(config, raw, cutoff, false);
        case WEIGHT_WOE_ZSCORE:
        case WEIGHT_WOE_ZSCALE:
            return woeZScoreNormalize(config, raw, cutoff, true);
        case ONEHOT:
            return OneHotNormalize(config, raw);
        case ZSCALE_ONEHOT:
            return zscaleOneHotNormalize(config, raw, cutoff, categoryMissingNormType);
        case DISCRETE_ZSCORE:
        case DISCRETE_ZSCALE:
            return discreteZScoreNormalize(config, raw, cutoff, categoryMissingNormType);
        case OLD_ZSCALE:
        case OLD_ZSCORE:
            return zScoreNormalize(config, raw, cutoff, categoryMissingNormType, true);
        case ZSCALE:
        case ZSCORE:
            return zScoreNormalize(config, raw, cutoff, categoryMissingNormType, false);

     * Adding new API with cateIndeMap parameter without change normalize API.
     * @param config
     *              the ColumnConfig
     * @param raw
     *              the raw input
     * @param cutoff
     *              the cutoff value
     * @param type
     *              normalize type
     * @param categoryMissingNormType
     *              the category missing normal type
     * @param cateIndexMap
     *              the cateIndexMap map from category to index
     * @return normalized value
    public static List<Double> fullNormalize(ColumnConfig config, Object raw, Double cutoff,
            ModelNormalizeConf.NormType type, CategoryMissingNormType categoryMissingNormType,
            Map<String, Integer> cateIndexMap) {
        switch (type) {
        case ZSCORE_INDEX:
        case ZSCALE_INDEX:
            return numZScoreAndCateIndexNorm(config, raw, cutoff, cateIndexMap);
        case WOE_INDEX:
            if (config.isNumerical()) {
                return woeNormalize(config, raw, false);
            } else if (config.isCategorical()) {
                Integer index = cateIndexMap.get(raw == null ? "" : raw.toString());
                if (index == null || index == -1) {
                    // last index for null category
                    index = config.getBinCategory().size();
                return Arrays.asList((double) index);
        case WOE_ZSCALE_INDEX:
            if (config.isNumerical()) {
                return woeZScoreNormalize(config, raw, cutoff, false);
            } else if (config.isCategorical()) {
                Integer index = cateIndexMap.get(raw == null ? "" : raw.toString());
                if (index == null || index == -1) {
                    // last index for null category
                    index = config.getBinCategory().size();
                return Arrays.asList((double) index);
            // others use old normalize API to reuse code
            return normalize(config, raw, cutoff, type, categoryMissingNormType);

     * Compute the normalized data for @NormalizeMethod.Zscore
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param cutoff
     *            standard deviation cut off
     * @param categoryMissingNormType
     *            missing categorical value norm type
     * @return normalized value for ZScore method.
    private static List<Double> numZScoreAndCateIndexNorm(ColumnConfig config, Object raw, Double cutoff,
            Map<String, Integer> cateIndexMap) {
        if (config.isNumerical()) {
            double stdDevCutOff = checkCutOff(cutoff);
            double value = parseRawValue(config, raw, null);
            return Arrays.asList(computeZScore(value, config.getMean(), config.getStdDev(), stdDevCutOff));
        } else if (config.isCategorical()) {
            Integer index = cateIndexMap.get(raw == null ? "" : raw.toString());
            if (index == null || index == -1) {
                // last index for null category
                index = config.getBinCategory().size();
            return Arrays.asList(((double) index));
        } else {
            throw new IllegalArgumentException("Not supported norm column type.");

    private static List<Double> asIsNormalize(ColumnConfig config, Object raw, boolean toUseWoe) {
        if (config.isNumerical()) {
            Double values[] = new Double[1];
            if (raw instanceof Double) {
                values[0] = (Double) raw;
            } else if (raw instanceof Integer) {
                values[0] = ((Integer) raw).doubleValue();
            } else {
                try {
                    values[0] = Double.parseDouble(raw.toString());
                } catch (Exception e) {
                    log.warn("Illegal numerical value - {}, use mean instead.", raw);
                    values[0] = config.getMean();

            return Arrays.asList(values);
        } else {
            // categorical variables
            List<Double> normVals = (toUseWoe ? config.getBinCountWoe() : config.getBinPosRate());
            int binIndex = BinUtils.getBinNum(config, raw);
            return ((binIndex == -1) ? Arrays.asList(new Double[] { normVals.get(normVals.size() - 1) })
                    : Arrays.asList(new Double[] { normVals.get(binIndex) }));

    private static List<Double> OneHotNormalize(ColumnConfig config, Object raw) {
        Double[] normData = (config.isNumerical() ? new Double[config.getBinBoundary().size() + 1]
                : new Double[config.getBinCategory().size() + 1]);
        Arrays.fill(normData, 0.0d);
        int binNum = BinUtils.getBinNum(config, raw);
        if (binNum < 0) {
            binNum = normData.length - 1;
        normData[binNum] = 1.0d;
        return Arrays.asList(normData);

    private static List<Double> zscaleOneHotNormalize(ColumnConfig config, Object raw, Double cutoff,
            CategoryMissingNormType categoryMissingNormType) {
        if (config.isNumerical()) {
            return zScoreNormalize(config, raw, cutoff, categoryMissingNormType, false);
        } else {
            Double[] normData = new Double[config.getBinCategory().size() + 1];
            Arrays.fill(normData, 0.0d);

            int binNum = BinUtils.getBinNum(config, raw);
            if (binNum < 0) {
                binNum = config.getBinCategory().size();
            normData[binNum] = 1.0d;
            return Arrays.asList(normData);

     * Normalize the raw data, according the ColumnConfig information and normalization type.
     * Currently, the cutoff value doesn't affect the computation of WOE or WEIGHT_WOE type.
     * <p>
     * Noticed: currently OLD_ZSCALE and ZSCALE is implemented with the same process method.
     * </p>
     * @param config
     *            ColumnConfig to normalize data
     * @param raw
     *            raw input data
     * @param cutoff
     *            standard deviation cut off
     * @param type
     *            normalization type of ModelNormalizeConf.NormType
     * @return normalized value. If normType parameter is invalid, then the ZSCALE will be used as default.
    public static List<Double> normalize(ColumnConfig config, Object raw, Double cutoff,
            ModelNormalizeConf.NormType type) {
        return normalize(config, raw, cutoff, type, CategoryMissingNormType.POSRATE);

     * Compute the normalized data for @NormalizeMethod.Zscore
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param cutoff
     *            standard deviation cut off
     * @param categoryMissingNormType
     *            missing categorical value norm type
     * @return normalized value for ZScore method.
    private static List<Double> zScoreNormalize(ColumnConfig config, Object raw, Double cutoff,
            CategoryMissingNormType categoryMissingNormType, boolean isOld) {
        double stdDevCutOff = checkCutOff(cutoff);
        double value = parseRawValue(config, raw, categoryMissingNormType);
        if (isOld && config.isCategorical()) {
            return Arrays.asList(value);
        return Arrays.asList(computeZScore(value, config.getMean(), config.getStdDev(), stdDevCutOff));

     * Compute the zscore value after do discreting in each bin for numerical value, for categorical feature, use
     * positive rate.
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param cutoff
     *            standard deviation cut off
     * @param categoryMissingNormType
     *            missing categorical value norm type
     * @return normalized value for ZScore method.
    private static List<Double> discreteZScoreNormalize(ColumnConfig config, Object raw, Double cutoff,
            CategoryMissingNormType categoryMissingNormType) {
        double stdDevCutOff = checkCutOff(cutoff);
        double value = 0;
        if (config.isCategorical()) {
            value = parseRawValue(config, raw, categoryMissingNormType);
        } else {
            int binIndex = BinUtils.getBinNum(config, raw);
            if (binIndex < 0 || binIndex >= config.getBinBoundary().size()) {
                // missing value, use mean value, after zscore, it is 0
                value = config.getMean();
            } else {
                List<Double> binBoundaries = config.getBinBoundary();
                if (binIndex == 0) {
                    // the first bin, use min value
                    value = config.getColumnStats().getMin();
                } else {
                    value = binBoundaries.get(binIndex);
        return Arrays.asList(computeZScore(value, config.getMean(), config.getStdDev(), stdDevCutOff));

     * Compute the normalized data for @NormalizeMethod.Zscore
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param cutoff
     *            standard deviation cut off
     * @return normalized value for ZScore method.
    private static List<Double> zScoreNormalize(ColumnConfig config, Object raw, Double cutoff) {
        double stdDevCutOff = checkCutOff(cutoff);
        double value = parseRawValue(config, raw, CategoryMissingNormType.POSRATE);
        return Arrays.asList(computeZScore(value, config.getMean(), config.getStdDev(), stdDevCutOff));

     * Parse raw value based on ColumnConfig.
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param categoryMissingNormType
     *            missing categorical value norm type
     * @return parsed raw value. For categorical type, return BinPosRate. For numerical type, return
     *         corresponding double value. For missing data, return default value using
     *         {@link Normalizer#defaultMissingValue}.
    private static double parseRawValue(ColumnConfig config, Object raw,
            CategoryMissingNormType categoryMissingNormType) {
        if (categoryMissingNormType == null) {
            categoryMissingNormType = CategoryMissingNormType.POSRATE;
        double value = 0.0;
        if (raw == null || StringUtils.isBlank(raw.toString())) {
            log.debug("Not decimal format but null, using default!");
            if (config.isCategorical()) {
                value = fillDefaultValue(config, categoryMissingNormType);
            } else {
                value = defaultMissingValue(config);
            return value;

        if (config.isCategorical()) {
            // for categorical variable, no need convert to double but double should be in treated as String in
            // categorical variables
            int index = BinUtils.getBinNum(config, raw);
            if (index == -1) {
                value = fillDefaultValue(config, categoryMissingNormType);
            } else {
                Double binPosRate = config.getBinPosRate().get(index);
                if (binPosRate != null) {
                    value = binPosRate.doubleValue();
                } else {
                    value = fillDefaultValue(config, categoryMissingNormType);
        } else {
            // for numerical value, if double or int, no need parse again.
            if (raw instanceof Double) {
                value = (Double) raw;
            } else if (raw instanceof Integer) {
                value = ((Integer) raw).doubleValue();
            } else if (raw instanceof Float) {
                value = ((Float) raw).doubleValue();
            } else {
                try {
                    // if raw is NaN, it won't throw Exception. The value will be Double.NaN
                    value = Double.parseDouble(raw.toString());
                } catch (Exception e) {
                    log.debug("Not decimal format " + raw + ", using default!");
                    value = defaultMissingValue(config);
            if (Double.isInfinite(value) || Double.isNaN(value)) {
                // if the value is Infinite or NaN, treat it as missing value
                // should treat Infinite as missing value?
                value = defaultMissingValue(config);

        return value;

    private static double fillDefaultValue(ColumnConfig config, CategoryMissingNormType categoryMissingNormType) {
        double value = 0.0;
        switch (categoryMissingNormType) {
        case POSRATE:
            // last one is missing bin, if it is missing, using pos rate for default value.
            value = config.getBinPosRate().get(config.getBinPosRate().size() - 1);
        case MEAN:
            value = defaultMissingValue(config);

        return value;

     * Get the default value for missing data.
     * @param config
     *            ColumnConfig info
     * @return default value for missing data. Now simply return Mean value. If mean is null then return 0.
    public static double defaultMissingValue(ColumnConfig config) {
        // TODO return 0 for mean == null is correct or reasonable?
        return config.getMean() == null ? 0 : config.getMean().doubleValue();

     * Compute the normalized data for Woe Score.
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param isWeightedNorm
     *            if use weighted woe
     * @return normalized value for Woe method. For missing value, we return the value in last bin. Since the last
     *         bin refers to the missing value bin.
    private static List<Double> woeNormalize(ColumnConfig config, Object raw, boolean isWeightedNorm) {
        List<Double> woeBins = isWeightedNorm ? config.getBinWeightedWoe() : config.getBinCountWoe();
        int binIndex = 0;
        if (config.isHybrid()) {
            if (raw == null) {
                binIndex = -1;
            } else {
                binIndex = BinUtils.getCategoicalBinIndex(config, raw.toString());

            if (binIndex != -1) {
                binIndex = binIndex + config.getBinBoundary().size(); // append the first numerical bins
            } else {
                double douVal = BinUtils.parseNumber(raw);
                if (Double.isNaN(douVal)) {
                    binIndex = config.getBinBoundary().size() + config.getBinCategory().size();
                } else {
                    binIndex = BinUtils.getBinIndex(config.getBinBoundary(), douVal);
        } else {
            binIndex = BinUtils.getBinNum(config, raw);
        if (binIndex == -1) {
            // The last bin in woeBins is the miss value bin.
            return Arrays.asList(new Double[] { woeBins.get(woeBins.size() - 1) });
        } else {
            return Arrays.asList(new Double[] { woeBins.get(binIndex) });

     * Compute the normalized value for woe zscore normalize.Take woe as variable value and using zscore normalizing
     * to compute zscore of woe.
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param cutoff
     *            standard deviation cut off
     * @param isWeightedNorm
     *            if use weighted woe
     * @return normalized value for woe zscore method.
    private static List<Double> woeZScoreNormalize(ColumnConfig config, Object raw, Double cutoff,
            boolean isWeightedNorm) {
        double stdDevCutOff = checkCutOff(cutoff);
        double woe = woeNormalize(config, raw, isWeightedNorm).get(0);
        // TODO cache such computing to avoid computing each time
        double[] meanAndStdDev = calculateWoeMeanAndStdDev(config, isWeightedNorm);
        return Arrays.asList(computeZScore(woe, meanAndStdDev[0], meanAndStdDev[1], stdDevCutOff));

     * Compute the normalized data for hybrid normalize. Use zscore noramlize for numerical data. Use woe normalize
     * for categorical data while use weight woe normalize when isWeightedNorm is true.
     * @param config
     *            ColumnConfig info
     * @param raw
     *            input column value
     * @param cutoff
     *            standard deviation cut off
     * @param isWeightedNorm
     *            if use weighted woe
     * @return normalized value for hybrid method.
    private static List<Double> hybridNormalize(ColumnConfig config, Object raw, Double cutoff,
            boolean isWeightedNorm) {
        List<Double> normValue;
        if (config.isNumerical()) {
            // For numerical data, use zscore.
            normValue = zScoreNormalize(config, raw, cutoff);
        } else {
            // For categorical data, use woe.
            normValue = woeNormalize(config, raw, isWeightedNorm);

        return normValue;

     * Check specified standard deviation cutoff and return the correct value.
     * @param cutoff
     *            specified standard deviation cutoff
     * @return If cutoff is valid then return it, else return {@link Normalizer#STD_DEV_CUTOFF}
    public static double checkCutOff(Double cutoff) {
        double stdDevCutOff;
        if (cutoff != null && !cutoff.isInfinite() && !cutoff.isNaN()) {
            stdDevCutOff = cutoff;
        } else {
            stdDevCutOff = STD_DEV_CUTOFF;

        return stdDevCutOff;

     * Calculate woe mean and woe standard deviation.
     * @param config
     *            ColumnConfig info
     * @param isWeightedNorm
     *            if use weighted woe
     * @return an double array contains woe mean and woe standard deviation as order {mean, stdDev}
    public static double[] calculateWoeMeanAndStdDev(ColumnConfig config, boolean isWeightedNorm) {
        List<Double> woeList = isWeightedNorm ? config.getBinWeightedWoe() : config.getBinCountWoe();
        if (woeList == null || woeList.size() < 2) {
            throw new IllegalArgumentException("Woe list is null or too short(size < 2)");

        List<Integer> negCountList = config.getBinCountNeg();
        List<Integer> posCountList = config.getBinCountPos();

        // calculate woe mean and standard deviation
        int size = woeList.size();
        double sum = 0.0;
        double squaredSum = 0.0;
        long totalCount = 0;
        for (int i = 0; i < size; i++) {
            int count = negCountList.get(i) + posCountList.get(i);
            totalCount += count;
            double x = woeList.get(i);
            sum += x * count;
            squaredSum += x * x * count;

        double woeMean = sum / totalCount;
        double woeStdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / totalCount) / (totalCount - 1)));

        return new double[] { woeMean, woeStdDev };

     * Compute the zscore, by original value, mean, standard deviation and standard deviation cutoff
     * @param var
     *            original value
     * @param mean
     *            mean value
     * @param stdDev
     *            standard deviation
     * @param stdDevCutOff
     *            standard deviation cutoff
     * @return zscore
    public static Double[] computeZScore(double var, double mean, double stdDev, double stdDevCutOff) {
        double maxCutOff = mean + stdDevCutOff * stdDev;
        if (var > maxCutOff) {
            var = maxCutOff;

        double minCutOff = mean - stdDevCutOff * stdDev;
        if (var < minCutOff) {
            var = minCutOff;

        if (stdDev > 0.00001) {
            return new Double[] { (var - mean) / stdDev };
        } else {
            return new Double[] { 0.0 };
