org.asoem.greyfish.utils.math.statistics.DefaultShapiroWilkTest.java Source code

Java tutorial

Introduction

Here is the source code for org.asoem.greyfish.utils.math.statistics.DefaultShapiroWilkTest.java

Source

/*
 * Copyright (C) 2015 The greyfish authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.asoem.greyfish.utils.math.statistics;

import org.apache.commons.math3.analysis.polynomials.PolynomialFunction;
import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.util.FastMath;

import java.util.Arrays;

import static org.apache.commons.math3.util.FastMath.*;

final class DefaultShapiroWilkTest implements ShapiroWilkTest {

    private static final double SMALL = 1e-19;

    private static final PolynomialFunction POLYNOMIAL_FUNCTION_GAMMA = new PolynomialFunction(
            new double[] { -2.273, .459 });
    private static final PolynomialFunction POLYNOMIAL_FUNCTION_1 = new PolynomialFunction(
            new double[] { 0., .221157, -.147981, -2.07119, 4.434685, -2.706056 });
    private static final PolynomialFunction POLYNOMIAL_FUNCTION_2 = new PolynomialFunction(
            new double[] { 0., .042981, -.293762, -1.752461, 5.682633, -3.582633 });
    private static final PolynomialFunction POLYNOMIAL_FUNCTION_3 = new PolynomialFunction(
            new double[] { .544, -.39978, .025054, -6.714e-4 });
    private static final PolynomialFunction POLYNOMIAL_FUNCTION_4 = new PolynomialFunction(
            new double[] { 1.3822, -.77857, .062767, -.0020322 });
    private static final PolynomialFunction POLYNOMIAL_FUNCTION_5 = new PolynomialFunction(
            new double[] { -1.5861, -.31082, -.083751, .0038915 });
    private static final PolynomialFunction POLYNOMIAL_FUNCTION_6 = new PolynomialFunction(
            new double[] { -.4803, -.082676, .0030302 });
    private static final int MAX_SAMPLE_SIZE = 5000;
    private static final int MIN_SAMPLE_SIZE = 3;
    private static final double SQRT_1DIV2 = 0.70710678;
    private static final NormalDistribution NORMAL_DISTRIBUTION_0_1 = new NormalDistribution(0.0, 1.0);
    private static final int EXACT_P_SAMPLE_SIZE = 3;
    private static final int LOWER_APPROXIMATION_MAX_SAMPLE_SIZE = 11;
    private static final double CLOSE_TO_0 = 1e-99;

    private final double pw;
    private double w;
    private static final double PI_6 = 1.90985931710274;
    private static final double STQR = 1.04719755119660;

    DefaultShapiroWilkTest(final double[] samples) {
        Arrays.sort(samples);
        int n = samples.length;

        int nn2 = (int) Math.floor(n / 2.0);
        double[] a = new double[nn2 + 1]; /* 1-based */

        /* Local variables */
        int i, j, i1;

        double ssassx, summ2, ssumm2, range;
        double a1, a2, sa, xi, sx, xx, w1;
        double fac, asa, an25, ssa, sax, rsn, ssx, xsx;

        if (n < MIN_SAMPLE_SIZE) {
            throw new IllegalArgumentException();
        }

        final double an = (double) n;

        if (n == 3) {
            a[1] = SQRT_1DIV2; /* = sqrt(1/2) */
        } else {
            an25 = an + 0.25;
            summ2 = 0.0;
            for (i = 1; i <= nn2; i++) {
                a[i] = NORMAL_DISTRIBUTION_0_1.inverseCumulativeProbability((i - 0.375) / an25);
                final double r = a[i];
                summ2 += r * r;
            }
            summ2 *= 2.0;
            ssumm2 = sqrt(summ2);
            rsn = 1.0 / sqrt(an);
            a1 = POLYNOMIAL_FUNCTION_1.value(rsn) - a[1] / ssumm2;

            /* Normalize a[] */
            if (n > 5) {
                i1 = 3;
                a2 = -a[2] / ssumm2 + POLYNOMIAL_FUNCTION_2.value(rsn);
                fac = sqrt((summ2 - 2. * (a[1] * a[1]) - 2.0 * (a[2] * a[2]))
                        / (1.0 - 2.0 * (a1 * a1) - 2. * (a2 * a2)));
                a[2] = a2;
            } else {
                i1 = 2;
                fac = sqrt((summ2 - 2. * (a[1] * a[1])) / (1.0 - 2.0 * (a1 * a1)));
            }
            a[1] = a1;
            for (i = i1; i <= nn2; i++) {
                a[i] /= -fac;
            }
        }

        /* Check for zero range */
        range = samples[n - 1] - samples[0];
        if (range < SMALL) {
            throw new IllegalArgumentException("Range to small");
        }

        /* Check for correct sort order on range - scaled X */

        /* *ifault = 7; <-- a no-op, since it is changed below, in ANY CASE! */
        xx = samples[0] / range;
        sx = xx;
        sa = -a[1];
        for (i = 1, j = n - 1; i < n; j--) {
            xi = samples[i] / range;
            if (xx - xi > SMALL) {
                /* Fortran had: print *, "ANYTHING"
                * but do NOT; it *does* happen with sorted x (on Intel GNU/linux 32bit):
                * shapiro.test(c(-1.7, -1,-1,-.73,-.61,-.5,-.24,.45,.62,.81,1))
                */
                throw new IllegalArgumentException();
            }
            sx += xi;
            i++;
            if (i != j) {
                sa += FastMath.signum(i - j) * a[min(i, j)];
            }
            xx = xi;
        }
        if (n > MAX_SAMPLE_SIZE) {
            throw new IllegalArgumentException();
        }

        /* Calculate W statistic as squared correlation
        between data and coefficients */

        sa /= n;
        sx /= n;
        ssa = 0.0;
        ssx = 0.0;
        sax = 0.0;
        for (i = 0, j = n - 1; i < n; i++, j--) {
            if (i != j) {
                asa = FastMath.signum(i - j) * a[1 + min(i, j)] - sa;
            } else {
                asa = -sa;
            }
            xsx = samples[i] / range - sx;
            ssa += asa * asa;
            ssx += xsx * xsx;
            sax += asa * xsx;
        }

        /* W1 equals (1-W) calculated to avoid excessive rounding error
        for W very near 1 (a potential problem in very large samples) */

        ssassx = sqrt(ssa * ssx);
        w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);
        w = 1.0 - w1;

        pw = significance(w, samples.length);
    }

    /**
     * Calculate significance level for W
     */
    private static double significance(final double w, final int n) {

        if (n == EXACT_P_SAMPLE_SIZE) { /* exact P value */
            double pw = PI_6 * (asin(sqrt(w)) - STQR);
            if (pw < 0.0) {
                pw = 0.0;
            }
            return pw;
        }

        double y = log(1 - w);
        final double an = (double) n;
        final double mean;
        final double sd;

        if (n <= LOWER_APPROXIMATION_MAX_SAMPLE_SIZE) {
            final double gamma = POLYNOMIAL_FUNCTION_GAMMA.value(an);
            if (y >= gamma) {
                return CLOSE_TO_0; /* an "obvious" value, was 'SMALL' which was 1e-19f */
            }
            y = -log(gamma - y);
            mean = POLYNOMIAL_FUNCTION_3.value(an);
            sd = exp(POLYNOMIAL_FUNCTION_4.value(an));
        } else { /* n >= 12 */
            final double x = log(an);
            mean = POLYNOMIAL_FUNCTION_5.value(x);
            sd = exp(POLYNOMIAL_FUNCTION_6.value(x));
        }

        return new NormalDistribution(mean, sd).probability(y, Double.POSITIVE_INFINITY);
    }

    @Override
    public double statistics() {
        return w;
    }

    @Override
    public double p() {
        return pw;
    }
}