Example usage for org.apache.commons.math3.distribution FDistribution cumulativeProbability

List of usage examples for org.apache.commons.math3.distribution FDistribution cumulativeProbability

Introduction

In this page you can find the example usage for org.apache.commons.math3.distribution FDistribution cumulativeProbability.

Prototype

public double cumulativeProbability(double x) 

Source Link

Document

The implementation of this method is based on <a href="http://mathworld.wolfram.com/F-Distribution.html">F-Distribution</a>, equation (4).

Usage

From source file:dase.timeseries.analysis.GrangerTest.java

/**
 * Returns p-value for Granger causality test.
 *
 * @param y/*from  w ww.  ja  va 2 s .co m*/
 *            - predictable variable
 * @param x
 *            - predictor
 * @param L
 *            - lag, should be 1 or greater.
 * @return p-value of Granger causality
 */
public static double granger(double[] y, double[] x, int L) {
    OLSMultipleLinearRegression h0 = new OLSMultipleLinearRegression();
    OLSMultipleLinearRegression h1 = new OLSMultipleLinearRegression();

    double[][] laggedY = createLaggedSide(L, y);

    double[][] laggedXY = createLaggedSide(L, x, y);

    int n = laggedY.length;

    h0.newSampleData(strip(L, y), laggedY);
    h1.newSampleData(strip(L, y), laggedXY);

    double rs0[] = h0.estimateResiduals();
    double rs1[] = h1.estimateResiduals();

    double RSS0 = sqrSum(rs0);
    double RSS1 = sqrSum(rs1);

    double ftest = ((RSS0 - RSS1) / L) / (RSS1 / (n - 2 * L - 1));

    System.out.println(RSS0 + " " + RSS1);
    System.out.println("F-test " + ftest);

    FDistribution fDist = new FDistribution(L, n - 2 * L - 1);

    double pValue = 1.0 - fDist.cumulativeProbability(ftest);
    System.out.println("P-value " + pValue);
    return pValue;
}

From source file:com.rapidminer.operator.validation.significance.TTestSignificanceTestOperator.java

/**
 * Computes the probability that the observed difference between the two
 * performance criteria arises by chance, using the fact that the squared
 * t-statistic follows an F distribution with (1, n1 + n2 - 2) degrees of
 * freedom under the null hypothesis.
 */
private double getProbability(PerformanceCriterion pc1, PerformanceCriterion pc2) {
    int degreesOfFreedom = pc1.getAverageCount() + pc2.getAverageCount() - 2;

    // Pooled (within-group) variance of the two samples.
    double pooledVariance = ((pc1.getAverageCount() - 1) * pc1.getVariance()
            + (pc2.getAverageCount() - 1) * pc2.getVariance()) / degreesOfFreedom;

    // Harmonic combination of the two sample sizes.
    double sizeFactor = 1.0d / (1.0d / pc1.getAverageCount() + 1.0d / pc2.getAverageCount());

    double meanDifference = pc1.getAverage() - pc2.getAverage();

    // Squared t-statistic.
    double tSquared = sizeFactor * meanDifference * meanDifference / pooledVariance;

    // make sure the F-distribution is well defined
    if (degreesOfFreedom > 0) {
        return 1 - new FDistribution(1, degreesOfFreedom).cumulativeProbability(tSquared);
    }
    // in this case the probability cannot be calculated correctly and a 1 is
    // returned, as this result is not significant
    return 1;
}

From source file:gedi.util.math.stat.testing.OneWayAnova.java

/**
 * Computes the one-way ANOVA p-value as the upper tail of the central
 * F distribution with (factor, error) degrees of freedom.
 *
 * @return the p-value, or {@code Double.NaN} when either degrees of
 *         freedom is below 1 (the F distribution is undefined there)
 */
public double getPvalue() {
    // Guard BEFORE constructing the distribution: the FDistribution
    // constructor throws NotStrictlyPositiveException for non-positive
    // degrees of freedom, which previously pre-empted the intended NaN
    // result because the guard came second.
    if (getFactorDof() < 1 || getErrorDof() < 1)
        return Double.NaN;
    FDistribution fdist = new FDistribution(getFactorDof(), getErrorDof());
    return 1 - fdist.cumulativeProbability(getFStatistic());
}

From source file:edu.cudenver.bios.power.glmm.GLMMTest.java

/**
 * Fits the model and derives the p-value of the observed F statistic
 * under the data-analysis null distribution.
 *
 * @return model fit object
 */
public ModelFit getModelFit() {
    double observedF = getObservedF(GLMMTest.DistributionType.DATA_ANALYSIS_NULL);

    // Both degrees of freedom must be well-defined before building the
    // central F distribution used for the p-value.
    double numeratorDF = getNumeratorDF(GLMMTest.DistributionType.DATA_ANALYSIS_NULL);
    if (Double.isNaN(numeratorDF)) {
        throw new IllegalArgumentException("numerator DF is NaN");
    }
    double denominatorDF = getDenominatorDF(GLMMTest.DistributionType.DATA_ANALYSIS_NULL);
    if (Double.isNaN(denominatorDF)) {
        throw new IllegalArgumentException("denominator DF is NaN");
    }

    // p-value is the upper tail of the central F distribution.
    double pValue = 1 - new FDistribution(numeratorDF, denominatorDF).cumulativeProbability(observedF);

    return new ModelFit(pValue, observedF, numeratorDF, denominatorDF, sigmaError, beta);
}

From source file:cs.man.ac.uk.stats.ComputeANOVAStats.java

/**
 * Performs an ANOVA analysis on the data read in.
 * /*from w  ww  . ja  v a 2s  . co  m*/
 * @param outputPath the path to output details of the ANOVA analysis to.
 */
private static void ANOVAAnalysis(String outputPath) {
    /**
     * OUTPUT FILE PREPARATION
     */

    // Clear up output path.
    Common.fileDelete(outputPath);

    String tukeyOutputPath = outputPath.replace(".csv", "_HSD.csv");
    Common.fileDelete(tukeyOutputPath);
    Writer.append(tukeyOutputPath,
            "Result 1,Result 2,Test,Metric,MSwg,DFwg,n,alpha,HSD,H0 (1=rejected & 0=accepted),Outcome\n");

    // Write header information to output path
    String[] headers = header.split(",");
    Writer.append(outputPath, headers[0] + ",");

    for (int i = 1; i < headers.length; i++)
        Writer.append(outputPath,
                headers[i] + ",F-ratio,P-value,Fb,Fw,H0 (1=rejected & 0=accepted), alpha=" + alpha + ",");

    Writer.append(outputPath, "\n");

    /**
     * PERFROM ANOVA
     */

    for (Map.Entry<String, Vector<ANOVA>> entry : anovaObjects.entrySet()) {
        String key = entry.getKey();
        Vector<ANOVA> vector = entry.getValue();

        /**
         * OK, its crucial to understand what is going on here. We have a number of files
         * containing results of algorithm tests. Each File will contain the results of
         * a number of different tests using different parameters. Note that each files contains
         * results for a specific algorithm only.
         * 
         * Now if we want to perform ANOVA analysis on results from multiple algorithms, we
         * must analyze the results in these files together rather than in isolation. So we have
         * the following situation: n files, containing results of m tests, occurring in the same order in each
         * file. These are directly comparable results, for instance:
         * 
         *    FILE 1               FILE 2               FILE N
         *  Test 1               Test 1               Test 1   -|
         *  Test 1               Test 1               Test 1   |
         *  Test 1               Test 1               Test 1   |---> Test Block 1 (i.e. multiple runs of same test)
         *  Test 1               Test 1               Test 1   |
         *  Test 1               Test 1               Test 1   -|
         *  
         *  Test 2               Test 2               Test 2
         *  Test 2               Test 2               Test 2
         *  Test 2               Test 2               Test 2
         *  Test 2               Test 2               Test 2
         *  Test 2               Test 2               Test 2
         *  
         *  Test n               Test n               Test n   -|
         *  Test n               Test n               Test n    |
         *  Test n               Test n               Test n    |---> Test Block n
         *  Test n               Test n               Test n    |
         *  Test n               Test n               Test n   -|
         *  
         *  ** Note each test result is made up of a number of recorded metrics. For instance Test Block 1 in file 1
         *  would look something like (with four metrics recorded during testing lets say TP,TN,FP,FN):
         *  
         *  120   ,   6   ,   5   ,   3   -|---> Run 1 --->
         *  118   ,   7   ,   6   ,   4    |            |
         *  122   ,   8   ,   7   ,   5    |            |---> Test Block 1.
         *  130   ,   12   ,   5   ,   13    |            |
         *  100   ,   2   ,   5   ,   7   -|---> Run 5 --->
         *  
         *  The results of each test are actually described in terms of k variables (typically k=16). These variables
         *  include the true positives (TP), false positives (FP), accuracy, f-score etc. Thus to compare the results
         *  we need to do ANOVA analysis using data occurring in each of the three files. However we can only compare
         *  like with like. So in the example above we can only perform ANOVA on comparable test blocks. In
         *  which case ANOVA would be performed on Test 1 data in files 1, 2 and 3, then Test 2 data and so on.
         *  At no point would Test 1 data in any of the files be compared with Test 2 data for example.
         *  
         *  The code below does this. Using arrays we perform ANOVA on each metric
         *  in the test blocks. Clearly this makes the code below somewhat complicated to understand!
         *  I'm genuinely sorry for that, the main reason is because I may have to perform this type of 
         *  analysis many thousands of times. But I'll try to explain how it works.
         *  
         *  For each Test block, in each file, an ANOVA object is generated in code above ( in the process() method).
         *  Each ANOVA object essentially contains a matrix of the data collected in a test block. These ANOVA objects
         *  have methods that enable them to calculate the mean and sum of the values in their matrix. For instance,
         *  Test 1 involves ten runs of the same test. For each test, lets say we collect 4 pieces of data, the number of 
         *  true positives, true negatives, false positives and false negatives. An ANOVA object for Test 1 for File 1
         *  will contain a matrix of this information, and calculate the means/sums of these four variables storing them in:
         *  
         *  private double sums[];
         *   private double means[];
         *
         *  So then,
         *  
         *  sums[0] contains the sum of true positives.
         *  sums[1] contains the sum of true negatives.
         *  sums[2] contains the sum of false positives.
         *  sums[3] contains the sum of false negatives.  
         *  
         *  And likewise for the means.
         *  
         *  When the process() method terminates we have a number of ANOVA objects stored in a TreeMap structure,
         *  which groups comparable ANOVA objects by storing them in the same vector. 
         *  
         *  Here then we begin iterating through this tree map, and calculate the F-ratio for comparable ANOVA objects.
         *  This way we can calculate all the ANOVA results automatically, for every variable we have.
         */

        /*
         * ANOVA WORKED EXAMPLE (credit Wikipedia!).
         * 
         * Consider an experiment to study the effect of three different levels of a factor
         * on a response (e.g. three levels of a fertilizer on plant growth). If we had 6 observations
         * for each level, we could write the outcome of the experiment in a table like this, 
         * where a1, a2, and a3 are the three levels of the factor being studied. 
         * 
         * a1   a2   a3
         * 6   8   13
         * 8   12   9
         * 4   9   11
         * 5   11   8
         * 3   6   7
         * 4   8   12
         * 
         * The null hypothesis, denoted H0, for the overall F-test for this experiment would be that
         * all three levels of the factor produce the same response, on average. To calculate the F-ratio:
         * 
         * Step 1: Calculate the mean within each group:
         * 
         *    Y1 = ( 6 + 8 + 4 + 5 + 3 + 4 ) / 6       = 5
         *  Y2 = ( 8 + 12 + 9 + 11 + 6 + 8 ) / 6   = 9
         *  Y3 = ( 13 + 9 + 11 + 8 + 7 + 12 ) / 6   = 10
         * 
         * Step 2: Calculate the overall mean, Y:
         * 
         *    Y = (Y1 + Y2 + Y3) / 3 = 8.
         * 
         * Step 3: Calculate the "between-group" sum of squares:
         * 
         *  "between-group" sum of squares = n(Y1-Y)^2 + n(Y2-Y)^2 + n(Y3-Y)^2
         *                          = 6(5-8)^2 + 6(9-8)^2 + 6(9-8)^2
         *                          = 84
         * 
         * Step 4: The between-group degrees of freedom is one less than the number of groups.
         * 
         *   between-group degrees of freedom = a - 1
         *                             = 3-1
         *                             = 2
         *   
         * Step 5: The between-group mean square value is
         * 
         *  between-group mean square value = "between-group" sum of squares / between-group degrees of freedom
         *                          = 84/2
         *                          = 42
         *  
         * Step 6: Calculate the "within-group" sum of squares. Begin by centering the data in each group
         * 
         *       a1         a2         a3
         * 6 - 5 = 1   8 - 9 = -1   13 - 10 = 3
         * 8 - 5 = 3   12 - 9 = 3   9 - 10 = -1
         * 4 - 5 = -1   9 - 9 = 0   11 - 10 = 1
         * 5 - 5 = 0   11 - 9 = 2   8 - 10 = -2
         * 3 - 5 = -2   6 - 9 = -3   7 - 10 = -3
         * 4 - 5 = -1   8 - 9 = -1   12 - 10 = 2
         * 
         *    within-group sum of squares = 1^2 + 3^2 + (-1)^2 + 0^2 + (-2)^2 + (-1)^2 +
         *                           (-1)^2 + 3^2 + 0^2 + 2^2 + (-3)^2 + (-1)^2 +
         *                           3^2 + (-1)^2 + 1^2 + (-2)^2 + (-3)^2 + 2^2
         * 
         *                         = 1 + 9 + 1 + 0 + 4 + 1 + 1 + 9 + 0 + 4 + 9 + 1 + 9 + 1 + 1 + 4 + 9 + 4
         *                         = 68
         * 
         * Step 7: The within-group degrees of freedom is 
         *  
         *  within-group degrees of freedom = a(n-1)
         *                          = 3(6-1)
         *                          = 15
         * 
         * Step 8: Thus the within-group mean square value is,
         * 
         *  within-group mean square value = within-group sum of squares / within-group degrees of freedom
         *                          = 68 / 15
         *                          = 4.5
         * Step 9: The F-ratio is
         * 
         *  F-ratio = between-group mean square value /  within-group mean square value
         *        = 42/4.5
         *        = 9.3
         *  
         *  The critical value is the number that the test statistic must exceed to reject the test.
         *  In this case, Fcrit(2,15) = 3.68 at alpha = 0.05. Since F = 9.3 > 3.68, the results are
         *  significant at the 5% significance level. One would reject the null hypothesis, concluding
         *  that there is strong evidence that the expected values in the three groups differ. 
         *  The p-value for this test is 0.002.
         */

        /**
         * ANOVA Variables:
         * 
         * a         =   Number of distinct test groups (corresponds to number of input files).
         * 
         * n         =   Number of data items per test group (corresponds to data items in a test block).
         * 
         * overallMeans   =   An array which stores the means for each metric recorded in a test block.
         * 
         * sumSquaresBetweenGroup   =   the "between-group" sum of squares.
         * 
         * freedomBetweenGroup      =   The between-group degrees of freedom is one less than the number of groups.
         * 
         * meanSquareBetweenGroup   =   Stores the between-group mean square values.
         * 
         * sumSquaresWithinGroup   =   The within-group sum of squares is the sum of squares.
         * 
         * freedomWithinGroup      =   The within-group degrees of freedom is.
         * 
         * meanSquareWithinGroup   =   Stores the within-group mean square values.
         * 
         * F_Ratios               =   The F-ratio's.
         */

        int a = vector.size();// Number of groups.
        int n = vector.elementAt(0).getRows();// Number of data values per group.

        // Number of recorded metrics per test (number of variables).
        int metrics = vector.elementAt(0).getColumns();

        double[] overallMeans = new double[metrics];
        double[] sumSquaresBetweenGroup = new double[metrics];
        double[] meanSquareBetweenGroup = new double[metrics];
        double[] sumSquaresWithinGroup = new double[metrics];
        double[] meanSquareWithinGroup = new double[metrics];
        double[] F_Ratios = new double[metrics];

        //STEP 1. Calculate the overall means.
        for (int i = 0; i < vector.size(); i++)
            for (int j = 0; j < vector.elementAt(0).getColumns(); j++)
                overallMeans[j] += vector.elementAt(i).getMean(j);

        //STEP 2. Divide the overall means by the number of groups.
        for (int j = 0; j < overallMeans.length; j++)
            overallMeans[j] = overallMeans[j] / (double) vector.size();

        //STEP 3.  Calculate the "between-group" sum of squares:
        for (int i = 0; i < vector.size(); i++)
            for (int j = 0; j < vector.elementAt(0).getColumns(); j++)
                sumSquaresBetweenGroup[j] += (double) n
                        * (Math.pow((vector.elementAt(i).getMean(j) - overallMeans[j]), 2));

        //STEP 4: The between-group degrees of freedom
        double freedomBetweenGroup = a - 1;

        //STEP 5. between-group mean square value
        for (int i = 0; i < meanSquareBetweenGroup.length; i++)
            meanSquareBetweenGroup[i] = sumSquaresBetweenGroup[i] / freedomBetweenGroup;

        //STEP 6. Sum of centered squares (partly already calculated by ANOVA objects.
        for (int i = 0; i < vector.size(); i++)
            for (int j = 0; j < vector.elementAt(0).getColumns(); j++)
                sumSquaresWithinGroup[j] += vector.elementAt(i).getSumCentredSquares(j);

        //STEP 7.
        double freedomWithinGroup = (double) a * (n - 1);

        //STEP 8. The within-group mean square value is...
        for (int i = 0; i < meanSquareWithinGroup.length; i++)
            meanSquareWithinGroup[i] = sumSquaresWithinGroup[i] / freedomWithinGroup;

        // STEP 9. The final F-ratios are...
        for (int i = 0; i < F_Ratios.length; i++)
            F_Ratios[i] = meanSquareBetweenGroup[i] / meanSquareWithinGroup[i];

        Writer.append(outputPath, key + ",");

        for (int i = 0; i < F_Ratios.length; i++) {
            // The p-value is the probability of obtaining a test statistic,
            // at least as extreme as the one that was actually observed, 
            // assuming that the null hypothesis is true.
            FDistribution fdist = new FDistribution(freedomBetweenGroup, freedomWithinGroup);

            double pValue = (1.0 - fdist.cumulativeProbability(F_Ratios[i]));

            // headers[i]+",F-ratio,P-value,Fb,Fw,H0 (1=rejected & 0=accepted), alpha="+alpha+","
            if (pValue < alpha)
                Writer.append(outputPath, "," + F_Ratios[i] + "," + pValue + "," + freedomBetweenGroup + ","
                        + freedomWithinGroup + "," + "1,,");
            else
                Writer.append(outputPath, "," + F_Ratios[i] + "," + pValue + "," + freedomBetweenGroup + ","
                        + freedomWithinGroup + "," + "0,,");
        }

        Writer.append(outputPath, "\n");

        /**
         * TUKEY TEST
         * 
         * Now we have established the ANOVA results, that is we know the significance of the variance
         * between the individual test results. But knowing that there is a significant difference is not
         * enough. We need to know which test results were better and which were worse in order to determine
         * which algorithm performed better. To do this we need to perform the Tukey test. It performs a pair
         * wise comparison of the results so that they can be ranked.
         * 
         * The Studentized range statistic can then be calculated for any particular pair as:
         * 
         *    Q = ( ML  MS ) / sqrt( meanSquareWithinGroup / values per sample)
         * 
         *  and ML is the largest mean for a group, and MS is the smallest mean for a group.
         */

        // PAIRWISE COMPARISON
        for (int i = 0; i < vector.size(); i++) {
            for (int j = i + 1; j < vector.size(); j++) {
                // Here the comparison is performed. Remember we must do the Tukey test
                // on each metric. So we will calculate the HSD (Honestly Significant Difference)
                // multiple times.

                // For each metric
                for (int k = 0; k < vector.elementAt(i).getColumns(); k++) {
                    double mean_one = vector.elementAt(i).getMean(k);
                    double mean_two = vector.elementAt(j).getMean(k);
                    double meanSquaredWithinGroup = meanSquareWithinGroup[k];
                    double valuesPerSample = vector.elementAt(i).getRows();// All objects have same number of rows here.

                    double Q = 0;

                    // This is a string used to summarize the outcome of the test.
                    String outcome = vector.elementAt(i).getFileName() + " - "
                            + vector.elementAt(j).getFileName() + " +";

                    if (Double.compare(mean_one, mean_two) < 0) // mean_one < mean_two
                    {
                        Q = (mean_two - mean_one) / Math.sqrt(meanSquaredWithinGroup / valuesPerSample);
                        outcome = outcome.replace("-", " < ");
                    } else if (Double.compare(mean_one, mean_two) > 0) // mean_one > mean_two
                    {
                        Q = (mean_one - mean_two) / Math.sqrt(meanSquaredWithinGroup / valuesPerSample);
                        outcome = outcome.replace("-", " > ");
                    }

                    String H0Result = ""; // 1=rejected & 0=accepted
                    double QDist = getQDist(freedomWithinGroup, a, alpha);

                    if (Double.compare(Q, QDist) < 0) {
                        H0Result = "0";
                        outcome = outcome.replace("+", "H0 Accepted");
                    } else if (Double.compare(Q, QDist) > 0) {
                        H0Result = "1";
                        outcome = outcome.replace("+", "H0 Rejected");
                    } else {
                        H0Result = "-1";
                        outcome = outcome.replace("+", "H0 Accepted");
                    }

                    Writer.append(tukeyOutputPath,
                            vector.elementAt(i).getFileName() + "," + vector.elementAt(j).getFileName() + ","
                                    + key + "," + headers[k + 1] + "," + meanSquaredWithinGroup + ","
                                    + freedomWithinGroup + "," + valuesPerSample + "," + alpha + "," + Q + ","
                                    + H0Result + "," + outcome + "\n");
                }

                Writer.append(tukeyOutputPath, ",,,,\n");
            }

            Writer.append(tukeyOutputPath, ",,,,\n");
        }

        //System.out.println("\n\n");
    }
}

From source file:com.rapidminer.operator.learner.functions.linear.TTestLinearRegressionMethod.java

/**
 * Returns the PValue of the attributeIndex-th attribute that expresses the probability that the
 * coefficient is only random./*from  www . j  a  v  a 2  s . co  m*/
 *
 * @throws ProcessStoppedException
 */
protected double getPValue(double coefficient, int attributeIndex, LinearRegression regression, boolean useBias,
        double ridge, ExampleSet exampleSet, boolean[] isUsedAttribute, double[] standardDeviations,
        double labelStandardDeviation, FDistribution fdistribution, double generalCorrelation)
        throws UndefinedParameterError, ProcessStoppedException {
    double tolerance = regression.getTolerance(exampleSet, isUsedAttribute, attributeIndex, ridge, useBias);
    double standardError = Math
            .sqrt((1.0d - generalCorrelation)
                    / (tolerance * (exampleSet.size() - exampleSet.getAttributes().size() - 1.0d)))
            * labelStandardDeviation / standardDeviations[attributeIndex];

    // calculating other statistics
    double tStatistics = coefficient / standardError;
    double probability = fdistribution.cumulativeProbability(tStatistics * tStatistics);
    return probability;
}

From source file:edu.cudenver.bios.power.glmm.NonCentralityDistribution.java

/**
 * Calculate the probability P(W < w), where W follows the distribution of
 * the non-centrality parameter.
 *
 * @param w critical point for which to calculate cumulative probability
 * @return P(W < w)
 * @throws PowerException if the underlying distribution computation fails
 */
public double cdf(double w) throws PowerException {
    // Degenerate bounds: W is supported on (H0, H1].
    if (H1 <= 0 || w <= H0)
        return 0;
    if (H1 - w <= 0)
        return 1;
    ArrayList<ChiSquareTerm> chiSquareTerms = new ArrayList<ChiSquareTerm>();

    try {
        double b0 = 1 - w / H1;
        // First/second moment accumulators, split by sign of lambda.
        double m1Positive = 0;
        double m1Negative = 0;
        double m2Positive = 0;
        double m2Negative = 0;
        //
        int numPositive = 0;
        int numNegative = 0;
        double nu;     // degrees of freedom of the current chi-square term
        double delta;  // non-centrality of the current chi-square term
        double lambda; // weight of the current chi-square term
        double lastPositiveNoncentrality = 0; // for special cases
        double lastNegativeNoncentrality = 0; // for special cases

        // add in the first chi-squared term in the estimate of the non-centrality
        // (expressed as a sum of weighted chi-squared r.v.s)
        // initial chi-square term is central (delta=0) with N-qf df, and lambda = b0
        nu = N - qF;
        lambda = b0;
        delta = 0;
        chiSquareTerms.add(new ChiSquareTerm(lambda, nu, delta));
        // accumulate terms
        if (lambda > 0) {
            // positive terms
            numPositive++;
            lastPositiveNoncentrality = delta;
            m1Positive += lambda * (nu + delta);
            m2Positive += lambda * lambda * 2 * (nu + 2 * delta);
        } else if (lambda < 0) {
            // negative terms - we take absolute value of lambda where needed
            numNegative++;
            lastNegativeNoncentrality = delta;
            m1Negative += -1 * lambda * (nu + delta);
            m2Negative += lambda * lambda * 2 * (nu + 2 * delta);
        }

        // accumulate the remaining terms
        // NOTE(review): the `else` branch inside this loop is unreachable —
        // the loop condition already guarantees k < sStar. Its comment
        // ("for k = sStar+1 to a") suggests the loop bound may have been
        // intended to run to `a`; confirm against the source derivation
        // before changing the bound, since that would alter results.
        for (int k = 0; k < sStar; k++) {
            if (k < sStar) {
                // for k = 1 (well, 0 in java array terms and 1 in the paper) to sStar, chi-square term is
                // non-central (delta = mz^2), 1 df, lambda = (b0 - kth eigen value of S)
                nu = 1;
                lambda = b0 - sEigenValues[k];
                delta = mzSq.getEntry(k, 0);
                chiSquareTerms.add(new ChiSquareTerm(lambda, nu, delta));
            } else {
                // for k = sStar+1 to a, chi-sqaure term is non-central (delta = mz^2), 1 df,
                // lambda = b0
                nu = 1;
                lambda = b0;
                delta = mzSq.getEntry(k, 0);
                chiSquareTerms.add(new ChiSquareTerm(lambda, nu, delta));
            }
            // accumulate terms
            if (lambda > 0) {
                // positive terms
                numPositive++;
                lastPositiveNoncentrality = delta;
                m1Positive += lambda * (nu + delta);
                m2Positive += lambda * lambda * 2 * (nu + 2 * delta);
            } else if (lambda < 0) {
                // negative terms - we take absolute value of lambda where needed
                numNegative++;
                lastNegativeNoncentrality = delta;
                m1Negative += -1 * lambda * (nu + delta);
                m2Negative += lambda * lambda * 2 * (nu + 2 * delta);
            }
            // Note, we deliberately ignore terms for which lambda == 0
        }

        // handle special cases
        if (numNegative == 0)
            return 0;
        if (numPositive == 0)
            return 1;

        // special cases
        if (numNegative == 1 && numPositive == 1) {
            double Nstar = N - qF + a - 1;
            double Fstar = w / (Nstar * (H1 - w));
            if (lastPositiveNoncentrality >= 0 && lastNegativeNoncentrality == 0) {
                // handle special case: CGaussian = 0, s* = 1
                NonCentralFDistribution nonCentralFDist = new NonCentralFDistribution(Nstar, 1,
                        lastPositiveNoncentrality);
                return nonCentralFDist.cdf(Fstar);
            } else if (lastPositiveNoncentrality == 0 && lastNegativeNoncentrality > 0) {
                // handle special case: CGaussian = 1
                NonCentralFDistribution nonCentralFDist = new NonCentralFDistribution(1, Nstar,
                        lastNegativeNoncentrality);
                return 1 - nonCentralFDist.cdf(1 / Fstar);
            }
        }

        if (exact) {
            // Exact evaluation of the weighted sum of non-central chi-squares.
            WeightedSumOfNoncentralChiSquaresDistribution dist = new WeightedSumOfNoncentralChiSquaresDistribution(
                    chiSquareTerms, ACCURACY);
            return dist.cdf(0);
        } else {
            // handle general case - Satterthwaite approximation
            double nuStarPositive = 2 * (m1Positive * m1Positive) / m2Positive;
            double nuStarNegative = 2 * (m1Negative * m1Negative) / m2Negative;
            double lambdaStarPositive = m2Positive / (2 * m1Positive);
            double lambdaStarNegative = m2Negative / (2 * m1Negative);

            // create a central F to approximate the distribution of the non-centrality parameter
            FDistribution centralFDist = new FDistribution(nuStarPositive, nuStarNegative);
            // return power based on the non-central F
            return centralFDist.cumulativeProbability(
                    (nuStarNegative * lambdaStarNegative) / (nuStarPositive * lambdaStarPositive));
        }
    } catch (RuntimeException e) {
        LOGGER.warn("exiting cdf abnormally", e);

        throw new PowerException(e.getMessage(),
                PowerErrorEnum.DISTRIBUTION_NONCENTRALITY_PARAMETER_CDF_FAILED);
    }
}

From source file:hms.hwestra.interactionrebuttal2.InteractionRebuttal2.java

/**
 * Iteratively regresses a measured cell count on an increasing number of
 * principal components, printing per-iteration model statistics (correlation of
 * the newly added PC, R^2, and nested-model ANOVA F-test p-values), and finally
 * writes the cell count merged with all PCs for the shared samples to
 * {@code pcFile + "-mergedWCellCount.txt"}.
 *
 * @param pcFile        principal-component matrix file (samples on rows, PCs on columns)
 * @param cellcountFile cell-count matrix file (samples on rows, cell types on columns)
 * @param pheno         column name of the cell type to predict
 * @throws IOException if either input file cannot be read or the output cannot be written
 */
private void iterativelyIncreaseNumberOfPCsInCellCountPredictionModel(String pcFile, String cellcountFile,
        String pheno) throws IOException {

    DoubleMatrixDataset<String, String> pcs = new DoubleMatrixDataset<String, String>(pcFile); // samples on rows, pcs on cols
    DoubleMatrixDataset<String, String> cellcounts = new DoubleMatrixDataset<String, String>(cellcountFile); // samples on rows, celltype on cols

    Integer phenoId = cellcounts.hashCols.get(pheno);

    // Mark the PC samples that also have a cell-count measurement.
    boolean[] includeRow = new boolean[pcs.nrRows];
    int shared = 0;
    for (int i = 0; i < pcs.nrRows; i++) {
        String sample = pcs.rowObjects.get(i);
        if (cellcounts.hashRows.containsKey(sample)) {
            shared++;
            includeRow[i] = true;
        }
    }

    // Order the cell counts to match the sample order of the PC matrix.
    double[] olsY = new double[shared]; // OLS response: the cell count
    int ctr = 0;
    for (int i = 0; i < pcs.nrRows; i++) {
        String sample = pcs.rowObjects.get(i);
        Integer sampleId = cellcounts.hashRows.get(sample);
        if (sampleId != null) {
            olsY[ctr] = cellcounts.rawData[sampleId][phenoId];
            ctr++;
        }
    }

    OLSMultipleLinearRegression previousFullModel = null;

    for (int col = 0; col < pcs.nrCols; col++) {
        OLSMultipleLinearRegression regressionFullModel = new OLSMultipleLinearRegression();
        OLSMultipleLinearRegression regressionOrigModel = new OLSMultipleLinearRegression();

        int nrPcs = col + 1;
        double[][] olsX = new double[shared][nrPcs]; // full model: first nrPcs PCs
        double[][] olsXN = new double[shared][1]; // reduced model: first PC only
        for (int inc = 0; inc < nrPcs; inc++) {
            ctr = 0;
            for (int i = 0; i < pcs.nrRows; i++) {
                if (includeRow[i]) {
                    olsX[ctr][inc] = pcs.rawData[i][inc];
                    ctr++;
                }
            }
        }

        // The PC added in this iteration, for its marginal correlation with the count.
        double[] pc = new double[shared];
        ctr = 0;
        for (int i = 0; i < pcs.nrRows; i++) {
            if (includeRow[i]) {
                pc[ctr] = pcs.rawData[i][col];
                olsXN[ctr][0] = pcs.rawData[i][0];
                ctr++;
            }
        }

        double corr = JSci.maths.ArrayMath.correlation(pc, olsY);
        // NOTE(review): return value deliberately discarded — presumably primes a
        // correlation-to-Z lookup for this sample size; confirm against Correlation.
        Correlation.correlationToZScore(olsY.length);
        double z = Correlation.convertCorrelationToZScore(olsY.length, corr);
        double p = ZScores.zToP(z);

        regressionFullModel.newSampleData(olsY, olsX);
        regressionOrigModel.newSampleData(olsY, olsXN);

        double rsquared = regressionFullModel.calculateRSquared();

        double rss1 = regressionOrigModel.calculateResidualSumOfSquares();
        double rss2 = regressionFullModel.calculateResidualSumOfSquares();
        int numParams1 = 1; // regressors in the reduced (single-PC) model
        int numParams2 = nrPcs; // regressors in the full model
        if (nrPcs > 1) {

            // Nested-model F-statistic: full model vs the single-PC model.
            double F2 = ((rss1 - rss2) / (numParams2 - numParams1)) / (rss2 / (olsY.length - numParams2));

            // F-statistic: full model vs the previous iteration's model (one PC fewer).
            double rss3 = previousFullModel.calculateResidualSumOfSquares();
            int numParams3 = nrPcs - 1;
            double FPrevious = ((rss3 - rss2) / (numParams2 - numParams3))
                    / (rss2 / (olsY.length - numParams2));

            // Equivalent to R: pf(f, m1$df.residual - m2$df.residual, m2$df.residual, lower.tail = FALSE)
            // FDistribution(numeratorDegreesOfFreedom, denominatorDegreesOfFreedom)
            FDistribution fDist = new org.apache.commons.math3.distribution.FDistribution(
                    (numParams2 - numParams1), olsY.length - numParams2);
            FDistribution fDistPrev = new FDistribution((numParams2 - numParams3), olsY.length - numParams2);

            double anovaFTestP = -1; // -1 sentinel marks a failed test in the printed output
            double anovaFTestP2 = -1;
            try {
                anovaFTestP = 1 - fDist.cumulativeProbability(F2);
                // BUGFIX: FPrevious must be evaluated against fDistPrev, whose numerator
                // degrees of freedom match the previous-model comparison; the original
                // code evaluated it against fDist and left fDistPrev unused.
                anovaFTestP2 = 1 - fDistPrev.cumulativeProbability(FPrevious);
                // Clamp underflowed p-values. NOTE(review): the 1E-160 threshold vs the
                // 1E-16 floor looks inconsistent (1E-16 may have been intended) — kept as-is.
                if (anovaFTestP < 1E-160) {
                    anovaFTestP = 1E-16;
                }

                if (anovaFTestP2 < 1E-160) {
                    anovaFTestP2 = 1E-16;
                }
            } catch (Exception err) {
                // Best effort: a failed CDF evaluation leaves the -1 sentinels in place.
            }

            System.out.println(nrPcs + "\t" + corr + "\t" + z + "\t" + p + "\t" + rsquared + "\t" + numParams2
                    + "\t" + F2 + "\t" + FPrevious + "\t" + anovaFTestP + "\t" + anovaFTestP2);
        } else {
            System.out.println(nrPcs + "\t" + corr + "\t" + z + "\t" + p + "\t" + rsquared + "\t" + numParams1);
        }

        previousFullModel = regressionFullModel;

    }

    // Write the cell count plus all PCs for the shared samples to a single matrix.
    ArrayList<String> colNames = new ArrayList<String>();
    colNames.add("CellCount");
    double[][] data = new double[shared][pcs.nrCols + 1];
    for (int i = 0; i < olsY.length; i++) {
        data[i][0] = olsY[i];
    }

    ArrayList<String> rowNames = new ArrayList<String>();
    for (int col = 0; col < pcs.nrCols; col++) {
        ctr = 0;
        colNames.add(pcs.colObjects.get(col));
        for (int row = 0; row < pcs.nrRows; row++) {
            if (includeRow[row]) {
                data[ctr][col + 1] = pcs.rawData[row][col];
                ctr++;
            }

        }
    }

    for (int row = 0; row < pcs.nrRows; row++) {
        if (includeRow[row]) {
            rowNames.add("Sample_" + pcs.rowObjects.get(row));
        }
    }

    DoubleMatrixDataset<String, String> dsout = new DoubleMatrixDataset<String, String>();
    dsout.rawData = data;
    dsout.rowObjects = rowNames;
    dsout.colObjects = colNames;
    dsout.recalculateHashMaps();
    dsout.save(pcFile + "-mergedWCellCount.txt");

}

From source file:org.cirdles.calamari.algorithms.WeightedMeanCalculators.java

/**
 * Weighted linear regression of y on x with correlated uncertainties.
 * The supplied sigma/rho rows are converted to a covariance matrix whose
 * inverse provides the weights; slope, intercept, their 1-sigma errors,
 * their covariance, the MSWD, and the probability-of-fit are returned.
 *
 * @param y         dependent values
 * @param x         independent values
 * @param sigmaRhoY per-point sigma and correlation rows for the y values
 * @return the fit results (slope, intercept, uncertainties, MSWD, probability)
 */
public static WeightedLinearCorrResults weightedLinearCorr(double[] y, double[] x, double[][] sigmaRhoY) {
    WeightedLinearCorrResults fitResults = new WeightedLinearCorrResults();

    // Weight matrix = inverse of the covariance matrix built from sigma/rho.
    RealMatrix covariance = new BlockRealMatrix(convertCorrelationsToCovariances(sigmaRhoY));
    RealMatrix weightMatrix = MatrixUtils.inverse(covariance);
    int count = y.length;

    // Weighted sums feeding the normal equations.
    double sumXX = 0;
    double sumX = 0;
    double sumY = 0;
    double sumXY = 0;
    double totalWeight = 0;

    for (int row = 0; row < count; row++) {
        for (int col = 0; col < count; col++) {
            double weight = weightMatrix.getEntry(row, col);
            totalWeight += weight;
            sumX += (weight * (x[row] + x[col]));
            sumY += (weight * (y[row] + y[col]));
            sumXY += (weight * (((x[row] * y[col]) + (x[col] * y[row]))));
            sumXX += (weight * x[row] * x[col]);
        }
    }
    double slope = ((2 * sumXY * totalWeight) - (sumX * sumY)) / ((4 * sumXX * totalWeight) - (sumX * sumX));
    double intercept = (sumY - (slope * sumX)) / (2 * totalWeight);

    // Fisher information for (slope, intercept); its inverse carries the
    // parameter variances and their covariance.
    double[][] fisherEntries = { { sumXX, sumX / 2.0 }, { sumX / 2.0, totalWeight } };
    RealMatrix fischer = new BlockRealMatrix(fisherEntries);
    RealMatrix fischerInv = MatrixUtils.inverse(fischer);

    double slopeSig = StrictMath.sqrt(fischerInv.getEntry(0, 0));
    double interceptSig = StrictMath.sqrt(fischerInv.getEntry(1, 1));
    double slopeInterceptCov = fischerInv.getEntry(0, 1);

    // Weighted sum of squared residuals: r' * W * r.
    RealMatrix residuals = new BlockRealMatrix(count, 1);
    for (int row = 0; row < count; row++) {
        residuals.setEntry(row, 0, y[row] - (slope * x[row]) - intercept);
    }
    double sumSqWtdResids = residuals.transpose().multiply(weightMatrix).multiply(residuals).getEntry(0, 0);
    double mswd = sumSqWtdResids / (count - 2);

    // Probability of exceeding the observed MSWD; see
    // http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/distribution/FDistribution.html
    FDistribution fdist = new org.apache.commons.math3.distribution.FDistribution((count - 2), 1E9);
    double prob = 1.0 - fdist.cumulativeProbability(mswd);

    fitResults.setBad(false);
    fitResults.setSlope(slope);
    fitResults.setIntercept(intercept);
    fitResults.setSlopeSig(slopeSig);
    fitResults.setInterceptSig(interceptSig);
    fitResults.setSlopeInterceptCov(slopeInterceptCov);
    fitResults.setMswd(mswd);
    fitResults.setProb(prob);

    return fitResults;
}

From source file:org.cirdles.calamari.algorithms.WeightedMeanCalculators.java

/**
 * Weighted average of correlated values. The inverse of the supplied
 * variance-covariance matrix supplies the weights; the weighted mean, its
 * 1-sigma uncertainty, the MSWD, and the probability-of-fit are returned.
 * If the total weight is not positive the result is left in its default
 * ("bad") state.
 *
 * @param values values to average
 * @param varCov variance-covariance matrix of the values (i.e. SigRho = false)
 * @return the averaging results (mean, sigma, MSWD, probability)
 */
public static WtdAvCorrResults wtdAvCorr(double[] values, double[][] varCov) {
    WtdAvCorrResults results = new WtdAvCorrResults();

    int count = varCov.length;
    // Weight matrix = inverse of the covariance matrix.
    RealMatrix covariance = new BlockRealMatrix(varCov);
    RealMatrix weightMatrix = MatrixUtils.inverse(covariance);

    double weightedPairSum = 0.0;
    double totalWeight = 0.0;

    for (int row = 0; row < count; row++) {
        for (int col = 0; col < count; col++) {
            weightedPairSum += (values[row] + values[col]) * weightMatrix.getEntry(row, col);
            totalWeight += weightMatrix.getEntry(row, col);
        }
    }

    // Only proceed when the weights sum to something positive.
    if (totalWeight > 0.0) {
        // The pair sum counts each value twice, hence the division by 2.
        double meanVal = weightedPairSum / totalWeight / 2.0;
        double meanValSigma = StrictMath.sqrt(1.0 / totalWeight);

        double[][] residColumn = new double[count][1];
        for (int row = 0; row < count; row++) {
            residColumn[row][0] = values[row] - meanVal;
        }

        // Weighted sum of squared residuals: r' * W * r.
        RealMatrix residuals = new BlockRealMatrix(residColumn);
        RealMatrix sumWtdResids = residuals.transpose().multiply(weightMatrix).multiply(residuals);

        double mswd = 0.0;
        double prob = 0.0;
        if (count > 1) {
            mswd = sumWtdResids.getEntry(0, 0) / (count - 1);

            // Probability of exceeding the observed MSWD; see
            // http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/distribution/FDistribution.html
            FDistribution fdist = new org.apache.commons.math3.distribution.FDistribution((count - 1), 1E9);
            prob = 1.0 - fdist.cumulativeProbability(mswd);
        }

        results.setBad(false);
        results.setMeanVal(meanVal);
        results.setSigmaMeanVal(meanValSigma);
        results.setMswd(mswd);
        results.setProb(prob);
    }

    return results;

}