List of usage examples for org.apache.commons.math3.distribution FDistribution cumulativeProbability
public double cumulativeProbability(double x)
From source file:dase.timeseries.analysis.GrangerTest.java
/** * Returns p-value for Granger causality test. * * @param y/*from w ww. ja va 2 s .co m*/ * - predictable variable * @param x * - predictor * @param L * - lag, should be 1 or greater. * @return p-value of Granger causality */ public static double granger(double[] y, double[] x, int L) { OLSMultipleLinearRegression h0 = new OLSMultipleLinearRegression(); OLSMultipleLinearRegression h1 = new OLSMultipleLinearRegression(); double[][] laggedY = createLaggedSide(L, y); double[][] laggedXY = createLaggedSide(L, x, y); int n = laggedY.length; h0.newSampleData(strip(L, y), laggedY); h1.newSampleData(strip(L, y), laggedXY); double rs0[] = h0.estimateResiduals(); double rs1[] = h1.estimateResiduals(); double RSS0 = sqrSum(rs0); double RSS1 = sqrSum(rs1); double ftest = ((RSS0 - RSS1) / L) / (RSS1 / (n - 2 * L - 1)); System.out.println(RSS0 + " " + RSS1); System.out.println("F-test " + ftest); FDistribution fDist = new FDistribution(L, n - 2 * L - 1); double pValue = 1.0 - fDist.cumulativeProbability(ftest); System.out.println("P-value " + pValue); return pValue; }
From source file:com.rapidminer.operator.validation.significance.TTestSignificanceTestOperator.java
private double getProbability(PerformanceCriterion pc1, PerformanceCriterion pc2) { double totalDeviation = ((pc1.getAverageCount() - 1) * pc1.getVariance() + (pc2.getAverageCount() - 1) * pc2.getVariance()) / (pc1.getAverageCount() + pc2.getAverageCount() - 2); double factor = 1.0d / (1.0d / pc1.getAverageCount() + 1.0d / pc2.getAverageCount()); double diff = pc1.getAverage() - pc2.getAverage(); double t = factor * diff * diff / totalDeviation; int secondDegreeOfFreedom = pc1.getAverageCount() + pc2.getAverageCount() - 2; double prob;/* www .ja v a 2s.com*/ // make sure the F-distribution is well defined if (secondDegreeOfFreedom > 0) { FDistribution fDist = new FDistribution(1, secondDegreeOfFreedom); prob = 1 - fDist.cumulativeProbability(t); } else { // in this case the probability cannot calculated correctly and a 1 is returned, as // this result is not significant prob = 1; } return prob; }
From source file:gedi.util.math.stat.testing.OneWayAnova.java
/**
 * Returns the p-value of the one-way ANOVA F-test, i.e. P(F > fStatistic)
 * under an F(factorDof, errorDof) distribution.
 *
 * @return the p-value, or {@code Double.NaN} when either degrees of freedom
 *         is below 1 and the test is undefined
 */
public double getPvalue() {
    // Validate BEFORE constructing the distribution: FDistribution requires
    // strictly positive degrees of freedom and throws
    // NotStrictlyPositiveException otherwise, so the previous ordering made
    // the NaN fallback unreachable.
    if (getFactorDof() < 1 || getErrorDof() < 1)
        return Double.NaN;
    FDistribution fdist = new FDistribution(getFactorDof(), getErrorDof());
    return 1 - fdist.cumulativeProbability(getFStatistic());
}
From source file:edu.cudenver.bios.power.glmm.GLMMTest.java
/** * Fit the model// w w w .j a va2 s. c o m * @return model fit object */ public ModelFit getModelFit() { double fobs = getObservedF(GLMMTest.DistributionType.DATA_ANALYSIS_NULL); // get the p-value from a central F distribution double ndf = getNumeratorDF(GLMMTest.DistributionType.DATA_ANALYSIS_NULL); if (Double.isNaN(ndf)) { throw new IllegalArgumentException("numerator DF is NaN"); } double ddf = getDenominatorDF(GLMMTest.DistributionType.DATA_ANALYSIS_NULL); if (Double.isNaN(ddf)) { throw new IllegalArgumentException("denominator DF is NaN"); } FDistribution fdist = new FDistribution(ndf, ddf); double pvalue = 1 - fdist.cumulativeProbability(fobs); return new ModelFit(pvalue, fobs, ndf, ddf, sigmaError, beta); }
From source file:cs.man.ac.uk.stats.ComputeANOVAStats.java
/**
 * Performs a per-metric one-way ANOVA on every group of comparable test
 * results collected in {@code anovaObjects}, followed by a Tukey HSD
 * pairwise comparison, writing both result sets to CSV.
 *
 * Each entry of {@code anovaObjects} maps a test key to a vector of ANOVA
 * objects — one per input file — each holding a matrix of results for the
 * SAME test block (runs x metrics). Only comparable test blocks are
 * analysed together: Test 1 data across files is never compared with
 * Test 2 data. Each ANOVA object pre-computes per-metric sums, means and
 * centred sums of squares over its own matrix.
 *
 * @param outputPath the path to output details of the ANOVA analysis to;
 *        Tukey HSD results go to the same path with ".csv" replaced by
 *        "_HSD.csv"
 */
private static void ANOVAAnalysis(String outputPath) {
    // Output file preparation: delete stale outputs, then write CSV headers.
    Common.fileDelete(outputPath);
    String tukeyOutputPath = outputPath.replace(".csv", "_HSD.csv");
    Common.fileDelete(tukeyOutputPath);
    Writer.append(tukeyOutputPath,
            "Result 1,Result 2,Test,Metric,MSwg,DFwg,n,alpha,HSD,H0 (1=rejected & 0=accepted),Outcome\n");
    // One column group (F-ratio, p-value, DoFs, H0 verdict) per metric header.
    String[] headers = header.split(",");
    Writer.append(outputPath, headers[0] + ",");
    for (int i = 1; i < headers.length; i++)
        Writer.append(outputPath, headers[i] + ",F-ratio,P-value,Fb,Fw,H0 (1=rejected & 0=accepted), alpha="
                + alpha + ",");
    Writer.append(outputPath, "\n");

    // Perform ANOVA for each group of comparable test blocks.
    for (Map.Entry<String, Vector<ANOVA>> entry : anovaObjects.entrySet()) {
        String key = entry.getKey();
        Vector<ANOVA> vector = entry.getValue();

        // a = number of distinct test groups (input files);
        // n = data items per group (runs in a test block);
        // metrics = variables recorded per run (e.g. TP, TN, FP, FN, ...).
        int a = vector.size(); // Number of groups.
        int n = vector.elementAt(0).getRows(); // Number of data values per group.
        // Number of recorded metrics per test (number of variables).
        int metrics = vector.elementAt(0).getColumns();

        double[] overallMeans = new double[metrics];
        double[] sumSquaresBetweenGroup = new double[metrics];
        double[] meanSquareBetweenGroup = new double[metrics];
        double[] sumSquaresWithinGroup = new double[metrics];
        double[] meanSquareWithinGroup = new double[metrics];
        double[] F_Ratios = new double[metrics];

        // STEP 1: accumulate the per-group means for each metric.
        for (int i = 0; i < vector.size(); i++)
            for (int j = 0; j < vector.elementAt(0).getColumns(); j++)
                overallMeans[j] += vector.elementAt(i).getMean(j);

        // STEP 2: divide by the number of groups to get the grand means.
        for (int j = 0; j < overallMeans.length; j++)
            overallMeans[j] = overallMeans[j] / (double) vector.size();

        // STEP 3: "between-group" sum of squares: n * sum((groupMean - grandMean)^2).
        for (int i = 0; i < vector.size(); i++)
            for (int j = 0; j < vector.elementAt(0).getColumns(); j++)
                sumSquaresBetweenGroup[j] += (double) n
                        * (Math.pow((vector.elementAt(i).getMean(j) - overallMeans[j]), 2));

        // STEP 4: between-group degrees of freedom = a - 1.
        double freedomBetweenGroup = a - 1;

        // STEP 5: between-group mean square values.
        for (int i = 0; i < meanSquareBetweenGroup.length; i++)
            meanSquareBetweenGroup[i] = sumSquaresBetweenGroup[i] / freedomBetweenGroup;

        // STEP 6: within-group sum of squares (centred squares are
        // pre-computed by each ANOVA object).
        for (int i = 0; i < vector.size(); i++)
            for (int j = 0; j < vector.elementAt(0).getColumns(); j++)
                sumSquaresWithinGroup[j] += vector.elementAt(i).getSumCentredSquares(j);

        // STEP 7: within-group degrees of freedom = a * (n - 1).
        double freedomWithinGroup = (double) a * (n - 1);

        // STEP 8: within-group mean square values.
        for (int i = 0; i < meanSquareWithinGroup.length; i++)
            meanSquareWithinGroup[i] = sumSquaresWithinGroup[i] / freedomWithinGroup;

        // STEP 9: the final F-ratios.
        for (int i = 0; i < F_Ratios.length; i++)
            F_Ratios[i] = meanSquareBetweenGroup[i] / meanSquareWithinGroup[i];

        Writer.append(outputPath, key + ",");
        for (int i = 0; i < F_Ratios.length; i++) {
            // p-value = P(F > observed F-ratio) under F(dfBetween, dfWithin):
            // probability of a statistic at least this extreme under H0.
            FDistribution fdist = new FDistribution(freedomBetweenGroup, freedomWithinGroup);
            double pValue = (1.0 - fdist.cumulativeProbability(F_Ratios[i]));
            // Column layout: F-ratio,P-value,Fb,Fw,H0 (1=rejected & 0=accepted)
            if (pValue < alpha)
                Writer.append(outputPath, "," + F_Ratios[i] + "," + pValue + "," + freedomBetweenGroup + ","
                        + freedomWithinGroup + "," + "1,,");
            else
                Writer.append(outputPath, "," + F_Ratios[i] + "," + pValue + "," + freedomBetweenGroup + ","
                        + freedomWithinGroup + "," + "0,,");
        }
        Writer.append(outputPath, "\n");

        // TUKEY HSD TEST: ANOVA only says SOME difference is significant;
        // the pairwise Studentized-range comparison below ranks which result
        // pairs actually differ, per metric:
        //   Q = (ML - MS) / sqrt(meanSquareWithinGroup / valuesPerSample)
        // where ML/MS are the larger/smaller group means.
        for (int i = 0; i < vector.size(); i++) {
            for (int j = i + 1; j < vector.size(); j++) {
                // Compare each metric of group i against group j.
                for (int k = 0; k < vector.elementAt(i).getColumns(); k++) {
                    double mean_one = vector.elementAt(i).getMean(k);
                    double mean_two = vector.elementAt(j).getMean(k);
                    double meanSquaredWithinGroup = meanSquareWithinGroup[k];
                    double valuesPerSample = vector.elementAt(i).getRows(); // All objects have same number of rows here.
                    double Q = 0;
                    // Outcome template; "-" and "+" are placeholders replaced
                    // below with the comparison direction and H0 verdict.
                    String outcome = vector.elementAt(i).getFileName() + " - " + vector.elementAt(j).getFileName()
                            + " +";
                    if (Double.compare(mean_one, mean_two) < 0) // mean_one < mean_two
                    {
                        Q = (mean_two - mean_one) / Math.sqrt(meanSquaredWithinGroup / valuesPerSample);
                        outcome = outcome.replace("-", " < ");
                    } else if (Double.compare(mean_one, mean_two) > 0) // mean_one > mean_two
                    {
                        Q = (mean_one - mean_two) / Math.sqrt(meanSquaredWithinGroup / valuesPerSample);
                        outcome = outcome.replace("-", " > ");
                    }
                    String H0Result = ""; // 1=rejected & 0=accepted
                    double QDist = getQDist(freedomWithinGroup, a, alpha);
                    if (Double.compare(Q, QDist) < 0) {
                        H0Result = "0";
                        outcome = outcome.replace("+", "H0 Accepted");
                    } else if (Double.compare(Q, QDist) > 0) {
                        H0Result = "1";
                        outcome = outcome.replace("+", "H0 Rejected");
                    } else {
                        // Q exactly equals the critical value (includes the
                        // equal-means case where Q stayed 0).
                        H0Result = "-1";
                        outcome = outcome.replace("+", "H0 Accepted");
                    }
                    Writer.append(tukeyOutputPath,
                            vector.elementAt(i).getFileName() + "," + vector.elementAt(j).getFileName() + "," + key
                                    + "," + headers[k + 1] + "," + meanSquaredWithinGroup + "," + freedomWithinGroup
                                    + "," + valuesPerSample + "," + alpha + "," + Q + "," + H0Result + "," + outcome
                                    + "\n");
                }
                Writer.append(tukeyOutputPath, ",,,,\n");
            }
            Writer.append(tukeyOutputPath, ",,,,\n");
        }
    }
}
From source file:com.rapidminer.operator.learner.functions.linear.TTestLinearRegressionMethod.java
/** * Returns the PValue of the attributeIndex-th attribute that expresses the probability that the * coefficient is only random./*from www . j a v a 2 s . co m*/ * * @throws ProcessStoppedException */ protected double getPValue(double coefficient, int attributeIndex, LinearRegression regression, boolean useBias, double ridge, ExampleSet exampleSet, boolean[] isUsedAttribute, double[] standardDeviations, double labelStandardDeviation, FDistribution fdistribution, double generalCorrelation) throws UndefinedParameterError, ProcessStoppedException { double tolerance = regression.getTolerance(exampleSet, isUsedAttribute, attributeIndex, ridge, useBias); double standardError = Math .sqrt((1.0d - generalCorrelation) / (tolerance * (exampleSet.size() - exampleSet.getAttributes().size() - 1.0d))) * labelStandardDeviation / standardDeviations[attributeIndex]; // calculating other statistics double tStatistics = coefficient / standardError; double probability = fdistribution.cumulativeProbability(tStatistics * tStatistics); return probability; }
From source file:edu.cudenver.bios.power.glmm.NonCentralityDistribution.java
/**
 * Calculate the probability P(W &lt; w), where W follows the distribution of
 * the non-centrality parameter, expressed as a weighted sum of (possibly
 * non-central) chi-squared random variables. Special cases with a single
 * positive/negative weight are handled via a non-central F distribution;
 * otherwise either the exact Davies-style algorithm or a Satterthwaite
 * two-moment F approximation is used depending on {@code exact}.
 *
 * @param w critical point for which to calculate cumulative probability
 * @return P(W &lt; w)
 * @throws PowerException if any runtime error occurs during the computation
 */
public double cdf(double w) throws PowerException {
    if (H1 <= 0 || w <= H0)
        return 0;
    if (H1 - w <= 0)
        return 1;
    ArrayList<ChiSquareTerm> chiSquareTerms = new ArrayList<ChiSquareTerm>();
    try {
        double b0 = 1 - w / H1;
        // First/second moment accumulators for positive- and
        // negative-weighted terms (Satterthwaite approximation inputs).
        double m1Positive = 0;
        double m1Negative = 0;
        double m2Positive = 0;
        double m2Negative = 0;
        // Counters of positive/negative-weighted terms.
        int numPositive = 0;
        int numNegative = 0;
        double nu;
        double delta;
        double lambda;
        double lastPositiveNoncentrality = 0; // for special cases
        double lastNegativeNoncentrality = 0; // for special cases
        // Add the first chi-squared term of the weighted-sum representation:
        // central (delta = 0) with N - qF df and weight lambda = b0.
        nu = N - qF;
        lambda = b0;
        delta = 0;
        chiSquareTerms.add(new ChiSquareTerm(lambda, nu, delta));
        // Accumulate moments; terms with lambda == 0 are deliberately ignored.
        if (lambda > 0) {
            // positive terms
            numPositive++;
            lastPositiveNoncentrality = delta;
            m1Positive += lambda * (nu + delta);
            m2Positive += lambda * lambda * 2 * (nu + 2 * delta);
        } else if (lambda < 0) {
            // negative terms - we take absolute value of lambda where needed
            numNegative++;
            lastNegativeNoncentrality = delta;
            m1Negative += -1 * lambda * (nu + delta);
            m2Negative += lambda * lambda * 2 * (nu + 2 * delta);
        }
        // Accumulate the remaining terms.
        for (int k = 0; k < sStar; k++) {
            // NOTE(review): the guard below is always true inside this loop,
            // so the else branch (documented as covering k = sStar+1 to a,
            // with lambda = b0) is unreachable as written — confirm against
            // the original paper whether the loop bound should extend past
            // sStar.
            if (k < sStar) {
                // For k = 1..sStar the term is non-central (delta = mz^2),
                // 1 df, lambda = b0 - kth eigenvalue of S.
                nu = 1;
                lambda = b0 - sEigenValues[k];
                delta = mzSq.getEntry(k, 0);
                chiSquareTerms.add(new ChiSquareTerm(lambda, nu, delta));
            } else {
                // For k = sStar+1 to a: non-central (delta = mz^2), 1 df,
                // lambda = b0.
                nu = 1;
                lambda = b0;
                delta = mzSq.getEntry(k, 0);
                chiSquareTerms.add(new ChiSquareTerm(lambda, nu, delta));
            }
            // Accumulate moments for this term.
            if (lambda > 0) {
                // positive terms
                numPositive++;
                lastPositiveNoncentrality = delta;
                m1Positive += lambda * (nu + delta);
                m2Positive += lambda * lambda * 2 * (nu + 2 * delta);
            } else if (lambda < 0) {
                // negative terms - we take absolute value of lambda where needed
                numNegative++;
                lastNegativeNoncentrality = delta;
                m1Negative += -1 * lambda * (nu + delta);
                m2Negative += lambda * lambda * 2 * (nu + 2 * delta);
            }
            // Note, we deliberately ignore terms for which lambda == 0
        }
        // Degenerate cases: all-positive weights -> mass entirely above w;
        // all-negative -> entirely below.
        if (numNegative == 0)
            return 0;
        if (numPositive == 0)
            return 1;
        // Special case: exactly one positive and one negative term reduces
        // to a (non-central) F distribution.
        if (numNegative == 1 && numPositive == 1) {
            double Nstar = N - qF + a - 1;
            double Fstar = w / (Nstar * (H1 - w));
            if (lastPositiveNoncentrality >= 0 && lastNegativeNoncentrality == 0) {
                // Special case: CGaussian = 0, s* = 1.
                NonCentralFDistribution nonCentralFDist = new NonCentralFDistribution(Nstar, 1,
                        lastPositiveNoncentrality);
                return nonCentralFDist.cdf(Fstar);
            } else if (lastPositiveNoncentrality == 0 && lastNegativeNoncentrality > 0) {
                // Special case: CGaussian = 1.
                NonCentralFDistribution nonCentralFDist = new NonCentralFDistribution(1, Nstar,
                        lastNegativeNoncentrality);
                return 1 - nonCentralFDist.cdf(1 / Fstar);
            }
        }
        if (exact) {
            // Exact CDF of the weighted sum of non-central chi-squares at 0.
            WeightedSumOfNoncentralChiSquaresDistribution dist = new WeightedSumOfNoncentralChiSquaresDistribution(
                    chiSquareTerms, ACCURACY);
            return dist.cdf(0);
        } else {
            // General case - Satterthwaite approximation: match first two
            // moments of the positive and negative parts separately.
            double nuStarPositive = 2 * (m1Positive * m1Positive) / m2Positive;
            double nuStarNegative = 2 * (m1Negative * m1Negative) / m2Negative;
            double lambdaStarPositive = m2Positive / (2 * m1Positive);
            double lambdaStarNegative = m2Negative / (2 * m1Negative);
            // Central F approximating the distribution of the non-centrality
            // parameter.
            FDistribution centralFDist = new FDistribution(nuStarPositive, nuStarNegative);
            return centralFDist.cumulativeProbability(
                    (nuStarNegative * lambdaStarNegative) / (nuStarPositive * lambdaStarPositive));
        }
    } catch (RuntimeException e) {
        LOGGER.warn("exiting cdf abnormally", e);
        throw new PowerException(e.getMessage(),
                PowerErrorEnum.DISTRIBUTION_NONCENTRALITY_PARAMETER_CDF_FAILED);
    }
}
From source file:hms.hwestra.interactionrebuttal2.InteractionRebuttal2.java
/**
 * Iteratively grows an OLS model predicting a cell-count phenotype from an
 * increasing number of principal components, printing per-step fit
 * statistics (correlation, z, p, R^2, nested-model F-tests) to stdout, and
 * finally writes a merged cell-count + PC matrix next to the PC file.
 *
 * @param pcFile        file with samples on rows and PCs on columns
 *                      (assumed layout — TODO confirm against the writer)
 * @param cellcountFile file with samples on rows and cell types on columns
 * @param pheno         column name of the phenotype in the cell-count file
 * @throws IOException if either input file cannot be read or the merged
 *                     output cannot be written
 */
private void iterativelyIncreaseNumberOfPCsInCellCountPredictionModel(String pcFile, String cellcountFile,
        String pheno) throws IOException {
    DoubleMatrixDataset<String, String> pcs = new DoubleMatrixDataset<String, String>(pcFile); // samples on rows, pcs on cols?
    DoubleMatrixDataset<String, String> cellcounts = new DoubleMatrixDataset<String, String>(cellcountFile); // samples on rows, celltype on cols
    Integer phenoId = cellcounts.hashCols.get(pheno);
    // Mark PC-file rows whose sample also appears in the cell-count file.
    boolean[] includeRow = new boolean[pcs.nrRows];
    int shared = 0;
    for (int i = 0; i < pcs.nrRows; i++) {
        String sample = pcs.rowObjects.get(i);
        if (cellcounts.hashRows.containsKey(sample)) {
            shared++;
            includeRow[i] = true;
        }
    }
    // Response vector (cell counts), ordered to match the PC file's rows.
    double[] olsY = new double[shared]; // Ordinary least squares: cell count
    int ctr = 0;
    for (int i = 0; i < pcs.nrRows; i++) {
        String sample = pcs.rowObjects.get(i);
        Integer sampleId = cellcounts.hashRows.get(sample);
        if (sampleId != null) {
            olsY[ctr] = cellcounts.rawData[sampleId][phenoId];
            ctr++;
        }
    }
    org.apache.commons.math3.distribution.FDistribution fDist = null;
    // NOTE(review): randomEngine and tDistColt are declared but never used.
    cern.jet.random.tdouble.engine.DoubleRandomEngine randomEngine = null;
    cern.jet.random.tdouble.StudentT tDistColt = null;
    OLSMultipleLinearRegression previousFullModel = null;
    // Grow the model one PC at a time.
    for (int col = 0; col < pcs.nrCols; col++) {
        OLSMultipleLinearRegression regressionFullModel = new OLSMultipleLinearRegression();
        OLSMultipleLinearRegression regressionOrigModel = new OLSMultipleLinearRegression();
        int nrPcs = col + 1;
        // Full design: first nrPcs PCs; baseline design: PC1 only.
        double[][] olsX = new double[shared][nrPcs];
        double[][] olsXN = new double[shared][1];
        for (int inc = 0; inc < col + 1; inc++) {
            ctr = 0;
            for (int i = 0; i < pcs.nrRows; i++) {
                if (includeRow[i]) {
                    olsX[ctr][inc] = pcs.rawData[i][inc];
                    ctr++;
                }
            }
        }
        // Current PC (for the marginal correlation) and the baseline column.
        double[] pc = new double[shared];
        ctr = 0;
        for (int i = 0; i < pcs.nrRows; i++) {
            if (includeRow[i]) {
                pc[ctr] = pcs.rawData[i][col];
                olsXN[ctr][0] = pcs.rawData[i][0];
                ctr++;
            }
        }
        double corr = JSci.maths.ArrayMath.correlation(pc, olsY);
        // NOTE(review): return value discarded — call appears to have no
        // side effect; confirm whether this line is needed at all.
        Correlation.correlationToZScore(olsY.length);
        double z = Correlation.convertCorrelationToZScore(olsY.length, corr);
        double p = ZScores.zToP(z);
        regressionFullModel.newSampleData(olsY, olsX);
        regressionOrigModel.newSampleData(olsY, olsXN);
        // NOTE(review): rsquaredadj, rse, rsefull and F are computed but
        // never used below.
        double rsquaredadj = regressionFullModel.calculateAdjustedRSquared();
        double rsquared = regressionFullModel.calculateRSquared();
        double rse = regressionOrigModel.estimateRegressionStandardError();
        double rsefull = regressionFullModel.estimateRegressionStandardError();
        double rss1 = regressionOrigModel.calculateResidualSumOfSquares();
        double rss2 = regressionFullModel.calculateResidualSumOfSquares();
        double F = ((rss1 - rss2) / (3 - 2)) / (rss2 / (olsY.length - 3));
        int numParams1 = 1; // regressor + intercept
        int numParams2 = nrPcs; // regressors + intercept
        if (nrPcs > 1) {
            // Nested-model F-tests: full model vs PC1-only (F2) and full
            // model vs the previous iteration's model (FPrevious), as in
            // R: pf(f, df1, df2, lower.tail = FALSE).
            double F2 = ((rss1 - rss2) / (numParams2 - numParams1)) / (rss2 / (olsY.length - numParams2));
            double rss3 = previousFullModel.calculateResidualSumOfSquares();
            int numParams3 = nrPcs - 1;
            double FPrevious = ((rss3 - rss2) / (numParams2 - numParams3)) / (rss2 / (olsY.length - numParams2));
            // FDistribution(numeratorDegreesOfFreedom, denominatorDegreesOfFreedom)
            fDist = new org.apache.commons.math3.distribution.FDistribution((numParams2 - numParams1),
                    olsY.length - numParams2);
            // NOTE(review): fDistPrev is created but never used — the line
            // below evaluates FPrevious against fDist instead; likely a bug
            // when numParams2 - numParams3 != numParams2 - numParams1.
            FDistribution fDistPrev = new FDistribution((numParams2 - numParams3), olsY.length - numParams2);
            double anovaFTestP = -1;
            double anovaFTestP2 = -1;
            try {
                anovaFTestP = 1 - fDist.cumulativeProbability(F2);
                anovaFTestP2 = 1 - fDist.cumulativeProbability(FPrevious);
                // NOTE(review): threshold 1E-160 vs replacement 1E-16 looks
                // inconsistent (one of the exponents is probably a typo).
                if (anovaFTestP < 1E-160) {
                    anovaFTestP = 1E-16;
                }
                if (anovaFTestP2 < 1E-160) {
                    anovaFTestP2 = 1E-16;
                }
            } catch (Exception err) {
                // NOTE(review): exception silently swallowed; p-values remain
                // at their -1 sentinel values in that case.
            }
            System.out.println(nrPcs + "\t" + corr + "\t" + z + "\t" + p + "\t" + rsquared + "\t" + numParams2
                    + "\t" + F2 + "\t" + FPrevious + "\t" + anovaFTestP + "\t" + anovaFTestP2);
        } else {
            System.out.println(nrPcs + "\t" + corr + "\t" + z + "\t" + p + "\t" + rsquared + "\t" + numParams1);
        }
        previousFullModel = regressionFullModel;
    }
    // Assemble and save the merged dataset: column 0 = cell count, then all
    // PCs, restricted to the shared samples.
    ArrayList<String> colNames = new ArrayList<String>();
    colNames.add("CellCount");
    double[][] data = new double[shared][pcs.nrCols + 1];
    for (int i = 0; i < olsY.length; i++) {
        data[i][0] = olsY[i];
    }
    ArrayList<String> rowNames = new ArrayList<String>();
    for (int col = 0; col < pcs.nrCols; col++) {
        ctr = 0;
        colNames.add(pcs.colObjects.get(col));
        for (int row = 0; row < pcs.nrRows; row++) {
            if (includeRow[row]) {
                data[ctr][col + 1] = pcs.rawData[row][col];
                ctr++;
            }
        }
    }
    for (int row = 0; row < pcs.nrRows; row++) {
        if (includeRow[row]) {
            rowNames.add("Sample_" + pcs.rowObjects.get(row));
        }
    }
    DoubleMatrixDataset<String, String> dsout = new DoubleMatrixDataset<String, String>();
    dsout.rawData = data;
    dsout.rowObjects = rowNames;
    dsout.colObjects = colNames;
    dsout.recalculateHashMaps();
    dsout.save(pcFile + "-mergedWCellCount.txt");
}
From source file:org.cirdles.calamari.algorithms.WeightedMeanCalculators.java
public static WeightedLinearCorrResults weightedLinearCorr(double[] y, double[] x, double[][] sigmaRhoY) { WeightedLinearCorrResults weightedLinearCorrResults = new WeightedLinearCorrResults(); RealMatrix omega = new BlockRealMatrix(convertCorrelationsToCovariances(sigmaRhoY)); RealMatrix invOmega = MatrixUtils.inverse(omega); int n = y.length; double mX = 0; double pX = 0; double pY = 0; double pXY = 0; double w = 0; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { double invOm = invOmega.getEntry(i, j); w += invOm;//from w w w . jav a 2 s .c om pX += (invOm * (x[i] + x[j])); pY += (invOm * (y[i] + y[j])); pXY += (invOm * (((x[i] * y[j]) + (x[j] * y[i])))); mX += (invOm * x[i] * x[j]); } } double slope = ((2 * pXY * w) - (pX * pY)) / ((4 * mX * w) - (pX * pX)); double intercept = (pY - (slope * pX)) / (2 * w); RealMatrix fischer = new BlockRealMatrix(new double[][] { { mX, pX / 2.0 }, { pX / 2.0, w } }); RealMatrix fischerInv = MatrixUtils.inverse(fischer); double slopeSig = StrictMath.sqrt(fischerInv.getEntry(0, 0)); double interceptSig = StrictMath.sqrt(fischerInv.getEntry(1, 1)); double slopeInterceptCov = fischerInv.getEntry(0, 1); RealMatrix resid = new BlockRealMatrix(n, 1); for (int i = 0; i < n; i++) { resid.setEntry(i, 0, y[i] - (slope * x[i]) - intercept); } RealMatrix residT = resid.transpose(); RealMatrix mM = residT.multiply(invOmega).multiply(resid); double sumSqWtdResids = mM.getEntry(0, 0); double mswd = sumSqWtdResids / (n - 2); // http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/distribution/FDistribution.html FDistribution fdist = new org.apache.commons.math3.distribution.FDistribution((n - 2), 1E9); double prob = 1.0 - fdist.cumulativeProbability(mswd); weightedLinearCorrResults.setBad(false); weightedLinearCorrResults.setSlope(slope); weightedLinearCorrResults.setIntercept(intercept); weightedLinearCorrResults.setSlopeSig(slopeSig); weightedLinearCorrResults.setInterceptSig(interceptSig); 
weightedLinearCorrResults.setSlopeInterceptCov(slopeInterceptCov); weightedLinearCorrResults.setMswd(mswd); weightedLinearCorrResults.setProb(prob); return weightedLinearCorrResults; }
From source file:org.cirdles.calamari.algorithms.WeightedMeanCalculators.java
public static WtdAvCorrResults wtdAvCorr(double[] values, double[][] varCov) { // assume varCov is variance-covariance matrix (i.e. SigRho = false) WtdAvCorrResults results = new WtdAvCorrResults(); int n = varCov.length; RealMatrix omegaInv = new BlockRealMatrix(varCov); RealMatrix omega = MatrixUtils.inverse(omegaInv); double numer = 0.0; double denom = 0.0; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { numer += (values[i] + values[j]) * omega.getEntry(i, j); denom += omega.getEntry(i, j); }// w ww.j ava 2 s . c o m } // test denom if (denom > 0.0) { double meanVal = numer / denom / 2.0; double meanValSigma = StrictMath.sqrt(1.0 / denom); double[][] unwtdResidsArray = new double[n][1]; for (int i = 0; i < n; i++) { unwtdResidsArray[i][0] = values[i] - meanVal; } RealMatrix unwtdResids = new BlockRealMatrix(unwtdResidsArray); RealMatrix transUnwtdResids = unwtdResids.transpose(); RealMatrix product = transUnwtdResids.multiply(omega); RealMatrix sumWtdResids = product.multiply(unwtdResids); double mswd = 0.0; double prob = 0.0; if (n > 1) { mswd = sumWtdResids.getEntry(0, 0) / (n - 1); // http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/distribution/FDistribution.html FDistribution fdist = new org.apache.commons.math3.distribution.FDistribution((n - 1), 1E9); prob = 1.0 - fdist.cumulativeProbability(mswd); } results.setBad(false); results.setMeanVal(meanVal); results.setSigmaMeanVal(meanValSigma); results.setMswd(mswd); results.setProb(prob); } return results; }