Usage examples for org.apache.commons.math.stat.descriptive.StatisticalSummary.getMean()
/**
 * Signature of the method the examples on this page call: it takes no
 * arguments and returns a {@code double}.
 * NOTE(review): presumably the arithmetic mean of the summarized values;
 * confirm against the StatisticalSummary Javadoc.
 */
double getMean();
From source file:org.datacleaner.beans.NumberAnalyzer.java
@Override public NumberAnalyzerResult getResult() { CrosstabDimension measureDimension = new CrosstabDimension(DIMENSION_MEASURE); measureDimension.addCategory(MEASURE_ROW_COUNT); measureDimension.addCategory(MEASURE_NULL_COUNT); measureDimension.addCategory(MEASURE_HIGHEST_VALUE); measureDimension.addCategory(MEASURE_LOWEST_VALUE); measureDimension.addCategory(MEASURE_SUM); measureDimension.addCategory(MEASURE_MEAN); measureDimension.addCategory(MEASURE_GEOMETRIC_MEAN); measureDimension.addCategory(MEASURE_STANDARD_DEVIATION); measureDimension.addCategory(MEASURE_VARIANCE); measureDimension.addCategory(MEASURE_SECOND_MOMENT); measureDimension.addCategory(MEASURE_SUM_OF_SQUARES); if (descriptiveStatistics) { measureDimension.addCategory(MEASURE_MEDIAN); measureDimension.addCategory(MEASURE_PERCENTILE25); measureDimension.addCategory(MEASURE_PERCENTILE75); measureDimension.addCategory(MEASURE_SKEWNESS); measureDimension.addCategory(MEASURE_KURTOSIS); }// w w w . j a v a 2 s . co m CrosstabDimension columnDimension = new CrosstabDimension(DIMENSION_COLUMN); for (InputColumn<? extends Number> column : _columns) { columnDimension.addCategory(column.getName()); } Crosstab<Number> crosstab = new Crosstab<Number>(Number.class, columnDimension, measureDimension); for (InputColumn<? 
extends Number> column : _columns) { CrosstabNavigator<Number> nav = crosstab.navigate().where(columnDimension, column.getName()); NumberAnalyzerColumnDelegate delegate = _columnDelegates.get(column); StatisticalSummary s = delegate.getStatistics(); int nullCount = delegate.getNullCount(); nav.where(measureDimension, MEASURE_NULL_COUNT).put(nullCount); if (nullCount > 0) { addAttachment(nav, delegate.getNullAnnotation(), column); } int numRows = delegate.getNumRows(); nav.where(measureDimension, MEASURE_ROW_COUNT).put(numRows); long nonNullCount = s.getN(); if (nonNullCount > 0) { final double highestValue = s.getMax(); final double lowestValue = s.getMin(); final double sum = s.getSum(); final double mean = s.getMean(); final double standardDeviation = s.getStandardDeviation(); final double variance = s.getVariance(); final double geometricMean; final double secondMoment; final double sumOfSquares; if (descriptiveStatistics) { final DescriptiveStatistics descriptiveStats = (DescriptiveStatistics) s; geometricMean = descriptiveStats.getGeometricMean(); sumOfSquares = descriptiveStats.getSumsq(); secondMoment = new SecondMoment().evaluate(descriptiveStats.getValues()); } else { final SummaryStatistics summaryStats = (SummaryStatistics) s; geometricMean = summaryStats.getGeometricMean(); secondMoment = summaryStats.getSecondMoment(); sumOfSquares = summaryStats.getSumsq(); } nav.where(measureDimension, MEASURE_HIGHEST_VALUE).put(highestValue); addAttachment(nav, delegate.getMaxAnnotation(), column); nav.where(measureDimension, MEASURE_LOWEST_VALUE).put(lowestValue); addAttachment(nav, delegate.getMinAnnotation(), column); nav.where(measureDimension, MEASURE_SUM).put(sum); nav.where(measureDimension, MEASURE_MEAN).put(mean); nav.where(measureDimension, MEASURE_GEOMETRIC_MEAN).put(geometricMean); nav.where(measureDimension, MEASURE_STANDARD_DEVIATION).put(standardDeviation); nav.where(measureDimension, MEASURE_VARIANCE).put(variance); nav.where(measureDimension, 
MEASURE_SUM_OF_SQUARES).put(sumOfSquares); nav.where(measureDimension, MEASURE_SECOND_MOMENT).put(secondMoment); if (descriptiveStatistics) { final DescriptiveStatistics descriptiveStatistics = (DescriptiveStatistics) s; final double kurtosis = descriptiveStatistics.getKurtosis(); final double skewness = descriptiveStatistics.getSkewness(); final double median = descriptiveStatistics.getPercentile(50.0); final double percentile25 = descriptiveStatistics.getPercentile(25.0); final double percentile75 = descriptiveStatistics.getPercentile(75.0); nav.where(measureDimension, MEASURE_MEDIAN).put(median); nav.where(measureDimension, MEASURE_PERCENTILE25).put(percentile25); nav.where(measureDimension, MEASURE_PERCENTILE75).put(percentile75); nav.where(measureDimension, MEASURE_SKEWNESS).put(skewness); nav.where(measureDimension, MEASURE_KURTOSIS).put(kurtosis); } } } return new NumberAnalyzerResult(_columns, crosstab); }
From source file:org.datacleaner.beans.NumberAnalyzerResultReducer.java
/**
 * Combines the per-slave values of a single measure for a single column into
 * one reduced value.
 * <p>
 * Measures listed in {@code SUM_MEASURES} are summed; the highest and lowest
 * value measures are reduced with maximum and minimum; mean, standard
 * deviation and variance are read from the summary returned by
 * {@code getSummary(column, results)}. Any other measure is logged as
 * non-reduceable.
 *
 * @param slaveValues the values reported by the individual slave results
 * @param column the column name the measure belongs to
 * @param measure the measure name being reduced
 * @param results the partial results being reduced
 * @param valueClass not used by this implementation
 * @return the reduced value, or {@code null} when the measure is not reduceable
 */
@Override
protected Serializable reduceValues(List<Object> slaveValues, String column, String measure,
        Collection<? extends NumberAnalyzerResult> results, Class<?> valueClass) {
    if (SUM_MEASURES.contains(measure)) {
        return sum(slaveValues);
    }
    if (NumberAnalyzer.MEASURE_HIGHEST_VALUE.equals(measure)) {
        return maximum(slaveValues);
    }
    if (NumberAnalyzer.MEASURE_LOWEST_VALUE.equals(measure)) {
        return minimum(slaveValues);
    }

    // The summary is looked up inside each branch on purpose, so it is only
    // built for the measures that actually need it.
    if (NumberAnalyzer.MEASURE_MEAN.equals(measure)) {
        final StatisticalSummary merged = getSummary(column, results);
        return merged.getMean();
    }
    if (NumberAnalyzer.MEASURE_STANDARD_DEVIATION.equals(measure)) {
        final StatisticalSummary merged = getSummary(column, results);
        return merged.getStandardDeviation();
    }
    if (NumberAnalyzer.MEASURE_VARIANCE.equals(measure)) {
        final StatisticalSummary merged = getSummary(column, results);
        return merged.getVariance();
    }

    logger.warn("Encountered non-reduceable measure '{}'. Slave values are: {}", measure, slaveValues);
    return null;
}
From source file:org.NooLab.math3.stat.inference.TTest.java
/** * Computes a <a href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc22.htm#formula"> * t statistic </a> to use in comparing the mean of the dataset described by * <code>sampleStats</code> to <code>mu</code>. * <p>/*from ww w.j a v a2s .co m*/ * This statistic can be used to perform a one sample t-test for the mean. * </p><p> * <strong>Preconditions</strong>: <ul> * <li><code>observed.getN() ≥ 2</code>. * </li></ul></p> * * @param mu comparison constant * @param sampleStats DescriptiveStatistics holding sample summary statitstics * @return t statistic * @throws NullArgumentException if <code>sampleStats</code> is <code>null</code> * @throws NumberIsTooSmallException if the number of samples is < 2 */ public double t(final double mu, final StatisticalSummary sampleStats) throws NullArgumentException, NumberIsTooSmallException { checkSampleData(sampleStats); return t(sampleStats.getMean(), mu, sampleStats.getVariance(), sampleStats.getN()); }
From source file:org.NooLab.math3.stat.inference.TTest.java
/** * Computes a 2-sample t statistic </a>, comparing the means of the datasets * described by two {@link StatisticalSummary} instances, without the * assumption of equal subpopulation variances. Use * {@link #homoscedasticT(StatisticalSummary, StatisticalSummary)} to * compute a t-statistic under the equal variances assumption. * <p>/*from w ww. j ava 2 s. c o m*/ * This statistic can be used to perform a two-sample t-test to compare * sample means.</p> * <p> * The returned t-statistic is</p> * <p> * <code> t = (m1 - m2) / sqrt(var1/n1 + var2/n2)</code> * </p><p> * where <strong><code>n1</code></strong> is the size of the first sample; * <strong><code> n2</code></strong> is the size of the second sample; * <strong><code> m1</code></strong> is the mean of the first sample; * <strong><code> m2</code></strong> is the mean of the second sample * <strong><code> var1</code></strong> is the variance of the first sample; * <strong><code> var2</code></strong> is the variance of the second sample * </p><p> * <strong>Preconditions</strong>: <ul> * <li>The datasets described by the two Univariates must each contain * at least 2 observations. * </li></ul></p> * * @param sampleStats1 StatisticalSummary describing data from the first sample * @param sampleStats2 StatisticalSummary describing data from the second sample * @return t statistic * @throws NullArgumentException if the sample statistics are <code>null</code> * @throws NumberIsTooSmallException if the number of samples is < 2 */ public double t(final StatisticalSummary sampleStats1, final StatisticalSummary sampleStats2) throws NullArgumentException, NumberIsTooSmallException { checkSampleData(sampleStats1); checkSampleData(sampleStats2); return t(sampleStats1.getMean(), sampleStats2.getMean(), sampleStats1.getVariance(), sampleStats2.getVariance(), sampleStats1.getN(), sampleStats2.getN()); }
From source file:org.NooLab.math3.stat.inference.TTest.java
/** * Computes a 2-sample t statistic, comparing the means of the datasets * described by two {@link StatisticalSummary} instances, under the * assumption of equal subpopulation variances. To compute a t-statistic * without the equal variances assumption, use * {@link #t(StatisticalSummary, StatisticalSummary)}. * <p>/* w ww . ja va 2s. c om*/ * This statistic can be used to perform a (homoscedastic) two-sample * t-test to compare sample means.</p> * <p> * The t-statistic returned is</p> * <p> * <code> t = (m1 - m2) / (sqrt(1/n1 +1/n2) sqrt(var))</code> * </p><p> * where <strong><code>n1</code></strong> is the size of first sample; * <strong><code> n2</code></strong> is the size of second sample; * <strong><code> m1</code></strong> is the mean of first sample; * <strong><code> m2</code></strong> is the mean of second sample * and <strong><code>var</code></strong> is the pooled variance estimate: * </p><p> * <code>var = sqrt(((n1 - 1)var1 + (n2 - 1)var2) / ((n1-1) + (n2-1)))</code> * </p><p> * with <strong><code>var1<code></strong> the variance of the first sample and * <strong><code>var2</code></strong> the variance of the second sample. * </p><p> * <strong>Preconditions</strong>: <ul> * <li>The datasets described by the two Univariates must each contain * at least 2 observations. 
* </li></ul></p> * * @param sampleStats1 StatisticalSummary describing data from the first sample * @param sampleStats2 StatisticalSummary describing data from the second sample * @return t statistic * @throws NullArgumentException if the sample statistics are <code>null</code> * @throws NumberIsTooSmallException if the number of samples is < 2 */ public double homoscedasticT(final StatisticalSummary sampleStats1, final StatisticalSummary sampleStats2) throws NullArgumentException, NumberIsTooSmallException { checkSampleData(sampleStats1); checkSampleData(sampleStats2); return homoscedasticT(sampleStats1.getMean(), sampleStats2.getMean(), sampleStats1.getVariance(), sampleStats2.getVariance(), sampleStats1.getN(), sampleStats2.getN()); }
From source file:org.NooLab.math3.stat.inference.TTest.java
/**
 * Returns the <i>observed significance level</i> (<i>p-value</i>) of a
 * one-sample, two-tailed t-test that compares the mean of the dataset
 * summarized by <code>sampleStats</code> against the constant <code>mu</code>.
 * <p>
 * The result is the smallest significance level at which the null hypothesis
 * "the mean equals <code>mu</code>" can be rejected in favor of the two-sided
 * alternative "the mean differs from <code>mu</code>". Halve the returned
 * value for a one-sided test.</p>
 * <p>
 * <strong>Usage Note:</strong><br>
 * The test is only valid under the assumptions of the parametric t-test
 * procedure, discussed
 * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html">
 * here</a>.</p>
 * <p>
 * <strong>Preconditions</strong>: <ul>
 * <li>The sample must contain at least 2 observations.
 * </li></ul></p>
 *
 * @param mu constant value to compare sample mean against
 * @param sampleStats StatisticalSummary describing sample data
 * @return p-value
 * @throws NullArgumentException if <code>sampleStats</code> is <code>null</code>
 * @throws NumberIsTooSmallException if the number of samples is &lt; 2
 * @throws MaxCountExceededException if an error occurs computing the p-value
 */
public double tTest(final double mu, final StatisticalSummary sampleStats)
        throws NullArgumentException, NumberIsTooSmallException, MaxCountExceededException {
    checkSampleData(sampleStats);

    final double sampleMean = sampleStats.getMean();
    final double sampleVariance = sampleStats.getVariance();
    final long sampleSize = sampleStats.getN();

    return tTest(sampleMean, mu, sampleVariance, sampleSize);
}
From source file:org.NooLab.math3.stat.inference.TTest.java
/**
 * Returns the <i>observed significance level</i> (<i>p-value</i>) of a
 * two-sample, two-tailed t-test that compares the means of the datasets
 * summarized by two StatisticalSummary instances.
 * <p>
 * The result is the smallest significance level at which the null hypothesis
 * "the two means are equal" can be rejected in favor of the two-sided
 * alternative "the two means differ". Halve the returned value for a
 * one-sided test.</p>
 * <p>
 * Equal population variances are not assumed; the p-value is computed with
 * approximated degrees of freedom derived from the sample data. For the
 * equal-variances variant use
 * {@link #homoscedasticTTest(StatisticalSummary, StatisticalSummary)}.</p>
 * <p>
 * <strong>Usage Note:</strong><br>
 * The p-value is only valid under the assumptions of the parametric t-test
 * procedure, discussed
 * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html">
 * here</a>.</p>
 * <p>
 * <strong>Preconditions</strong>: <ul>
 * <li>The datasets described by the two summaries must each contain
 * at least 2 observations.
 * </li></ul></p>
 *
 * @param sampleStats1 StatisticalSummary describing data from the first sample
 * @param sampleStats2 StatisticalSummary describing data from the second sample
 * @return p-value for t-test
 * @throws NullArgumentException if the sample statistics are <code>null</code>
 * @throws NumberIsTooSmallException if the number of samples is &lt; 2
 * @throws MaxCountExceededException if an error occurs computing the p-value
 */
public double tTest(final StatisticalSummary sampleStats1, final StatisticalSummary sampleStats2)
        throws NullArgumentException, NumberIsTooSmallException, MaxCountExceededException {
    checkSampleData(sampleStats1);
    checkSampleData(sampleStats2);

    final double mean1 = sampleStats1.getMean();
    final double mean2 = sampleStats2.getMean();
    final double variance1 = sampleStats1.getVariance();
    final double variance2 = sampleStats2.getVariance();
    final long size1 = sampleStats1.getN();
    final long size2 = sampleStats2.getN();

    return tTest(mean1, mean2, variance1, variance2, size1, size2);
}
From source file:org.NooLab.math3.stat.inference.TTest.java
/**
 * Returns the <i>observed significance level</i> (<i>p-value</i>) of a
 * two-sample, two-tailed t-test that compares the means of the datasets
 * summarized by two StatisticalSummary instances, under the hypothesis of
 * equal subpopulation variances. For the variant without the equal variances
 * assumption use {@link #tTest(StatisticalSummary, StatisticalSummary)}.
 * <p>
 * The result is the smallest significance level at which the null hypothesis
 * "the two means are equal" can be rejected in favor of the two-sided
 * alternative "the two means differ". Halve the returned value for a
 * one-sided test.</p>
 * <p>
 * See {@link #homoscedasticT(double[], double[])} for the formula used to
 * compute the t-statistic. The degrees of freedom are the sum of the sample
 * sizes minus 2.</p>
 * <p>
 * <strong>Usage Note:</strong><br>
 * The p-value is only valid under the assumptions of the parametric t-test
 * procedure, discussed
 * <a href="http://www.basic.nwu.edu/statguidefiles/ttest_unpaired_ass_viol.html">here</a>.
 * </p><p>
 * <strong>Preconditions</strong>: <ul>
 * <li>The datasets described by the two summaries must each contain
 * at least 2 observations.
 * </li></ul></p>
 *
 * @param sampleStats1 StatisticalSummary describing data from the first sample
 * @param sampleStats2 StatisticalSummary describing data from the second sample
 * @return p-value for t-test
 * @throws NullArgumentException if the sample statistics are <code>null</code>
 * @throws NumberIsTooSmallException if the number of samples is &lt; 2
 * @throws MaxCountExceededException if an error occurs computing the p-value
 */
public double homoscedasticTTest(final StatisticalSummary sampleStats1, final StatisticalSummary sampleStats2)
        throws NullArgumentException, NumberIsTooSmallException, MaxCountExceededException {
    checkSampleData(sampleStats1);
    checkSampleData(sampleStats2);

    final double mean1 = sampleStats1.getMean();
    final double mean2 = sampleStats2.getMean();
    final double variance1 = sampleStats1.getVariance();
    final double variance2 = sampleStats2.getVariance();
    final long size1 = sampleStats1.getN();
    final long size2 = sampleStats2.getN();

    return homoscedasticTTest(mean1, mean2, variance1, variance2, size1, size2);
}