List of usage examples for org.apache.commons.math3.stat.descriptive.DescriptiveStatistics.getGeometricMean()
public double getGeometricMean()
From source file:com.facebook.presto.tests.AbstractTestQueries.java
@Test public void testTableSamplePoissonizedRescaled() throws Exception { DescriptiveStatistics stats = new DescriptiveStatistics(); long total = (long) computeExpected("SELECT COUNT(*) FROM orders", ImmutableList.of(BIGINT)) .getMaterializedRows().get(0).getField(0); for (int i = 0; i < 100; i++) { String value = (String) computeActual( "SELECT COUNT(*) FROM orders TABLESAMPLE POISSONIZED (50) RESCALED APPROXIMATE AT 95 CONFIDENCE") .getMaterializedRows().get(0).getField(0); stats.addValue(Long.parseLong(value.split(" ")[0]) * 1.0 / total); }//www .j a v a 2 s . co m double mean = stats.getGeometricMean(); assertTrue(mean > 0.90 && mean < 1.1, format("Expected sample to be rescaled to ~1.0, but was %s", mean)); assertTrue(stats.getVariance() > 0, "Samples all had the exact same size"); }
From source file:io.prestosql.tests.AbstractTestQueries.java
@Test public void testTableSampleBernoulli() { DescriptiveStatistics stats = new DescriptiveStatistics(); int total = computeExpected("SELECT orderkey FROM orders", ImmutableList.of(BIGINT)).getMaterializedRows() .size();//from w w w. ja va2 s . c o m for (int i = 0; i < 100; i++) { List<MaterializedRow> values = computeActual("SELECT orderkey FROM orders TABLESAMPLE BERNOULLI (50)") .getMaterializedRows(); assertEquals(values.size(), ImmutableSet.copyOf(values).size(), "TABLESAMPLE produced duplicate rows"); stats.addValue(values.size() * 1.0 / total); } double mean = stats.getGeometricMean(); assertTrue(mean > 0.45 && mean < 0.55, format("Expected mean sampling rate to be ~0.5, but was %s", mean)); }
From source file:org.apache.solr.client.solrj.io.eval.DescribeEvaluator.java
@Override public Object doWork(Object value) throws IOException { if (!(value instanceof List<?>)) { throw new IOException( String.format(Locale.ROOT, "Invalid expression %s - expecting a numeric list but found %s", toExpression(constructingFactory), value.getClass().getSimpleName())); }//from w w w . j ava 2 s.c o m // we know each value is a BigDecimal or a list of BigDecimals DescriptiveStatistics descriptiveStatistics = new DescriptiveStatistics(); ((List<?>) value).stream().mapToDouble(innerValue -> ((BigDecimal) innerValue).doubleValue()) .forEach(innerValue -> descriptiveStatistics.addValue(innerValue)); Map<String, Number> map = new HashMap<>(); map.put("max", descriptiveStatistics.getMax()); map.put("mean", descriptiveStatistics.getMean()); map.put("min", descriptiveStatistics.getMin()); map.put("stdev", descriptiveStatistics.getStandardDeviation()); map.put("sum", descriptiveStatistics.getSum()); map.put("N", descriptiveStatistics.getN()); map.put("var", descriptiveStatistics.getVariance()); map.put("kurtosis", descriptiveStatistics.getKurtosis()); map.put("skewness", descriptiveStatistics.getSkewness()); map.put("popVar", descriptiveStatistics.getPopulationVariance()); map.put("geometricMean", descriptiveStatistics.getGeometricMean()); map.put("sumsq", descriptiveStatistics.getSumsq()); return new Tuple(map); }
From source file:org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe.CrossDomainDupesReducer.java
/**
 * Aggregates the duplicate-content hits recorded for one host and, if the host looks
 * suspicious, emits a JSON summary for it.
 *
 * <p>Every incoming value is parsed as a JSON array of hit objects; the fields read from each
 * hit are {@code dh}, {@code length}, {@code url} and {@code ip}. Hits whose {@code dh} differs
 * from the root-domain hash derived from {@code key} are counted as cross-domain duplicates;
 * all other hits are collected into {@code thisHostDupes}. A record is emitted only when there
 * are more than 15 hits, at least 2 cross-domain duplicates, and one of the two scores
 * reaches its threshold.
 *
 * <p>Note that IP addresses are probed against the same {@code filter} instance as root-domain
 * hashes (the fingerprint object is reused with the IP stored in all three hash fields).
 *
 * @param key      the host name being reduced
 * @param values   JSON-array strings of hits for this host
 * @param output   collector receiving at most one {@code (key, json)} pair
 * @param reporter unused by this method
 * @throws IOException if collecting the output fails
 */
@Override
public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
    Reporter reporter) throws IOException {

  filter.clear();

  double crossDomainDupesCount = 0;
  double totalHitsCount = 0;
  double uniqueRootDomainsCount = 0;
  double uniqueIPs = 0;
  double validDupePatternMatches = 0;

  URLFPV2 rootFP = URLUtils.getURLFPV2FromHost(key.toString());
  URLFPV2 fp = new URLFPV2();
  int sampleCount = 0;
  ArrayList<Integer> ipAddresses = new ArrayList<Integer>();
  JsonArray thisHostsDupes = new JsonArray();
  DescriptiveStatistics lengthStats = new DescriptiveStatistics();

  while (values.hasNext()) {
    JsonArray jsonArray = parser.parse(values.next().toString()).getAsJsonArray();
    for (JsonElement elem : jsonArray) {
      totalHitsCount++;
      JsonObject hit = elem.getAsJsonObject();

      fp.setRootDomainHash(hit.get("dh").getAsLong());
      if (fp.getRootDomainHash() != rootFP.getRootDomainHash()) {
        crossDomainDupesCount++;
        fp.setDomainHash(fp.getRootDomainHash());
        fp.setUrlHash(fp.getRootDomainHash());
        // track length average ....
        lengthStats.addValue(hit.get("length").getAsInt());
        if (!filter.isPresent(fp)) {
          uniqueRootDomainsCount++;
          filter.add(fp);
          if (sampleCount < samples.length) {
            String url = hit.get("url").getAsString();
            GoogleURL urlObject = new GoogleURL(url);
            if (knownValidDupesPatterns.matcher(urlObject.getCanonicalURL()).find()) {
              validDupePatternMatches++;
            }
            samples[sampleCount++] = url;
          }
        }
      } else {
        thisHostsDupes.add(elem);
      }

      // Count distinct IPs by reusing the fingerprint object with the IP in every hash slot.
      int ipAddress = hit.get("ip").getAsInt();
      fp.setRootDomainHash(ipAddress);
      fp.setDomainHash(ipAddress);
      fp.setUrlHash(ipAddress);
      if (!filter.isPresent(fp)) {
        uniqueIPs++;
        filter.add(fp);
        ipAddresses.add(ipAddress);
      }
    }
  }

  if (totalHitsCount > 15 && crossDomainDupesCount >= 2) {
    double otherDomainToLocalScore = otherDomainToLocalDomainScore(totalHitsCount, crossDomainDupesCount);
    double spamIPScore = spamHostScore(totalHitsCount, crossDomainDupesCount, uniqueIPs);
    if (otherDomainToLocalScore >= .50 || spamIPScore > .50) {
      JsonObject objectOut = new JsonObject();

      objectOut.addProperty("ratio", (crossDomainDupesCount / totalHitsCount));
      objectOut.addProperty("totalHits", totalHitsCount);
      objectOut.addProperty("crossDomainDupes", crossDomainDupesCount);
      objectOut.addProperty("uniqueRootDomains", uniqueRootDomainsCount);
      objectOut.addProperty("otherDomainToLocalScore", otherDomainToLocalScore);
      objectOut.addProperty("spamIPScore", spamIPScore);
      objectOut.addProperty("validDupeMatches", validDupePatternMatches);
      objectOut.addProperty("content-len-mean", lengthStats.getMean());
      objectOut.addProperty("content-len-geo-mean", lengthStats.getGeometricMean());

      for (int i = 0; i < sampleCount; ++i) {
        objectOut.addProperty("sample-" + i, samples[i]);
      }

      // compute path edit distance ...
      if (sampleCount > 1) {
        int sampleEditDistanceSize = Math.min(sampleCount, 5);
        DescriptiveStatistics stats = new DescriptiveStatistics();
        for (int j = 0; j < sampleEditDistanceSize; ++j) {
          for (int k = 0; k < sampleEditDistanceSize; ++k) {
            if (k != j) {
              GoogleURL urlObjectA = new GoogleURL(samples[j]);
              GoogleURL urlObjectB = new GoogleURL(samples[k]);
              if (urlObjectA.getPath().length() < 100 && urlObjectB.getPath().length() < 100) {
                stats.addValue(StringUtils.getLevenshteinDistance(urlObjectA.getPath(), urlObjectB.getPath()));
              }
            }
          }
        }
        // When every pair was skipped (all paths >= 100 chars) the statistics are empty and
        // getMean() is NaN; NaN != 0.0 is true, so without the getN() guard NaN would be
        // written into the JSON output.
        if (stats.getN() > 0 && stats.getMean() != 0.0) {
          objectOut.addProperty("lev-distance-mean", stats.getMean());
          objectOut.addProperty("lev-distance-geomean", stats.getGeometricMean());
        }
      }

      // At most the first 1000 distinct IPs are emitted.
      JsonArray ipAddressArray = new JsonArray();
      for (int j = 0; j < Math.min(1000, ipAddresses.size()); ++j) {
        ipAddressArray.add(new JsonPrimitive(ipAddresses.get(j)));
      }
      if (ipAddresses.size() != 0) {
        objectOut.add("ipList", ipAddressArray);
      }
      objectOut.add("thisHostDupes", thisHostsDupes);

      output.collect(key, new TextBytes(objectOut.toString()));
    }
  }
}
From source file:org.deidentifier.arx.aggregates.StatisticsBuilder.java
/** * Returns summary statistics for all attributes. * /* w w w . j a va 2s .com*/ * @param listwiseDeletion A flag enabling list-wise deletion * @return */ @SuppressWarnings({ "unchecked", "rawtypes" }) public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) { // Reset stop flag interrupt.value = false; Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>(); Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>(); Map<String, DataScale> scales = new HashMap<String, DataScale>(); Map<String, GeometricMean> geomean = new HashMap<String, GeometricMean>(); // Detect scales for (int col = 0; col < handle.getNumColumns(); col++) { // Meta String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Scale DataScale scale = type.getDescription().getScale(); // Try to replace nominal scale with ordinal scale based on base data type if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) { if (!(handle.getBaseDataType(attribute) instanceof ARXString) && getHierarchy(col, true) != null) { scale = DataScale.ORDINAL; } } // Store scales.put(attribute, scale); statistics.put(attribute, new DescriptiveStatistics()); geomean.put(attribute, new GeometricMean()); ordinal.put(attribute, getSummaryStatisticsOrdinal(handle.getGeneralization(attribute), handle.getDataType(attribute), handle.getBaseDataType(attribute), getHierarchy(col, true))); } // Compute summary statistics for (int row = 0; row < handle.getNumRows(); row++) { // Check, if we should include this row boolean include = true; if (listwiseDeletion) { for (int col = 0; col < handle.getNumColumns(); col++) { if (handle.isOutlier(row) || DataType.isNull(handle.getValue(row, col))) { include = false; break; } } } // Check checkInterrupt(); // If yes, add if (include) { // For each column for (int col = 0; col < handle.getNumColumns(); col++) { // Meta 
String value = handle.getValue(row, col); String attribute = handle.getAttributeName(col); DataType<?> type = handle.getDataType(attribute); // Analyze if (!DataType.isAny(value) && !DataType.isNull(value)) { ordinal.get(attribute).addValue(value); if (type instanceof DataTypeWithRatioScale) { double doubleValue = ((DataTypeWithRatioScale) type).toDouble(type.parse(value)); statistics.get(attribute).addValue(doubleValue); geomean.get(attribute).increment(doubleValue + 1d); } } } } } // Convert Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>(); for (int col = 0; col < handle.getNumColumns(); col++) { // Check checkInterrupt(); // Depending on scale String attribute = handle.getAttributeName(col); DataScale scale = scales.get(attribute); DataType<T> type = (DataType<T>) handle.getDataType(attribute); ordinal.get(attribute).analyze(); if (scale == DataScale.NOMINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put(attribute, new StatisticsSummary<T>(DataScale.NOMINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()))); } else if (scale == DataScale.ORDINAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); result.put(attribute, new StatisticsSummary<T>(DataScale.ORDINAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()))); } else if (scale == DataScale.INTERVAL) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); boolean isPeriod = type.getDescription().getWrappedClass() == Date.class; // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? 
Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put(attribute, new StatisticsSummary<T>(DataScale.INTERVAL, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), isPeriod, true), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), isPeriod, true), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, isPeriod, false), toValue(type, stddev), stddev, toString(type, range, isPeriod, false), toValue(type, range), stats2.getMax() - stats2.getMin(), toString(type, kurtosis, isPeriod, false), toValue(type, kurtosis), kurtosis)); } else if (scale == DataScale.RATIO) { StatisticsSummaryOrdinal stats = ordinal.get(attribute); DescriptiveStatistics stats2 = statistics.get(attribute); GeometricMean geo = geomean.get(attribute); // TODO: Something is wrong with commons math's kurtosis double kurtosis = stats2.getKurtosis(); kurtosis = kurtosis < 0d ? 
Double.NaN : kurtosis; double range = stats2.getMax() - stats2.getMin(); double stddev = Math.sqrt(stats2.getVariance()); result.put(attribute, new StatisticsSummary<T>(DataScale.RATIO, stats.getNumberOfMeasures(), stats.getMode(), type.parse(stats.getMode()), stats.getMedian(), type.parse(stats.getMedian()), stats.getMin(), type.parse(stats.getMin()), stats.getMax(), type.parse(stats.getMax()), toString(type, stats2.getMean(), false, false), toValue(type, stats2.getMean()), stats2.getMean(), toString(type, stats2.getVariance(), false, false), toValue(type, stats2.getVariance()), stats2.getVariance(), toString(type, stats2.getPopulationVariance(), false, false), toValue(type, stats2.getPopulationVariance()), stats2.getPopulationVariance(), toString(type, stddev, false, false), toValue(type, stddev), stddev, toString(type, range, false, false), toValue(type, range), range, toString(type, kurtosis, false, false), toValue(type, kurtosis), kurtosis, toString(type, geo.getResult() - 1d, false, false), toValue(type, geo.getResult() - 1d), stats2.getGeometricMean())); } } return result; }
From source file:org.lightjason.agentspeak.action.buildin.math.statistic.EStatisticValue.java
/** * returns a statistic value/* ww w. ja va 2 s. com*/ * * @param p_statistic statistic object * @return statistic value */ public final double value(final DescriptiveStatistics p_statistic) { switch (this) { case GEOMETRICMEAN: return p_statistic.getGeometricMean(); case MAX: return p_statistic.getMax(); case MIN: return p_statistic.getMin(); case COUNT: return p_statistic.getN(); case POPULATIONVARIANCE: return p_statistic.getPopulationVariance(); case QUADRATICMEAN: return p_statistic.getQuadraticMean(); case STANDARDDEVIATION: return p_statistic.getStandardDeviation(); case SUM: return p_statistic.getSum(); case SUMSQUARE: return p_statistic.getSumsq(); case VARIANCE: return p_statistic.getVariance(); case MEAN: return p_statistic.getMean(); case KURTIOSIS: return p_statistic.getKurtosis(); default: throw new CIllegalStateException( org.lightjason.agentspeak.common.CCommon.languagestring(this, "unknown", this)); } }
From source file:org.lightjason.agentspeak.action.builtin.math.statistic.EStatisticValue.java
/** * returns a statistic value/*from ww w . j a va 2 s .co m*/ * * @param p_statistic statistic object * @return statistic value */ public final double value(@Nonnull final DescriptiveStatistics p_statistic) { switch (this) { case GEOMETRICMEAN: return p_statistic.getGeometricMean(); case MAX: return p_statistic.getMax(); case MIN: return p_statistic.getMin(); case COUNT: return p_statistic.getN(); case POPULATIONVARIANCE: return p_statistic.getPopulationVariance(); case QUADRATICMEAN: return p_statistic.getQuadraticMean(); case STANDARDDEVIATION: return p_statistic.getStandardDeviation(); case SUM: return p_statistic.getSum(); case SUMSQUARE: return p_statistic.getSumsq(); case VARIANCE: return p_statistic.getVariance(); case MEAN: return p_statistic.getMean(); case KURTIOSIS: return p_statistic.getKurtosis(); default: throw new CIllegalStateException( org.lightjason.agentspeak.common.CCommon.languagestring(this, "unknown", this)); } }
From source file:org.obiba.opal.web.magma.Dtos.java
@SuppressWarnings({ "OverlyLongMethod", "PMD.NcssMethodCount" }) public static Math.ContinuousSummaryDto.Builder asDto(ContinuousVariableSummary summary) { DescriptiveStatistics descriptiveStats = summary.getDescriptiveStats(); Math.DescriptiveStatsDto.Builder descriptiveBuilder = Math.DescriptiveStatsDto.newBuilder() .setN(descriptiveStats.getN()).addAllPercentiles(summary.getPercentiles()); if (isNumeric(descriptiveStats.getMin())) descriptiveBuilder.setMin(descriptiveStats.getMin()); if (isNumeric(descriptiveStats.getMax())) descriptiveBuilder.setMax(descriptiveStats.getMax()); if (isNumeric(descriptiveStats.getMean())) descriptiveBuilder.setMean(descriptiveStats.getMean()); if (isNumeric(descriptiveStats.getSum())) descriptiveBuilder.setSum(descriptiveStats.getSum()); if (isNumeric(descriptiveStats.getSumsq())) descriptiveBuilder.setSumsq(descriptiveStats.getSumsq()); if (isNumeric(descriptiveStats.getStandardDeviation())) { descriptiveBuilder.setStdDev(descriptiveStats.getStandardDeviation()); }/*from ww w . j a v a 2s . 
co m*/ if (isNumeric(descriptiveStats.getVariance())) descriptiveBuilder.setVariance(descriptiveStats.getVariance()); if (isNumeric(descriptiveStats.getSkewness())) descriptiveBuilder.setSkewness(descriptiveStats.getSkewness()); if (isNumeric(descriptiveStats.getGeometricMean())) { descriptiveBuilder.setGeometricMean(descriptiveStats.getGeometricMean()); } if (isNumeric(descriptiveStats.getKurtosis())) descriptiveBuilder.setKurtosis(descriptiveStats.getKurtosis()); double median = descriptiveStats.apply(new Median()); if (isNumeric(median)) descriptiveBuilder.setMedian(median); if (isNumeric(descriptiveStats.getVariance())) descriptiveBuilder.setVariance(descriptiveStats.getVariance()); Math.ContinuousSummaryDto.Builder continuousBuilder = Math.ContinuousSummaryDto.newBuilder() .addAllDistributionPercentiles(summary.getDistributionPercentiles()); for (IntervalFrequency.Interval interval : summary.getIntervalFrequencies()) { Math.IntervalFrequencyDto.Builder freqBuilder = Math.IntervalFrequencyDto.newBuilder() .setFreq(interval.getFreq()); if (isNumeric(interval.getLower())) freqBuilder.setLower(interval.getLower()); if (isNumeric(interval.getUpper())) freqBuilder.setUpper(interval.getUpper()); if (isNumeric(interval.getDensity())) freqBuilder.setDensity(interval.getDensity()); if (isNumeric(interval.getDensityPct())) freqBuilder.setDensityPct(interval.getDensityPct()); continuousBuilder.addIntervalFrequency(freqBuilder); } for (ContinuousVariableSummary.Frequency frequency : summary.getFrequencies()) { Math.FrequencyDto.Builder freqBuilder = Math.FrequencyDto.newBuilder() // .setValue(frequency.getValue()) // .setFreq(frequency.getFreq())// .setMissing(frequency.isMissing()); if (isNumeric(frequency.getPct())) freqBuilder.setPct(frequency.getPct()); continuousBuilder.addFrequencies(freqBuilder); } return continuousBuilder.setSummary(descriptiveBuilder); }