List of usage examples for org.apache.commons.math.stat.descriptive.rank.Percentile
public Percentile()
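Before the project examples below, a minimal self-contained sketch (data values invented for illustration) of how this constructor is typically used. In Commons Math, a Percentile built with the no-argument constructor defaults to a quantile of 50.0, i.e. the median; the quantile can also be passed to the Percentile(double p) constructor:

    double[] data = { 12.0, 3.5, 7.0, 21.0, 9.5 };  // illustrative values

    Percentile median = new Percentile();       // default quantile is 50.0
    double med = median.evaluate(data);         // median of the array

    Percentile p90 = new Percentile(90.0);      // quantile set at construction
    double ninetieth = p90.evaluate(data);      // 90th percentile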
From source file: edu.cornell.med.icb.learning.PercentileScalingRowProcessor.java
private void observeStatistics(final MutableString featureId, final int featureIndex,
                               final double[] trimmedArray) {
    final Percentile lowerPercentile = new Percentile();
    lowerPercentile.setQuantile(20);
    final double min = lowerPercentile.evaluate(trimmedArray);

    final Percentile higherPercentile = new Percentile();
    higherPercentile.setQuantile(80);
    final double max = higherPercentile.evaluate(trimmedArray);

    final Percentile medianPercentile = new Percentile();
    medianPercentile.setQuantile(50);
    final double median = medianPercentile.evaluate(trimmedArray);

    final double range = max - min;
    featureIndex2ScaleMedian[featureIndex] = median;
    featureIndex2ScaleRange[featureIndex] = range;
    if (featureId != null) {
        probesetScaleMedianMap.put(featureId, median);
        probesetScaleRangeMap.put(featureId, range);
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace(String.format("training, featureIndex/columnId %d/%s lower: %f higher %f median %f ",
                featureIndex, featureId, min, max, median));
    }
}
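Note that the three Percentile instances above could be collapsed into one, since evaluate(double[] values, double p) takes the quantile per call. A sketch of that equivalent variant (not the project's actual code):

    final Percentile percentile = new Percentile();
    final double min = percentile.evaluate(trimmedArray, 20.0);
    final double max = percentile.evaluate(trimmedArray, 80.0);
    final double median = percentile.evaluate(trimmedArray, 50.0);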
From source file: edu.cornell.med.icb.goby.modes.CompactFileStatsMode.java
/**
 * Print statistics about a reads file in the Goby compact form.
 *
 * @param file The file to display statistics about
 * @throws IOException if the file cannot be read
 */
private void describeCompactReads(final File file) throws IOException {
    stream.printf("Compact reads filename = %s%n", file);

    // keep the read lengths for computing quantiles
    final DoubleArrayList readLengths = new DoubleArrayList();

    int minLength = Integer.MAX_VALUE;
    int maxLength = Integer.MIN_VALUE;
    int numberOfIdentifiers = 0;
    int numberOfDescriptions = 0;
    int numberOfSequences = 0;
    int numberOfSequencePairs = 0;
    int numberOfQualityScores = 0;
    int numberOfQualityScorePairs = 0;
    long totalReadLength = 0;
    long totalReadLengthPair = 0;
    final DistinctIntValueCounterBitSet allQueryIndices = new DistinctIntValueCounterBitSet();

    ReadsReader reader = null;
    boolean checkedForPaired = false;
    try {
        final long size = file.length();
        reader = new ReadsReader(FileUtils.openInputStream(file));
        for (final Reads.ReadEntry entry : reader) {
            final int readLength = entry.getReadLength();

            for (int i = 0; i < entry.getMetaDataCount(); i++) {
                Reads.MetaData metaData = entry.getMetaData(i);
                stream.printf("meta-data key=%s value=%s%n", metaData.getKey(), metaData.getValue());
            }

            // across this file
            allQueryIndices.observe(entry.getReadIndex());
            totalReadLength += readLength;
            totalReadLengthPair += entry.getReadLengthPair();

            // across all files
            numberOfReads++;
            numberOfDescriptions += entry.hasDescription() ? 1 : 0;
            cumulativeReadLength += readLength;

            if (verbose && entry.hasDescription()) {
                stream.println("Description found: " + entry.getDescription());
            }
            numberOfIdentifiers += entry.hasReadIdentifier() ? 1 : 0;
            if (verbose && entry.hasReadIdentifier()) {
                stream.printf("Identifier found: %s / size=%,d%n", entry.getReadIdentifier(), readLength);
            }
            numberOfSequences += entry.hasSequence() && !entry.getSequence().isEmpty() ? 1 : 0;

            final boolean samplePaired = entry.hasSequencePair() && !entry.getSequencePair().isEmpty();
            if (samplePaired) {
                numberOfSequencePairs += 1;
            }
            if (!checkedForPaired) {
                // Check only the very first entry.
                checkedForPaired = true;
                pairedSamples.add(samplePaired);
            }
            if (entry.hasQualityScores() && !entry.getQualityScores().isEmpty()) {
                numberOfQualityScores += 1;
                final int qualityLength = entry.getQualityScores().size();
                minQualityLength = Math.min(minQualityLength, qualityLength);
                maxQualityLength = Math.max(maxQualityLength, qualityLength);
            }
            numberOfQualityScorePairs += entry.hasQualityScoresPair() && !entry.getQualityScoresPair().isEmpty() ? 1 : 0;

            // we only need to keep all the read lengths if quantiles are being computed
            if (computeQuantiles) {
                readLengths.add(readLength);
            }

            minLength = Math.min(minLength, readLength);
            maxLength = Math.max(maxLength, readLength);

            // adjust the min/max length across all files
            minReadLength = Math.min(minReadLength, readLength);
            maxReadLength = Math.max(maxReadLength, readLength);
        }
        stream.printf("Average bytes per entry: %f%n", divide(size, allQueryIndices.count()));
        stream.printf("Average bytes per base: %f%n", divide(size, cumulativeReadLength));
    } finally {
        if (reader != null) {
            reader.close();
        }
    }

    final int numReadEntries = allQueryIndices.count();
    stream.printf("Has identifiers = %s (%,d) %n", numberOfIdentifiers > 0, numberOfIdentifiers);
    stream.printf("Has descriptions = %s (%,d) %n", numberOfDescriptions > 0, numberOfDescriptions);
    stream.printf("Has sequences = %s (%,d) %n", numberOfSequences > 0, numberOfSequences);
    stream.printf("Has sequencePairs = %s (%,d) %n", numberOfSequencePairs > 0, numberOfSequencePairs);
    stream.printf("Has quality scores = %s (%,d) %n", numberOfQualityScores > 0, numberOfQualityScores);
    stream.printf("Has quality score pairs = %s (%,d) %n", numberOfQualityScorePairs > 0, numberOfQualityScorePairs);
    stream.printf("Number of entries = %,d%n", numReadEntries);
    stream.printf("Min read length = %,d%n", numReadEntries > 0 ? minLength : 0);
    stream.printf("Max read length = %,d%n", numReadEntries > 0 ? maxLength : 0);
    stream.printf("Min quality length = %,d%n", numberOfQualityScores > 0 ? minQualityLength : 0);
    stream.printf("Max quality length = %,d%n", numberOfQualityScores > 0 ? maxQualityLength : 0);
    stream.printf("Avg read length = %,d%n", numReadEntries > 0 ? totalReadLength / numReadEntries : 0);
    stream.printf("Avg read pair length = %,d%n", numReadEntries > 0 ? totalReadLengthPair / numReadEntries : 0);

    // compute quantiles
    if (computeQuantiles) {
        final Percentile percentile = new Percentile();
        final double[] increasingReadLengths = readLengths.toDoubleArray();
        Arrays.sort(increasingReadLengths);
        stream.printf("Read length quantiles = [ ");
        for (int quantile = 1; quantile < numberOfQuantiles + 1; quantile++) {
            stream.printf("%,f ", percentile.evaluate(increasingReadLengths, quantile));
        }
        stream.printf("]%n");
    }
}
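Two notes on the quantile loop above. First, evaluate(values, p) interprets p as a percentage in (0, 100], and (at least in Commons Math 2.x) it sorts an internal copy of the input, so the explicit Arrays.sort is defensive rather than required. Second, the loop passes the raw index 1..numberOfQuantiles as the percentage, i.e. it reports the 1st through Nth percentiles; if evenly spaced quantiles were wanted instead, a scaled variant such as this sketch (reusing the names above) would compute them:

    final Percentile percentile = new Percentile();
    for (int q = 1; q <= numberOfQuantiles; q++) {
        final double p = 100.0 * q / (numberOfQuantiles + 1); // e.g. 25, 50, 75 for quartiles
        stream.printf("%,f ", percentile.evaluate(increasingReadLengths, p));
    }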
From source file: com.atolcd.pentaho.di.trans.steps.gisgroupby.GisGroupBy.java
/**
 * Used for junits in GroupByAggregationNullsTest
 *
 * @return
 * @throws KettleValueException
 */
Object[] getAggregateResult() throws KettleValueException {
    Object[] result = new Object[data.subjectnrs.length];
    if (data.subjectnrs != null) {
        for (int i = 0; i < data.subjectnrs.length; i++) {
            Object ag = data.agg[i];
            switch (meta.getAggregateType()[i]) {
                case GisGroupByMeta.TYPE_GROUP_SUM:
                    break;
                case GisGroupByMeta.TYPE_GROUP_AVERAGE:
                    ag = ValueDataUtil.divide(data.aggMeta.getValueMeta(i), ag,
                            new ValueMeta("c", ValueMetaInterface.TYPE_INTEGER), new Long(data.counts[i]));
                    break;
                case GisGroupByMeta.TYPE_GROUP_MEDIAN:
                case GisGroupByMeta.TYPE_GROUP_PERCENTILE:
                    double percentile = 50.0;
                    if (meta.getAggregateType()[i] == GisGroupByMeta.TYPE_GROUP_PERCENTILE) {
                        percentile = Double.parseDouble(meta.getValueField()[i]);
                    }
                    @SuppressWarnings("unchecked")
                    List<Double> valuesList = (List<Double>) data.agg[i];
                    double[] values = new double[valuesList.size()];
                    for (int v = 0; v < values.length; v++) {
                        values[v] = valuesList.get(v);
                    }
                    ag = new Percentile().evaluate(values, percentile);
                    break;
                case GisGroupByMeta.TYPE_GROUP_COUNT_ANY:
                case GisGroupByMeta.TYPE_GROUP_COUNT_ALL:
                    ag = new Long(data.counts[i]);
                    break;
                case GisGroupByMeta.TYPE_GROUP_COUNT_DISTINCT:
                    break;
                case GisGroupByMeta.TYPE_GROUP_MIN:
                    break;
                case GisGroupByMeta.TYPE_GROUP_MAX:
                    break;
                case GisGroupByMeta.TYPE_GROUP_STANDARD_DEVIATION:
                    double sum = (Double) ag / data.counts[i];
                    ag = Double.valueOf(Math.sqrt(sum));
                    break;
                case GisGroupByMeta.TYPE_GROUP_CONCAT_COMMA:
                case GisGroupByMeta.TYPE_GROUP_CONCAT_STRING:
                    ag = ((StringBuilder) ag).toString();
                    break;
                // GIS: union of the geometries
                case GisGroupByMeta.TYPE_GROUP_GEOMETRY_UNION:
                    Geometry geomUnionGroup = ((GeometryInterface) data.aggMeta.getValueMeta(i)).getGeometry(ag);
                    UnaryUnionOp unionOperateor = new UnaryUnionOp(geomUnionGroup);
                    Geometry geometryUnion = unionOperateor.union();
                    geometryUnion = GeometryUtils.getMergedGeometry(geometryUnion);
                    geometryUnion.setSRID(geometrySRID);
                    ag = geometryUnion;
                    break;
                // GIS: extent (envelope) of the geometries
                case GisGroupByMeta.TYPE_GROUP_GEOMETRY_EXTENT:
                    Geometry geomExtentGroup = ((GeometryInterface) data.aggMeta.getValueMeta(i)).getGeometry(ag);
                    Geometry geomtryExtent = geomExtentGroup.getEnvelope();
                    geomtryExtent.setSRID(geometrySRID);
                    ag = geomtryExtent;
                    break;
                // GIS: aggregation of the geometries
                case GisGroupByMeta.TYPE_GROUP_GEOMETRY_AGG:
                    Geometry geomAggGroup = ((GeometryInterface) data.aggMeta.getValueMeta(i)).getGeometry(ag);
                    Geometry geomtryAgg = geomAggGroup;
                    geomtryAgg.setSRID(geometrySRID);
                    ag = geomtryAgg;
                    break;
                default:
                    break;
            }
            /*
             * if (ag == null && allNullsAreZero) {
             *     // PDI-10250, 6960: seems all rows for the min function were nulls...
             *     // get output subject meta based on original subject meta calculation
             *     ValueMetaInterface vm = data.aggMeta.getValueMeta(i);
             *     ag = ValueDataUtil.getZeroForValueMetaType(vm);
             * }
             */
            result[i] = ag;
        }
    }
    return result;
}
From source file: org.drugepi.hdps.ZBiasCalculator.java
public static void scoreVariables(List<HdpsVariable> variableList) {
    // copy variables list
    List<HdpsVariable> expSortVariableList = new ArrayList<HdpsVariable>();
    List<HdpsVariable> outcomeSortVariableList = new ArrayList<HdpsVariable>();
    for (HdpsVariable var : variableList) {
        var.zBiasScore = 0;
        if ((var.expAssocRankingVariable != HdpsVariable.INVALID)
                && (var.outcomeAssocRankingVariable != HdpsVariable.INVALID)) {
            expSortVariableList.add(var);
            outcomeSortVariableList.add(var);
        }
    }

    // sort variables by exposure association (strongest first)
    Collections.sort(expSortVariableList, new HdpsVariableReverseExposureAssociationComparator());

    // sort variables by outcome association (weakest first)
    Collections.sort(outcomeSortVariableList, new HdpsVariableReverseOutcomeAssociationComparator());
    Collections.reverse(outcomeSortVariableList);

    // create an array of outcome strengths
    double[] outcomeStrengths = new double[outcomeSortVariableList.size()];
    for (int i = 0; i < outcomeStrengths.length; i++)
        outcomeStrengths[i] = outcomeSortVariableList.get(i).outcomeAssocRankingVariable;

    // find the median of outcome strength
    Percentile pctile = new Percentile();
    double median = pctile.evaluate(outcomeStrengths, 50.0);
    int searchCeiling = Arrays.binarySearch(outcomeStrengths, median);
    if (searchCeiling < 0)
        searchCeiling = -(searchCeiling + 1);

    // Find quintiles 1 through 5 of outcome weakness
    // AMONG the weakest half of the variables.
    // The list is sorted strongest first, so the weakest variables
    // will be at the end.
    // quintile 1 = weakest
    // don't use startsOfQuintile[0]

    // array that stores the breaks between quintiles
    int[] startsOfQuintile = new int[7];
    for (int quintile = 1; quintile <= 5; quintile++) {
        // find the probability that *begins* this quintile
        double p = (quintile - 1) * 20;
        if (p > 0) {
            double quintileStartP = pctile.evaluate(outcomeStrengths, 0, searchCeiling, p);
            startsOfQuintile[quintile] = Arrays.binarySearch(outcomeStrengths, quintileStartP);
            if (startsOfQuintile[quintile] < 0)
                startsOfQuintile[quintile] = -(startsOfQuintile[quintile] + 1);
        } else
            startsOfQuintile[quintile] = 0;
    }
    startsOfQuintile[6] = searchCeiling;

    // score the variables, BUT make quintile 5 the weakest
    for (int quintile = 1; quintile <= 5; quintile++) {
        for (int i = startsOfQuintile[quintile]; i < startsOfQuintile[quintile + 1]; i++) {
            HdpsVariable v = outcomeSortVariableList.get(i);
            v.zBiasScore = 6 - quintile;
        }
    }

    // for (HdpsVariable v : outcomeSortVariableList) {
    //     System.out.printf("%s %1.4f %d\n", v.varName, v.outcomeAssocRankingVariable, v.zBiasScore);
    // }
}
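This example exercises the sub-array overload evaluate(double[] values, int begin, int length, double p), which evaluates the percentile over length elements starting at index begin; here it restricts the quintile computation to the weakest half of the sorted strengths (indices 0 to searchCeiling). An isolated sketch of the overload with invented values:

    double[] strengths = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 };
    Percentile pctile = new Percentile();

    // 40th percentile over the first four elements only
    double p40 = pctile.evaluate(strengths, 0, 4, 40.0);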
From source file: org.pentaho.di.trans.steps.groupby.GroupBy.java
/**
 * Used for junits in GroupByAggregationNullsTest
 *
 * @return
 * @throws KettleValueException
 */
Object[] getAggregateResult() throws KettleValueException {
    Object[] result = new Object[data.subjectnrs.length];
    if (data.subjectnrs != null) {
        for (int i = 0; i < data.subjectnrs.length; i++) {
            Object ag = data.agg[i];
            switch (meta.getAggregateType()[i]) {
                case GroupByMeta.TYPE_GROUP_SUM:
                    break;
                case GroupByMeta.TYPE_GROUP_AVERAGE:
                    ag = ValueDataUtil.divide(data.aggMeta.getValueMeta(i), ag,
                            new ValueMeta("c", ValueMetaInterface.TYPE_INTEGER), new Long(data.counts[i]));
                    break;
                case GroupByMeta.TYPE_GROUP_MEDIAN:
                case GroupByMeta.TYPE_GROUP_PERCENTILE:
                    double percentile = 50.0;
                    if (meta.getAggregateType()[i] == GroupByMeta.TYPE_GROUP_PERCENTILE) {
                        percentile = Double.parseDouble(meta.getValueField()[i]);
                    }
                    @SuppressWarnings("unchecked")
                    List<Double> valuesList = (List<Double>) data.agg[i];
                    double[] values = new double[valuesList.size()];
                    for (int v = 0; v < values.length; v++) {
                        values[v] = valuesList.get(v);
                    }
                    ag = new Percentile().evaluate(values, percentile);
                    break;
                case GroupByMeta.TYPE_GROUP_COUNT_ANY:
                case GroupByMeta.TYPE_GROUP_COUNT_ALL:
                    ag = new Long(data.counts[i]);
                    break;
                case GroupByMeta.TYPE_GROUP_COUNT_DISTINCT:
                    break;
                case GroupByMeta.TYPE_GROUP_MIN:
                    break;
                case GroupByMeta.TYPE_GROUP_MAX:
                    break;
                case GroupByMeta.TYPE_GROUP_STANDARD_DEVIATION:
                    double sum = (Double) ag / data.counts[i];
                    ag = Double.valueOf(Math.sqrt(sum));
                    break;
                case GroupByMeta.TYPE_GROUP_CONCAT_COMMA:
                case GroupByMeta.TYPE_GROUP_CONCAT_STRING:
                    ag = ((StringBuilder) ag).toString();
                    break;
                default:
                    break;
            }
            if (ag == null && allNullsAreZero) {
                // PDI-10250, 6960 seems all rows for min function was nulls...
                // get output subject meta based on original subject meta calculation
                ValueMetaInterface vm = data.aggMeta.getValueMeta(i);
                ag = ValueDataUtil.getZeroForValueMetaType(vm);
            }
            result[i] = ag;
        }
    }
    return result;
}
From source file: org.pentaho.di.trans.steps.memgroupby.MemoryGroupBy.java
/**
 * Used for junits in MemoryGroupByAggregationNullsTest
 *
 * @param aggregate
 * @return
 * @throws KettleValueException
 */
Object[] getAggregateResult(Aggregate aggregate) throws KettleValueException {
    Object[] result = new Object[data.subjectnrs.length];
    if (data.subjectnrs != null) {
        for (int i = 0; i < data.subjectnrs.length; i++) {
            Object ag = aggregate.agg[i];
            switch (meta.getAggregateType()[i]) {
                case MemoryGroupByMeta.TYPE_GROUP_SUM:
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_AVERAGE:
                    ag = ValueDataUtil.divide(data.aggMeta.getValueMeta(i), ag,
                            new ValueMeta("c", ValueMetaInterface.TYPE_INTEGER), new Long(aggregate.counts[i]));
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_MEDIAN:
                case MemoryGroupByMeta.TYPE_GROUP_PERCENTILE:
                    double percentile = 50.0;
                    if (meta.getAggregateType()[i] == MemoryGroupByMeta.TYPE_GROUP_PERCENTILE) {
                        percentile = Double.parseDouble(meta.getValueField()[i]);
                    }
                    @SuppressWarnings("unchecked")
                    List<Double> valuesList = (List<Double>) aggregate.agg[i];
                    double[] values = new double[valuesList.size()];
                    for (int v = 0; v < values.length; v++) {
                        values[v] = valuesList.get(v);
                    }
                    ag = new Percentile().evaluate(values, percentile);
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_COUNT_ANY:
                case MemoryGroupByMeta.TYPE_GROUP_COUNT_ALL:
                    ag = new Long(aggregate.counts[i]);
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_COUNT_DISTINCT:
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_MIN:
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_MAX:
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_STANDARD_DEVIATION:
                    double sum = (Double) ag / aggregate.counts[i];
                    ag = Double.valueOf(Math.sqrt(sum));
                    break;
                case MemoryGroupByMeta.TYPE_GROUP_CONCAT_COMMA:
                case MemoryGroupByMeta.TYPE_GROUP_CONCAT_STRING:
                    ag = ((StringBuilder) ag).toString();
                    break;
                default:
                    break;
            }
            if (ag == null && allNullsAreZero) {
                // PDI-11530 seems all rows for min function was nulls...
                ValueMetaInterface vm = data.aggMeta.getValueMeta(i);
                ag = ValueDataUtil.getZeroForValueMetaType(vm);
            }
            result[i] = ag;
        }
    }
    return result;
}
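The three group-by examples above (GisGroupBy, GroupBy, MemoryGroupBy) share the same median/percentile idiom: accumulate the group's values in a List<Double>, copy them into a double[], and call new Percentile().evaluate(values, percentile), where percentile stays at 50.0 for the median case. Extracted as a stand-alone helper, purely as a sketch with invented names, not code from any of the projects:

    static double percentileOf(List<Double> valuesList, double percentile) {
        double[] values = new double[valuesList.size()];
        for (int v = 0; v < values.length; v++) {
            values[v] = valuesList.get(v);
        }
        // percentile = 50.0 reproduces the TYPE_GROUP_MEDIAN case
        return new Percentile().evaluate(values, percentile);
    }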