Usage examples for org.apache.commons.math.distribution.ChiSquaredDistributionImpl — constructor
public ChiSquaredDistributionImpl(double df)
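Before the project examples, a minimal self-contained sketch (values are hypothetical, Commons Math 2.x) of what the constructor provides: a distribution object whose cumulativeProbability and inverseCumulativeProbability methods both declare MathException.

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistribution;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class ChiSquaredBasics {
    public static void main(String[] args) throws MathException {
        // Chi-squared distribution with 3 degrees of freedom (arbitrary example value)
        ChiSquaredDistribution chi = new ChiSquaredDistributionImpl(3.0);

        // P(X <= 7.815) for df = 3 is approximately 0.95
        double p = chi.cumulativeProbability(7.815);

        // The 0.95 quantile for df = 3 is approximately 7.815
        double q = chi.inverseCumulativeProbability(0.95);

        System.out.println("CDF at 7.815 = " + p + ", 95% quantile = " + q);
    }
}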
From source file:geogebra.common.kernel.statistics.AlgoChiSquaredTest.java
/**
 * @param df degrees of freedom
 * @return implementation of ChiSquaredDistribution for the given degrees of freedom
 */
ChiSquaredDistribution getChiSquaredDistribution(double df) {
    if (chisquared == null || chisquared.getDegreesOfFreedom() != df)
        chisquared = new ChiSquaredDistributionImpl(df);
    return chisquared;
}
From source file:mlflex.EvaluationMetrics.java
private double CalculateLogRankStatisticTwoGroups(SurvivalGroups survivalsList) throws Exception {
    ArrayList<Double> allDiff1 = new ArrayList<Double>();
    ArrayList<Double> allVar = new ArrayList<Double>();

    for (double time : new ArrayList<Double>(
            Lists.Sort(new ArrayList(new HashSet<Double>(survivalsList.GetObservedTimes()))))) {
        double n = Lists.GreaterThan(survivalsList.GetAllTimes(), time, true).size();
        double n1 = Lists.GreaterThan(survivalsList.GetGroup(0).GetAllTimes(), time, true).size();
        double n2 = Lists.GreaterThan(survivalsList.GetGroup(1).GetAllTimes(), time, true).size();

        double e = Lists.GetNumEqualTo(survivalsList.GetObservedTimes(), time);
        double e1 = Lists.GetNumEqualTo(survivalsList.GetGroup(0).GetObservedTimes(), time);

        double exp1 = e * (n1 / n);
        double var = n <= 1.0 ? 0.0 : (n1 * n2 * e * (n - e)) / (n * n * (n - 1));

        allDiff1.add(e1 - exp1);
        allVar.add(var);
    }

    double logRankStatistic = Math.pow(MathUtility.Sum(allDiff1), 2.0) / MathUtility.Sum(allVar);

    org.apache.commons.math.distribution.ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(
            survivalsList.Size() - 1);
    double chiSquareP = 1 - chi.cumulativeProbability(logRankStatistic);

    return chiSquareP;
}
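The closing lines show the recurring p-value idiom on this page: an upper-tail probability computed as one minus the CDF. A minimal sketch of just that step, with a hypothetical statistic and degrees of freedom:

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class ChiSquaredPValue {
    // Upper-tail p-value: P(X > statistic) for a chi-squared variable with df degrees of freedom
    static double pValue(double statistic, double df) throws MathException {
        ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(df);
        return 1.0 - chi.cumulativeProbability(statistic);
    }

    public static void main(String[] args) throws MathException {
        // Hypothetical log-rank statistic with 1 degree of freedom (two groups)
        System.out.println(pValue(3.84, 1.0)); // roughly 0.05
    }
}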
From source file:edu.utexas.cs.tactex.servercustomers.factoredcustomer.ProbabilityDistribution.java
ProbabilityDistribution(FactoredCustomerService service, Element xml) {
    if (null == randomSeedRepo)
        randomSeedRepo = (RandomSeedRepo) SpringApplicationContext.getBean("randomSeedRepo");

    type = Enum.valueOf(DistType.class, xml.getAttribute("distribution"));
    switch (type) {
    case POINTMASS:
    case DEGENERATE:
        param1 = Double.parseDouble(xml.getAttribute("value"));
        sampler = new DegenerateSampler(param1);
        break;
    case UNIFORM:
        param1 = Double.parseDouble(xml.getAttribute("low"));
        param2 = Double.parseDouble(xml.getAttribute("high"));
        sampler = new UniformSampler(param1, param2);
        break;
    case INTERVAL:
        param1 = Double.parseDouble(xml.getAttribute("mean"));
        param2 = Double.parseDouble(xml.getAttribute("stdDev"));
        param3 = Double.parseDouble(xml.getAttribute("low"));
        param4 = Double.parseDouble(xml.getAttribute("high"));
        sampler = new IntervalSampler(param1, param2, param3, param4);
        break;
    case NORMAL:
    case GAUSSIAN:
        param1 = Double.parseDouble(xml.getAttribute("mean"));
        param2 = Double.parseDouble(xml.getAttribute("stdDev"));
        sampler = new ContinuousSampler(new NormalDistributionImpl(param1, param2));
        break;
    case STDNORMAL:
        param1 = 0;
        param2 = 1;
        sampler = new ContinuousSampler(new NormalDistributionImpl(param1, param2));
        break;
    case LOGNORMAL:
        param1 = Double.parseDouble(xml.getAttribute("expMean"));
        param2 = Double.parseDouble(xml.getAttribute("expStdDev"));
        sampler = new LogNormalSampler(param1, param2);
        break;
    case CAUCHY:
        param1 = Double.parseDouble(xml.getAttribute("median"));
        param2 = Double.parseDouble(xml.getAttribute("scale"));
        sampler = new ContinuousSampler(new CauchyDistributionImpl(param1, param2));
        break;
    case BETA:
        param1 = Double.parseDouble(xml.getAttribute("alpha"));
        param2 = Double.parseDouble(xml.getAttribute("beta"));
        sampler = new ContinuousSampler(new BetaDistributionImpl(param1, param2));
        break;
    case BINOMIAL:
        param1 = Double.parseDouble(xml.getAttribute("trials"));
        param2 = Double.parseDouble(xml.getAttribute("success"));
        sampler = new DiscreteSampler(new BinomialDistributionImpl((int) param1, param2));
        break;
    case POISSON:
        param1 = Double.parseDouble(xml.getAttribute("lambda"));
        sampler = new DiscreteSampler(new PoissonDistributionImpl(param1));
        break;
    case CHISQUARED:
        param1 = Double.parseDouble(xml.getAttribute("dof"));
        sampler = new ContinuousSampler(new ChiSquaredDistributionImpl(param1));
        break;
    case EXPONENTIAL:
        param1 = Double.parseDouble(xml.getAttribute("mean"));
        sampler = new ContinuousSampler(new ExponentialDistributionImpl(param1));
        break;
    case GAMMA:
        param1 = Double.parseDouble(xml.getAttribute("alpha"));
        param2 = Double.parseDouble(xml.getAttribute("beta"));
        sampler = new ContinuousSampler(new GammaDistributionImpl(param1, param2));
        break;
    case WEIBULL:
        param1 = Double.parseDouble(xml.getAttribute("alpha"));
        param2 = Double.parseDouble(xml.getAttribute("beta"));
        sampler = new ContinuousSampler(new WeibullDistributionImpl(param1, param2));
        break;
    case STUDENT:
        param1 = Double.parseDouble(xml.getAttribute("dof"));
        sampler = new ContinuousSampler(new TDistributionImpl(param1));
        break;
    case SNEDECOR:
        param1 = Double.parseDouble(xml.getAttribute("d1"));
        param2 = Double.parseDouble(xml.getAttribute("d2"));
        sampler = new ContinuousSampler(new FDistributionImpl(param1, param2));
        break;
    default:
        throw new Error("Invalid probability distribution type!");
    }
    sampler.reseedRandomGenerator(service.getRandomSeedRepo()
            .getRandomSeed("factoredcustomer.ProbabilityDistribution", SeedIdGenerator.getId(), "Sampler")
            .getValue());
}
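Here the distribution feeds a sampler rather than a test. A minimal sketch of sampling from the distribution directly, assuming Commons Math 2.2, where AbstractContinuousDistribution (and therefore ChiSquaredDistributionImpl) provides sample() and reseedRandomGenerator(long):

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class ChiSquaredSampling {
    public static void main(String[] args) throws MathException {
        // Chi-squared with 4 degrees of freedom (arbitrary example value)
        ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(4.0);

        // Fix the seed so runs are reproducible
        chi.reseedRandomGenerator(42L);

        // Draw a few samples; their long-run mean approaches the df (here 4)
        for (int i = 0; i < 5; i++) {
            System.out.println(chi.sample());
        }
    }
}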
From source file:geogebra.kernel.statistics.AlgoDistribution.java
ChiSquaredDistribution getChiSquaredDistribution(double param) {
    if (chisquared == null) {
        chisquared = new ChiSquaredDistributionImpl(param);
    } else {
        chisquared.setDegreesOfFreedom(param);
    }
    return chisquared;
}
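Unlike the previous caching example, this variant mutates the cached instance. A minimal sketch of the same idea; note that distribution setters such as setDegreesOfFreedom were deprecated in Commons Math 2.1, which is why the check-and-reallocate variants elsewhere on this page avoid them:

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class ReuseDistribution {
    private static ChiSquaredDistributionImpl chisquared;

    // Mirrors the caching above: reuse one instance, updating its degrees of freedom
    static ChiSquaredDistributionImpl getChiSquaredDistribution(double df) {
        if (chisquared == null) {
            chisquared = new ChiSquaredDistributionImpl(df);
        } else {
            chisquared.setDegreesOfFreedom(df); // deprecated since 2.1; newer code reallocates instead
        }
        return chisquared;
    }

    public static void main(String[] args) throws MathException {
        System.out.println(getChiSquaredDistribution(2.0).cumulativeProbability(5.99)); // roughly 0.95
    }
}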
From source file:de.dfki.madm.anomalydetection.evaluator.cluster_based.CMGOSEvaluator.java
/**
 * Main algorithm.
 *
 * @throws OperatorException
 */
public double[] evaluate() throws OperatorException {
    // remove small clusters
    boolean[] removed_cluster = new boolean[this.centroids.length];
    double limit = percentage * points.length / centroids.length;
    removed_cluster = this.reassignPoints(removed_cluster, limit);

    int TotalNumberOfPoints = points.length;
    int NumberOfCluster = this.centroids.length;
    int PointDimension = this.points[0].length;

    // remove clusters with fewer points than dimensions
    removed_cluster = this.reassignPoints(removed_cluster, PointDimension);

    int[][] remove = new int[NumberOfCluster][PointDimension];

    // assign distance limit -1 for error
    double DistanceLimit = -1;
    ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(points[0].length);
    try {
        DistanceLimit = chi.inverseCumulativeProbability(this.probability);
    } catch (MathException e) {
        System.out.println(e);
    }

    /* compute anomaly score */
    double[] result = new double[TotalNumberOfPoints];

    int[] workBelongsToCluster = this.belongsToCluster.clone();
    int[] workClusterSize = this.clusterSize.clone();

    double[] DistanceLimitPerCluster = new double[NumberOfCluster];
    Arrays.fill(DistanceLimitPerCluster, DistanceLimit);

    this.CovariancematrixPerCluster = new CovarianceMatrix[NumberOfCluster];

    // in case of fastMCD make sure we don't remove any outliers and recompute;
    // sanity check from user interface
    if (this.red == METHOD_COV_MCD)
        this.removeRuns = 0;

    for (int rem = 0; rem <= this.removeRuns; rem++) {
        // Associate instances to a cluster
        double[][][] ClusterWithPointsAssociation = new double[NumberOfCluster][][];
        int[] nextId = new int[NumberOfCluster];
        for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
            ClusterWithPointsAssociation[ClusterId] = new double[workClusterSize[ClusterId]][PointDimension];
        }
        for (int PointId = 0; PointId < TotalNumberOfPoints; PointId++) {
            int ClusterId = workBelongsToCluster[PointId];
            if (ClusterId < NumberOfCluster) {
                ClusterWithPointsAssociation[ClusterId][nextId[ClusterId]] = this.points[PointId];
                nextId[ClusterId]++;
            }
        }

        // Subtract mean from all
        if (rem == 0) {
            for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
                double[] erw = new double[PointDimension];
                for (int PointId = 0; PointId < ClusterWithPointsAssociation[ClusterId].length; PointId++) {
                    for (int PointAttribute = 0; PointAttribute < ClusterWithPointsAssociation[ClusterId][PointId].length; PointAttribute++) {
                        erw[PointAttribute] += ClusterWithPointsAssociation[ClusterId][PointId][PointAttribute];
                    }
                }
                for (int j1 = 0; j1 < erw.length; j1++) {
                    erw[j1] = 1.0 / ClusterWithPointsAssociation[ClusterId].length * erw[j1];
                }
                for (int PointId = 0; PointId < ClusterWithPointsAssociation[ClusterId].length; PointId++) {
                    for (int j1 = 0; j1 < ClusterWithPointsAssociation[ClusterId][PointId].length; j1++) {
                        ClusterWithPointsAssociation[ClusterId][PointId][j1] -= erw[j1];
                    }
                }
            }
        }

        // Calculate covariance for each cluster
        for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
            if (workClusterSize[ClusterId] > 0) {
                double[][] data = null;
                // use all data instances
                if (this.red == METHOD_COV_MCD || this.cov_sampling == -1
                        || this.cov_sampling > ClusterWithPointsAssociation[ClusterId].length) {
                    data = ClusterWithPointsAssociation[ClusterId];
                }
                // sample data
                else {
                    data = new double[this.cov_sampling][ClusterWithPointsAssociation[ClusterId][0].length];
                    int i = 0;
                    for (Integer index : generator.nextIntSetWithRange(0,
                            ClusterWithPointsAssociation[ClusterId].length, this.cov_sampling)) {
                        data[i] = ClusterWithPointsAssociation[ClusterId][index];
                        i++;
                    }
                }
                // in the case of MCD, do it
                if (this.red == METHOD_COV_MCD) {
                    // we compute h from the normal probability
                    if (this.h == -1)
                        this.h = (int) Math.ceil(this.probability * (float) data.length);
                    CovariancematrixPerCluster[ClusterId] = fastMDC(data, this.h);
                }
                // Regularization and Reduction
                else {
                    if (CovariancematrixPerCluster[ClusterId] == null || rem < this.removeRuns) {
                        boolean change = false;
                        int count = 0;
                        // Reduction Method
                        if (this.red == METHOD_COV_REDUCTION) {
                            do {
                                change = false;
                                int ind = -1;
                                // look for attribute with only one value
                                for (int i = 0; i < data[0].length; i++) {
                                    change = true;
                                    ind = i;
                                    for (int j = 0; j < data.length; j++) {
                                        if (data[j][ind] != data[0][ind]) {
                                            change = false;
                                            ind = -1;
                                            break;
                                        }
                                    }
                                    if (change)
                                        break;
                                }
                                if (change) {
                                    // store which attribute to remove in which cluster
                                    remove[ClusterId][ind + count] = 1;
                                    count++;
                                    double[][] dataNew = new double[data.length][data[0].length - 1];
                                    for (int i = 0; i < data.length; i++) {
                                        System.arraycopy(data[i], 0, dataNew[i], 0, ind);
                                        System.arraycopy(data[i], ind + 1, dataNew[i], ind,
                                                data[0].length - (ind + 1));
                                    }
                                    data = dataNew;
                                }
                            } while (change);

                            // calculate new distance limit using the new number of dimensions
                            chi = new ChiSquaredDistributionImpl(data[0].length);
                            try {
                                DistanceLimitPerCluster[ClusterId] = chi
                                        .inverseCumulativeProbability(this.probability);
                            } catch (MathException e) {
                                System.out.println(e);
                            }
                        }
                        CovariancematrixPerCluster[ClusterId] = new CovarianceMatrix(data, numberOfThreads);
                    }
                }
            }
        }

        // REGULARIZATION
        // S is the summarized covariance matrix (QDA)
        double[][] S = null;
        boolean thereisone = false;
        if (this.red == METHOD_COV_REGULARIZE) {
            int id = 0;
            for (boolean b : removed_cluster) {
                if (!b) {
                    thereisone = true;
                    break;
                }
                id++;
            }
            if (!thereisone) {
                throw new OperatorException(
                        "No cluster left. This is a problem. Try not to remove small clusters or reduce number of clusters.");
            }
            S = new double[CovariancematrixPerCluster[id]
                    .getCovMat().length][CovariancematrixPerCluster[id].getCovMat()[0].length];
            for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
                if (!removed_cluster[ClusterId] && CovariancematrixPerCluster[ClusterId] != null) {
                    double[][] d = CovariancematrixPerCluster[ClusterId].getCovMat();
                    for (int i = 0; i < d.length; i++) {
                        for (int j = 0; j < d[i].length; j++) {
                            S[i][j] += d[i][j];
                        }
                    }
                }
            }
        }

        // reset point association
        if (rem == this.removeRuns) {
            workClusterSize = this.clusterSize.clone();
            workBelongsToCluster = this.belongsToCluster.clone();
        }

        for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
            if (workClusterSize[ClusterId] > 0) {
                Matrix mh = new Matrix(CovariancematrixPerCluster[ClusterId].getCovMat());
                if (this.red == METHOD_COV_REDUCTION && mh.det() == 0) {
                    CovariancematrixPerCluster[ClusterId].addMinimum();
                    mh = new Matrix(CovariancematrixPerCluster[ClusterId].getCovMat());
                } else if (this.red == METHOD_COV_REGULARIZE) {
                    Matrix mS = new Matrix(S);
                    mS = mS.times(this.regularizedLambda / this.points.length);
                    mh = mh.times((1.0 - this.regularizedLambda));
                    mh = mh.plus(mS);
                }
                // This shouldn't happen ...
                if (mh.det() == 0) {
                    CovariancematrixPerCluster[ClusterId].addMinimum();
                    mh = new Matrix(CovariancematrixPerCluster[ClusterId].getCovMat());
                }
                mh = mh.inverse();
                for (int PointId = 0; PointId < points.length; PointId++) {
                    if (workBelongsToCluster[PointId] == ClusterId) {
                        int sum = 0;
                        for (int i : remove[ClusterId])
                            sum += i;
                        double[] point = new double[points[PointId].length - sum];
                        int count = 0;
                        for (int ind = 0; ind < remove[ClusterId].length; ind++) {
                            if (remove[ClusterId][ind] == 1)
                                count++;
                            int newid = ind - count;
                            if (newid < 0)
                                continue;
                            point[newid] = this.points[PointId][newid];
                        }
                        double mahaDist;
                        if (this.red == 0)
                            mahaDist = mahalanobisDistance(point, mh);
                        else
                            mahaDist = mahalanobisDistance(this.points[PointId], mh);
                        result[PointId] = mahaDist / DistanceLimit;
                        // remove association for minimum covariance determinant
                        if (rem != this.removeRuns && mahaDist > DistanceLimitPerCluster[ClusterId]) {
                            workBelongsToCluster[PointId] = NumberOfCluster;
                            workClusterSize[ClusterId]--;
                        }
                    }
                }
            }
        }
    }
    return result;
}
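The quantile from inverseCumulativeProbability serves here as a cutoff for squared Mahalanobis distances: under a multivariate normal model they follow a chi-squared distribution with degrees of freedom equal to the data dimensionality. A minimal sketch of that thresholding step alone, with hypothetical inputs; RobustPCAOperator below uses the same quantile as a score normalizer, so normalized scores above 1 mark outliers:

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class MahalanobisCutoff {
    public static void main(String[] args) throws MathException {
        int dimensions = 2;          // hypothetical data dimensionality
        double probability = 0.975;  // hypothetical normal-probability parameter

        // Quantile of chi-squared with df = dimensionality; squared Mahalanobis
        // distances above this value are flagged as outliers
        ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(dimensions);
        double distanceLimit = chi.inverseCumulativeProbability(probability);

        double squaredMahalanobis = 9.2; // hypothetical distance for one point
        System.out.println("limit = " + distanceLimit
                + ", outlier = " + (squaredMahalanobis > distanceLimit)
                + ", normalized score = " + (squaredMahalanobis / distanceLimit));
    }
}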
From source file:geogebra.common.kernel.statistics.AlgoDistribution.java
/**
 * @param param degrees of freedom
 * @return chi-squared distribution
 */
protected ChiSquaredDistribution getChiSquaredDistribution(double param) {
    if (chisquared == null || chisquared.getDegreesOfFreedom() != param)
        chisquared = new ChiSquaredDistributionImpl(param);
    return chisquared;
}
From source file:de.dfki.madm.anomalydetection.operator.statistical_based.RobustPCAOperator.java
@Override
public void doWork() throws OperatorException {
    // check whether all attributes are numerical
    ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);
    Tools.onlyNonMissingValues(exampleSet, "PCA");
    Tools.onlyNumericalAttributes(exampleSet, "PCA");

    // Get normal probability.
    double normProb = getParameterAsDouble(PARAMETER_OUTLIER_PROBABILITY);
    int olInst = exampleSet.size() - (int) Math.floor(exampleSet.size() * normProb);
    log("Ignoring " + olInst + " anomalous instances for robustness.");

    // The robust estimate is based on removing top outliers first based on Mahalanobis distance (MD).
    // Since MD is the same as the outlier score when using all PCs, the PCA is done twice:
    // first with all examples, second with top outliers removed (robust).

    // First PCA for outlier removal
    // create covariance matrix
    Matrix covarianceMatrix = CovarianceMatrix.getCovarianceMatrix(exampleSet);

    // eigenvectors and eigenvalues of the covariance matrix
    EigenvalueDecomposition eigenvalueDecomposition = covarianceMatrix.eig();

    // create and deliver results
    double[] eigenvalues = eigenvalueDecomposition.getRealEigenvalues();
    Matrix eigenvectorMatrix = eigenvalueDecomposition.getV();
    double[][] eigenvectors = eigenvectorMatrix.getArray();

    PCAModel model = new PCAModel(exampleSet, eigenvalues, eigenvectors);

    // Perform transformation
    ExampleSet res = model.apply((ExampleSet) exampleSet.clone());

    // Compute simple list with MDs and sort according to MD.
    List<double[]> l = new LinkedList<double[]>();
    double eIdx = 0;
    for (Example example : res) {
        double md = 0.0;
        int aNr = 0;
        for (Attribute attr : example.getAttributes()) {
            double pcscore = example.getValue(attr);
            md += (pcscore * pcscore) / model.getEigenvalue(aNr);
            aNr++;
        }
        double[] x = { md, eIdx };
        l.add(x);
        eIdx++;
    }
    Collections.sort(l, new Comparator<double[]>() {
        public int compare(double[] first, double[] second) {
            return Double.compare(second[0], first[0]);
        }
    });

    // Out of the list, create array with outlier indexes and array (mapping) with good instances.
    Iterator<double[]> iter = l.iterator();
    int[] olMapping = new int[olInst];
    for (int i = 0; i < olInst; i++) {
        olMapping[i] = (int) ((double[]) iter.next())[1];
    }
    Arrays.sort(olMapping);
    int[] mapping = new int[exampleSet.size() - olInst];
    int olc = 0;
    int ctr = 0;
    for (int i = 0; i < exampleSet.size(); i++) {
        if (olc == olInst) { // Add last elements after last outlier
            mapping[ctr++] = i;
            continue;
        }
        if (olMapping[olc] != i) {
            mapping[ctr++] = i;
        } else {
            olc++;
        }
    }
    // creates a new example set without the top outliers
    ExampleSet robustExampleSet = new MappedExampleSet(exampleSet, mapping);

    // ---
    // Second PCA (robust)
    covarianceMatrix = CovarianceMatrix.getCovarianceMatrix(robustExampleSet);
    eigenvalueDecomposition = covarianceMatrix.eig();

    // create and deliver results
    eigenvalues = eigenvalueDecomposition.getRealEigenvalues();
    eigenvectorMatrix = eigenvalueDecomposition.getV();
    eigenvectors = eigenvectorMatrix.getArray();

    // Apply on original set
    model = new PCAModel(exampleSet, eigenvalues, eigenvectors);

    // Perform transformation
    res = model.apply((ExampleSet) exampleSet.clone());

    // Sort eigenvalues
    Arrays.sort(eigenvalues);
    ArrayUtils.reverse(eigenvalues);

    // if necessary reduce number of dimensions ...
    int reductionType = getParameterAsInt(PARAMETER_REDUCTION_TYPE);
    List<Integer> pcList = new ArrayList<Integer>();

    if (reductionType == PCS_ALL) {
        for (int i = 0; i < exampleSet.getAttributes().size(); i++) {
            pcList.add(i);
        }
    }
    if (reductionType == PCS_TOP || reductionType == PCS_BOTH) {
        // top
        switch (getParameterAsInt(PARAMETER_TOP_METHODS)) {
        case PCS_TOP_FIX:
            for (int i = 0; i < getParameterAsInt(PARAMETER_NUMBER_OF_COMPONENTS_TOP); i++) {
                pcList.add(i);
            }
            break;
        case PCS_TOP_VAR:
            double var = getParameterAsDouble(PARAMETER_VARIANCE_THRESHOLD);
            boolean last = false;
            for (int i = 0; i < exampleSet.getAttributes().size(); i++) {
                if (model.getCumulativeVariance(i) < var) {
                    pcList.add(i);
                } else if (!last) { // we need to add another PC to meet the minimum requirement.
                    last = true;
                    pcList.add(i);
                }
            }
            break;
        }
    }
    if (reductionType == PCS_LOWER || reductionType == PCS_BOTH) {
        // lower
        switch (getParameterAsInt(PARAMETER_LOW_METHODS)) {
        case PCS_LOW_FIX:
            for (int i = exampleSet.getAttributes().size()
                    - getParameterAsInt(PARAMETER_NUMBER_OF_COMPONENTS_LOW); i < exampleSet.getAttributes()
                            .size(); i++) {
                pcList.add(i);
            }
            break;
        case PCS_LOW_VAL:
            double val = getParameterAsDouble(PARAMETER_VALUE_THRESHOLD);
            for (int i = 0; i < eigenvalues.length; i++) {
                if (eigenvalues[i] <= val) {
                    if (pcList.size() == 0) {
                        pcList.add(i);
                    } else if (pcList.get(pcList.size() - 1).intValue() < i) {
                        pcList.add(i);
                    }
                }
            }
            break;
        }
    }
    int[] opcs = ArrayUtils.toPrimitive(pcList.toArray(new Integer[pcList.size()]));

    if (opcs.length == 0) {
        throw new UserError(this,
                "Parameter thresholds are selected such that they did not match any principal component. Lower variance or increase eigenvalue threshold.");
    }
    if (opcs.length == exampleSet.getAttributes().size()) {
        log("Using all PCs for score.");
    } else {
        log("Using following PCs for score: " + Arrays.toString(opcs));
    }

    // Normalize by chi-squared distribution with d degrees of freedom
    double scoreNormalizer = 1.0;
    ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(opcs.length);
    try {
        scoreNormalizer = chi.inverseCumulativeProbability(normProb);
    } catch (MathException e) {
        System.err.println(e);
    }
    log("Normalizing score with chi cumulative probability: " + scoreNormalizer);

    // compute scores
    Attribute scoreAttr = AttributeFactory.createAttribute("outlier", Ontology.REAL);
    exampleSet.getExampleTable().addAttribute(scoreAttr);
    exampleSet.getAttributes().setOutlier(scoreAttr);
    for (int exNr = 0; exNr < exampleSet.size(); exNr++) {
        Example orig = exampleSet.getExample(exNr);
        Example pc = res.getExample(exNr);
        double oscore = 0.0;
        int aNr = 0;
        ctr = 0;
        for (Attribute attr : pc.getAttributes()) {
            if (ctr < opcs.length && opcs[ctr] != aNr) { // we skip this dimension
                aNr++;
                continue;
            }
            double pcscore = pc.getValue(attr);
            oscore += (pcscore * pcscore) / model.getEigenvalue(aNr);
            aNr++;
            ctr++;
        }
        orig.setValue(scoreAttr, oscore / scoreNormalizer);
    }
    exampleSetOutput.deliver(exampleSet);
}
From source file:desmoj.core.statistic.Histogram.java
/**
 * Performs Pearson's chi-squared test on the given frequencies, degrees of freedom and desired
 * probability. The frequencies are given in an array that either includes under- and overflow
 * cells or not. On error the chi-squared test is not performed and false is returned; details
 * on errors are given out in the error message log. The result is true if the null hypothesis
 * cannot be rejected.
 *
 * @param values
 *            long[]: Array of assumed frequencies for each cell.
 * @param degFreedom
 *            int: Degrees of freedom of the test.
 * @param confidence
 *            double: (1-alpha) probability level.
 *
 * @return boolean : <code>true</code> if the null hypothesis cannot be rejected,
 *         <code>false</code> on error or if the null hypothesis has to be rejected.
 */
public boolean chiSquareTest(long[] values, int degFreedom, double confidence) {
    // check if number of cells is valid
    if (!((values.length == this.getCells() + 2) || (values.length == this.getCells())
            || (values.length > 1))) {
        sendWarning("Attempt to perform a Chi-squared test on an invalid number of cells! ",
                "chiSquareTest(long[], int, double) : ", "Too few or too many cells. ",
                "Make sure to have a valid number of cells "
                        + "when calling the chiSquareTest() method. ");
        return false;
    }
    // check if we have a reasonable amount of observations
    else if (this.getObservations() < 3) {
        sendWarning(
                "Attempt to perform a Chi-squared test on an insufficient data amount, "
                        + "there are less than three observations! ",
                "chiSquareTest(long[], int, double) : ", "Too few observations. ",
                "Make sure to have a sufficient amount of observations "
                        + "when calling the chiSquareTest() method. ");
        return false;
    }
    // check if we have a reasonable probability
    else if (confidence < 0 || confidence > 1) {
        sendWarning("Attempt to perform a Chi-squared test with an illegal desired probability! ",
                "chiSquareTest(long[], int, double) : ", "Illegal desired probability. ",
                "Make sure to have a valid desired probability "
                        + "when calling the chiSquareTest() method. ");
        return false;
    }
    // check if degree of freedom is valid
    else if (degFreedom <= 0 || degFreedom >= values.length) {
        sendWarning("Attempt to perform a Chi-squared test with an illegal degree of freedom! ",
                "chiSquareTest(long[], int, double) : ", "Illegal degree of freedom. ",
                "Make sure to have a valid degree of freedom "
                        + "when calling the chiSquareTest() method. ");
        return false;
    } else {
        long sumValuesObserved = 0;
        long sumValuesExpected = 0;

        // summarize expected values
        double[] expectedEntries = new double[values.length];
        for (double val : values) {
            if (val == 0) { // exit on 0 to avoid zero division
                sendWarning("Attempt to perform a Chi-squared test with an expected value of 0! ",
                        "chiSquareTest(long[], int, double) : ", "Invalid expected value. ",
                        "Make sure to have a set of valid expected values "
                                + "when calling the chiSquareTest() method. ");
                return false;
            } else {
                sumValuesExpected += val;
            }
        }

        // summarize observed values
        int cell;
        if (values.length == this.getCells()) {
            cell = 1; // without under- and overflow
        } else {
            cell = 0; // with under- and overflow
        }
        for (int i = cell; i < values.length + cell; i++) {
            sumValuesObserved += this.getObservationsInCell(i);
        }

        // expected frequency
        for (int i = 0; i < values.length; i++) {
            expectedEntries[i] = (double) (values[i]) / (double) (sumValuesExpected) * sumValuesObserved;
        }

        // calculation of chi-squared
        double testStat = 0;
        for (int i = 0; i < values.length; i++) {
            testStat += Math.pow((this.getObservationsInCell(cell)) - expectedEntries[i], 2)
                    / expectedEntries[i];
            cell++;
        }

        // chi-squared quantile for degrees of freedom and probability level
        ChiSquaredDistributionImpl chiSquared = new ChiSquaredDistributionImpl(degFreedom);

        // comparison
        boolean result = false;
        try {
            result = !(testStat > chiSquared.inverseCumulativeProbability(confidence));
        } catch (MathException e) {
            e.printStackTrace();
        }
        // a trace note does not make much sense as the test is performed after the simulation has finished.
        // sendTraceNote("result of chi-squared test for " + this.getQuotedName() + "is being returned. ");
        return result;
    }
}
From source file:org.apache.mahout.freqtermsets.fpgrowth.FPGrowth.java
/**
 * Internal TopKFrequentPattern generation algorithm, which represents the
 * A's as integers and transforms features to use only integers.
 *
 * @param transactions
 *          Transaction database Iterator
 * @param attributeFrequency
 *          array representing the Frequency of the corresponding attribute id
 * @param minSupport
 *          minimum support of the pattern to be mined
 * @param k
 *          Max value of the Size of the Max-Heap in which Patterns are held
 * @param featureSetSize
 *          number of features
 * @param returnFeatures
 *          the id's of the features for which Top K patterns have to be mined
 * @param topKPatternsOutputCollector
 *          the outputCollector which transforms the given Pattern in integer
 *          format to the corresponding A Format
 */
private void generateTopKFrequentPatterns(
        // Iterator<Pair<int[], Long>> transactions,
        TransactionTree cTree, OpenObjectIntHashMap<A> attributeIdMapping, long[] attributeFrequency,
        long minSupport, int k, int featureSetSize, Collection<Integer> returnFeatures,
        TopKPatternsOutputConverter<A> topKPatternsOutputCollector, StatusUpdater updater)
        throws IOException {
    // YA: BONSAAAAAAAII {
    // FPTree tree = new FPTree(featureSetSize);
    FPTree tree = null;
    boolean change = true;
    int pruneIters = 0;
    IntArrayList pruneByContingencyCount = new IntArrayList();
    IntArrayList pruneBySpreadCount = new IntArrayList();
    while (change) {
        pruneByContingencyCount.add(0);
        pruneBySpreadCount.add(0);
        change = false;
        tree = new FPTree(featureSetSize);
        OpenIntLongHashMap[] childJointFreq;
        long[] sumChildSupport;
        if (BONSAI_PRUNE) {
            childJointFreq = new OpenIntLongHashMap[featureSetSize];
            sumChildSupport = new long[featureSetSize];
        }
        double supportGrandTotal = 0;
        // } YA: BONSAAAAAAAII
        for (int i = 0; i < featureSetSize; i++) {
            tree.addHeaderCount(i, attributeFrequency[i]);
            // YA: BONSAAAAAAAII {
            if (attributeFrequency[i] < 0) {
                continue; // this is an attribute not satisfying the monotone constraint
            }
            if (BONSAI_PRUNE) {
                childJointFreq[i] = new OpenIntLongHashMap();
                supportGrandTotal += attributeFrequency[i];
            }
            // } YA: Bonsai
        }

        // Constructing initial FPTree from the list of transactions
        // YA Bonsai: To pass the tree itself, the iterator now would work only with
        // ints.. the A type argument is not checked in the constructor.
        // TODO: remove the type argument and force using ints only
        Iterator<Pair<int[], Long>> transactions = new IntTransactionIterator(cTree.iterator(),
                attributeIdMapping);

        int nodecount = 0;
        // int attribcount = 0;
        int i = 0;
        while (transactions.hasNext()) {
            Pair<int[], Long> transaction = transactions.next();
            Arrays.sort(transaction.getFirst());
            // attribcount += transaction.length;
            // YA: Bonsai {
            // nodecount += treeAddCount(tree, transaction.getFirst(),
            // transaction.getSecond(), minSupport, attributeFrequency);
            int temp = FPTree.ROOTNODEID;
            boolean addCountMode = true;
            for (int attribute : transaction.getFirst()) {
                if (attributeFrequency[attribute] < 0) {
                    continue; // this is an attribute not satisfying the monotone constraint
                }
                if (attributeFrequency[attribute] < minSupport) {
                    break;
                }
                if (BONSAI_PRUNE && tree.attribute(temp) != -1) { // Root node
                    childJointFreq[tree.attribute(temp)].put(attribute,
                            childJointFreq[tree.attribute(temp)].get(attribute) + transaction.getSecond());
                    sumChildSupport[tree.attribute(temp)] += transaction.getSecond();
                }
                int child;
                if (addCountMode) {
                    child = tree.childWithAttribute(temp, attribute);
                    if (child == -1) {
                        addCountMode = false;
                    } else {
                        tree.addCount(child, transaction.getSecond());
                        temp = child;
                    }
                }
                if (!addCountMode) {
                    child = tree.createNode(temp, attribute, transaction.getSecond());
                    temp = child;
                    nodecount++;
                }
            }
            // } YA Bonsai
            i++;
            if (i % 10000 == 0) {
                log.info("FPTree Building: Read {} Transactions", i);
            }
        }

        log.info("Number of Nodes in the FP Tree: {}", nodecount);

        // YA: BONSAAAAAAAII {
        if (BONSAI_PRUNE) {
            if (log.isTraceEnabled())
                log.info("Bonsai pruning tree: {}", tree.toString());

            for (int a = 0; a < tree.getHeaderTableCount(); ++a) {
                int attr = tree.getAttributeAtIndex(a);
                if (attributeFrequency[attr] < 0) {
                    continue; // this is an attribute not satisfying the monotone constraint
                }
                if (attributeFrequency[attr] < minSupport) {
                    break;
                }
                // if (sumChildSupport[attr] < attributeFrequency[attr]) {
                // // the case of . (full stop) as the next child
                // childJointFreq[attr].put(-1,
                //         (long) (attributeFrequency[attr] - sumChildSupport[attr]));
                // }
                float numChildren = childJointFreq[attr].size();
                // if (numChildren < LEAST_NUM_CHILDREN_TO_VOTE_FOR_NOISE) {
                // continue;
                // }
                if (log.isTraceEnabled()) {
                    log.trace("Voting for noisiness of attribute {} with number of children: {}", attr,
                            numChildren);
                    log.trace("Attribute support: {} - Total Children support: {}",
                            attributeFrequency[attr], sumChildSupport[attr]);
                }

                // EMD and the such.. the threshold isn't easy to define, and it
                // also doesn't take into account the weights of children.
                // // double uniformProb = 1.0 / numChildren;
                // // double uniformProb = sumChildSupport[attr] / supportGrandTotal;
                // double uniformFreq = attributeFrequency[attr] / numChildren;
                // IntArrayList childAttrArr = childJointFreq[attr].keys();
                // // IntArrayList childAttrArr = new IntArrayList();
                // // childJointFreq[attr].keysSortedByValue(childAttrArr);
                // double totalDifference = 0;
                // double sumOfWeights = 0;
                // // double emd = 0;
                // for (int c = childAttrArr.size() - 1; c >= 0; --c) {
                //     int childAttr = childAttrArr.get(c);
                //     double childJF = childJointFreq[attr].get(childAttr);
                //     double childWeight = attributeFrequency[childAttr];
                //     totalDifference += childWeight * Math.abs(childJF - uniformFreq);
                //     sumOfWeights += childWeight;
                //     // double jointProb = childJF / supportGrandTotal;
                //     // double childProb = attributeFrequency[childAttr] / supportGrandTotal;
                //     // double childConditional = childJF / attributeFrequency[attr];
                //     // emd = childConditional + emd - uniformProb;
                //     // emd = childJF + emd - uniformFreq;
                //     // totalDifference += Math.abs(emd);
                // }
                // // Probability (D > observed) = QKS Ne + 0.12 + 0.11 / Ne D
                // // double pNotUniform = totalDifference / attrSupport;
                // // double threshold = (numChildren * (numChildren - 1) * 1.0)
                // //         / (2.0 * attributeFrequency[attr]);
                // double weightedDiff = totalDifference / sumOfWeights;
                // double threshold = sumOfWeights / 2.0; // each child can be up to
                //                                        // 1 over or below the uniform freq
                // boolean noise = weightedDiff < threshold;
                // log.info("EMD: {} - Threshold: {}", weightedDiff, threshold);
                // ///////////////////////////////////
                // Log odds.. this is my hartala, and it needs to be shifted
                // according to the number of children:
                // // // if there is one child then the prob of random choice will be
                // // // 1, so anything would be noise
                // // // and if there are few then the probability that this is
                // // // actually noise declines
                // // if (numChildren >= LEAST_NUM_CHILDREN_TO_VOTE_FOR_NOISE) {
                // //     log.info("Voting for noisiness of attribute {} with number of children: {}",
                // //             currentAttribute, numChildren);
                // //     log.info("Attribute support: {} - Total Children support: {}",
                // //             attrSupport, sumOfChildSupport);
                // //     int noiseVotes = 0;
                // //     double randomSelectionLogOdds = 1.0 / numChildren;
                // //     randomSelectionLogOdds = Math.log(randomSelectionLogOdds
                // //             / (1 - randomSelectionLogOdds));
                // //     randomSelectionLogOdds = Math.abs(randomSelectionLogOdds);
                // //
                // //     IntArrayList childAttrArr = childJointFreq.keys();
                // //     for (int c = 0; c < childAttrArr.size(); ++c) {
                // //         double childConditional = 1.0
                // //                 * childJointFreq.get(childAttrArr.get(c))
                // //                 / sumOfChildSupport; // attrSupport;
                // //         double childLogOdds = Math.log(childConditional
                // //                 / (1 - childConditional));
                // //         if (Math.abs(childLogOdds) <= randomSelectionLogOdds) {
                // //             // probability of the child given me is different than
                // //             // probability of choosing the child randomly
                // //             // from among my children.. using absolute log odds
                // //             // because they are symmetric
                // //             ++noiseVotes;
                // //         }
                // //     }
                // //     log.info("Noisy if below: {} - Noise votes: {}",
                // //             randomSelectionLogOdds, noiseVotes);
                // //     noise = noiseVotes == numChildren;
                // ////////////////////////////////////////////////////
                // // Kullback-Leibler divergence from the uniform distribution
                // double randomChild = 1.0 / numChildren;
                // IntArrayList childAttrArr = childJointFreq[attr].keys();
                //
                // double klDivergence = 0;
                // for (int c = 0; c < childAttrArr.size(); ++c) {
                //     double childConditional = 1.0
                //             * childJointFreq[attr].get(childAttrArr.get(c))
                //             / attributeFrequency[attr];
                //     if (childConditional == 0) {
                //         continue; // a7a!
                //     }
                //     klDivergence += childConditional
                //             * Math.log(childConditional / randomChild);
                // }
                //
                // boolean noise = Math.abs(klDivergence) < 0.05;
                // log.info("KL-Divergence: {} - Noise less than: {}", klDivergence, 0.05);
                // //////////////////////////////////////

                // Pairwise metric with different children
                SummaryStatistics metricSummary = new SummaryStatistics();
                // double[] metric = new double[(int) numChildren];
                // SummaryStatistics spreadSummary = new SummaryStatistics();
                // double uniformSpread = attributeFrequency[attr] / numChildren;
                double goodnessOfFit = 0.0;
                // If I don't take the . into account: sumChildSupport[attr] / numChildren;
                double sumOfWeights = 0;
                IntArrayList childAttrArr = childJointFreq[attr].keys();
                for (int c = 0; c < childAttrArr.size(); ++c) {
                    int childAttr = childAttrArr.get(c);
                    double[][] contingencyTable = new double[2][2];
                    if (childAttr == -1) {
                        // this is meaningless, as yuleq will just be 1
                        contingencyTable[1][1] = childJointFreq[attr].get(childAttr);
                        contingencyTable[1][0] = sumChildSupport[attr];
                        // equals attributeFrequency[attr] - contingencyTable[1][1];
                        contingencyTable[0][1] = 0;
                        contingencyTable[0][0] = supportGrandTotal - attributeFrequency[attr];
                    } else {
                        contingencyTable[1][1] = childJointFreq[attr].get(childAttr);
                        contingencyTable[1][0] = attributeFrequency[attr] - contingencyTable[1][1];
                        contingencyTable[0][1] = attributeFrequency[childAttr] - contingencyTable[1][1];
                        contingencyTable[0][0] = supportGrandTotal - attributeFrequency[attr]
                                - attributeFrequency[childAttr] + contingencyTable[1][1];
                        // because of the meaninglessness of yuleq in case of .
                    }
                    double ad = contingencyTable[0][0] * contingencyTable[1][1];
                    double bc = contingencyTable[0][1] * contingencyTable[1][0];
                    double yuleq = (ad - bc) / (ad + bc);
                    double weight = attributeFrequency[childAttr];
                    sumOfWeights += weight;
                    metricSummary.addValue(Math.abs(yuleq * weight));
                    // metricSummary.addValue(yuleq * yuleq * weight);
                    // spreadSummary.addValue(Math.abs(uniformSpread
                    //         - contingencyTable[1][1]) / numChildren);
                    // spreadSummary.addValue(contingencyTable[1][1]); // * weight
                    goodnessOfFit += contingencyTable[1][1] * contingencyTable[1][1];
                }
                // double weightedquadraticMean = Math.sqrt(metricSummary.getSum() / sumOfWeights);
                double weightedMean = (metricSummary.getSum() / sumOfWeights);

                boolean noise = false;
                // if (weightedMean < 0.5) {
                //     pruneByContingencyCount.set(pruneIters,
                //             pruneByContingencyCount.get(pruneIters) + 1);
                //     noise = true;
                // } else if (weightedMean < 0.95) {
                if (numChildren > 1) {
                    double n = sumChildSupport[attr]; // attributeFrequency[attr];
                    goodnessOfFit /= (n / numChildren);
                    goodnessOfFit -= n;
                    ChiSquaredDistributionImpl chisqDist = new ChiSquaredDistributionImpl(numChildren - 1);
                    double criticalPoint = -1;
                    try {
                        criticalPoint = chisqDist.inverseCumulativeProbability(1.0 - SIGNIFICANCE / 2.0);
                    } catch (MathException e) {
                        log.error(e.getMessage(), e);
                    }
                    if (goodnessOfFit < criticalPoint) {
                        pruneBySpreadCount.set(pruneIters, pruneBySpreadCount.get(pruneIters) + 1);
                        noise = true;
                    }
                    // // double spreadCentraltendency = (spreadSummary.getMax() -
                    // //         spreadSummary.getMin()) / 2.0;
                    // // spreadSummary.getMean();
                    // // double uniformSpread = sumChildSupport[attr] / numChildren;
                    // // noise = Math.abs(spreadCentraltendency - uniformSpread) < 1e-4;
                    // double spreadCentraltendency = spreadSummary.getMean();
                    // // (spreadSummary.getMax() - spreadSummary.getMin()) / 2.0;
                    // if (spreadCentraltendency < 1e-6) {
                    //     noise = true;
                    // }
                    //
                    // if (!noise && numChildren > 0) {
                    //     // see if the difference is statistically significant
                    //     double spreadCI = getConfidenceIntervalHalfWidth(spreadSummary, SIGNIFICANCE);
                    //     spreadCentraltendency -= spreadCI;
                    //     if (spreadCentraltendency < 0) {
                    //         noise = true;
                    //     }
                    //     // noise if the CI contains the uniform spread threshold
                    //     // if (spreadCentraltendency > uniformSpread) {
                    //     //     noise = (spreadCentraltendency - spreadCI) < uniformSpread;
                    //     // } else {
                    //     //     noise = (spreadCentraltendency + spreadCI) > uniformSpread;
                    //     // }
                    // }
                }
                change |= noise;
                if (noise) {
                    if (log.isTraceEnabled())
                        log.info("Pruning attribute {} with child joint freq {}", attr, childJointFreq[attr]);
                    returnFeatures.remove(attr);
                    attributeFrequency[attr] = -1;
                }
            }
        }
        ++pruneIters;
    }
    if (log.isTraceEnabled()) {
        log.info("Pruned tree: {}", tree.toString());
        log.info("Prune by contingency: {} - Prune by spread: {}", pruneByContingencyCount.toString(),
                pruneBySpreadCount.toString());
    }
    // } YA: Bonsai
    fpGrowth(tree, minSupport, k, returnFeatures, topKPatternsOutputCollector, updater);
}
From source file:org.caleydo.view.tourguide.impl.Statistics.java
public static double chiSquaredProbability(double x, int df) {
    // return weka.core.Statistics.chiSquaredProbability(x, df);
    ChiSquaredDistribution d;
    if (df == 1) {
        d = chiSquare1 != null ? chiSquare1.get() : null;
        if (d == null) {
            d = new ChiSquaredDistributionImpl(1);
            chiSquare1 = new SoftReference<>(d);
        }
    } else {
        d = new ChiSquaredDistributionImpl(df);
    }
    try {
        return 1.0 - d.cumulativeProbability(x);
    } catch (MathException e) {
        log.error("can't compute chiSquaredProbability of " + x + " with df: " + df, e);
    }
    return Float.NaN;
}