Example usage for org.apache.commons.math.distribution ChiSquaredDistributionImpl inverseCumulativeProbability

Introduction

On this page you can find example usage for org.apache.commons.math.distribution ChiSquaredDistributionImpl inverseCumulativeProbability.

Prototype

@Override
public double inverseCumulativeProbability(final double p) throws MathException 

Document

For this distribution, X, this method returns the critical point x, such that P(X < x) = p.
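
A minimal, self-contained sketch of the call itself, before the full operator examples below; the degrees of freedom (5) and the probability (0.975) are arbitrary illustration values, not taken from any of the examples.

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class ChiSquaredQuantileSketch {
    public static void main(String[] args) {
        // Chi-squared distribution with 5 degrees of freedom.
        ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(5);
        try {
            // Critical point x such that P(X < x) = 0.975.
            double x = chi.inverseCumulativeProbability(0.975);
            System.out.println("0.975 quantile with 5 degrees of freedom: " + x);
        } catch (MathException e) {
            // Declared by inverseCumulativeProbability, e.g. when the numerical inversion fails.
            e.printStackTrace();
        }
    }
}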

Usage

From source file:de.dfki.madm.anomalydetection.operator.statistical_based.RobustPCAOperator.java

@Override
public void doWork() throws OperatorException {
    // check whether all attributes are numerical
    ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);

    Tools.onlyNonMissingValues(exampleSet, "PCA");
    Tools.onlyNumericalAttributes(exampleSet, "PCA");

    // Get normal probability.
    double normProb = getParameterAsDouble(PARAMETER_OUTLIER_PROBABILITY);
    int olInst = exampleSet.size() - (int) Math.floor(exampleSet.size() * normProb);
    log("Ignoring " + olInst + " anomalyous instances for robustness.");

    // The robust estimate is based on removing top outliers first based on Mahalanobis distance (MD).
    // Since MD is the same as the outlier score when using all PCs, the PCA is done twice:
    // First with all examples, second with top-outliers removed (robust)

    // First PCA for outlier removal
    // create covariance matrix
    Matrix covarianceMatrix = CovarianceMatrix.getCovarianceMatrix(exampleSet);

    // EigenVector and EigenValues of the covariance matrix
    EigenvalueDecomposition eigenvalueDecomposition = covarianceMatrix.eig();

    // create and deliver results
    double[] eigenvalues = eigenvalueDecomposition.getRealEigenvalues();
    Matrix eigenvectorMatrix = eigenvalueDecomposition.getV();
    double[][] eigenvectors = eigenvectorMatrix.getArray();

    PCAModel model = new PCAModel(exampleSet, eigenvalues, eigenvectors);

    // Perform transformation
    ExampleSet res = model.apply((ExampleSet) exampleSet.clone());

    // Compute simple list with MDs and sort according to MD.
    List<double[]> l = new LinkedList<double[]>();
    double eIdx = 0;
    for (Example example : res) {
        double md = 0.0;
        int aNr = 0;
        for (Attribute attr : example.getAttributes()) {
            double pcscore = example.getValue(attr);
            md += (pcscore * pcscore) / model.getEigenvalue(aNr);
            aNr++;
        }
        double[] x = { md, eIdx };
        l.add(x);
        eIdx++;
    }
    Collections.sort(l, new Comparator<double[]>() {
        public int compare(double[] first, double[] second) {
            return Double.compare(second[0], first[0]);
        }
    });
    // From the sorted list, create an array with outlier indexes and an array (mapping) with the remaining good instances.
    Iterator<double[]> iter = l.iterator();
    int[] olMapping = new int[olInst];
    for (int i = 0; i < olInst; i++) {
        olMapping[i] = (int) ((double[]) iter.next())[1];
    }
    Arrays.sort(olMapping);
    int[] mapping = new int[exampleSet.size() - olInst];
    int olc = 0;
    int ctr = 0;
    for (int i = 0; i < exampleSet.size(); i++) {
        if (olc == olInst) { // Add last elements after last outlier
            mapping[ctr++] = i;
            continue;
        }
        if (olMapping[olc] != i) {
            mapping[ctr++] = i;
        } else {
            olc++;
        }
    }
    ExampleSet robustExampleSet = new MappedExampleSet(exampleSet, mapping); // creates a new example set without the top outliers.

    // ---
    // Second PCA (robust)
    covarianceMatrix = CovarianceMatrix.getCovarianceMatrix(robustExampleSet);
    eigenvalueDecomposition = covarianceMatrix.eig();

    // create and deliver results
    eigenvalues = eigenvalueDecomposition.getRealEigenvalues();
    eigenvectorMatrix = eigenvalueDecomposition.getV();
    eigenvectors = eigenvectorMatrix.getArray();

    // Apply on original set
    model = new PCAModel(exampleSet, eigenvalues, eigenvectors);

    // Perform transformation
    res = model.apply((ExampleSet) exampleSet.clone());

    // Sort eigenvalues
    Arrays.sort(eigenvalues);
    ArrayUtils.reverse(eigenvalues);

    // if necessary, reduce the number of dimensions ...
    int reductionType = getParameterAsInt(PARAMETER_REDUCTION_TYPE);
    List<Integer> pcList = new ArrayList<Integer>();
    if (reductionType == PCS_ALL) {
        for (int i = 0; i < exampleSet.getAttributes().size(); i++) {
            pcList.add(i);
        }
    }
    if (reductionType == PCS_TOP || reductionType == PCS_BOTH) {
        //top
        switch (getParameterAsInt(PARAMETER_TOP_METHODS)) {
        case PCS_TOP_FIX:
            for (int i = 0; i < getParameterAsInt(PARAMETER_NUMBER_OF_COMPONENTS_TOP); i++) {
                pcList.add(i);
            }
            break;
        case PCS_TOP_VAR:
            double var = getParameterAsDouble(PARAMETER_VARIANCE_THRESHOLD);
            boolean last = false;
            for (int i = 0; i < exampleSet.getAttributes().size(); i++) {
                if (model.getCumulativeVariance(i) < var) {
                    pcList.add(i);
                } else if (!last) { // we need to add another PC to meet the minimum requirement.
                    last = true;
                    pcList.add(i);
                }
            }
            break;
        }
    }
    if (reductionType == PCS_LOWER || reductionType == PCS_BOTH) {
        //lower
        switch (getParameterAsInt(PARAMETER_LOW_METHODS)) {
        case PCS_LOW_FIX:
            for (int i = exampleSet.getAttributes().size()
                    - getParameterAsInt(PARAMETER_NUMBER_OF_COMPONENTS_LOW); i < exampleSet.getAttributes()
                            .size(); i++) {
                pcList.add(i);
            }
            break;
        case PCS_LOW_VAL:
            double val = getParameterAsDouble(PARAMETER_VALUE_THRESHOLD);
            for (int i = 0; i < eigenvalues.length; i++) {
                if (eigenvalues[i] <= val) {
                    if (pcList.size() == 0) {
                        pcList.add(i);
                    } else if (pcList.get(pcList.size() - 1).intValue() < i) {
                        pcList.add(i);
                    }
                }
            }
            break;
        }
    }
    int[] opcs = ArrayUtils.toPrimitive(pcList.toArray(new Integer[pcList.size()]));

    if (opcs.length == 0) {
        throw new UserError(this,
                "Parameter thresholds are selected such that they do not match any principal component. Lower the variance threshold or increase the eigenvalue threshold.");
    }
    if (opcs.length == exampleSet.getAttributes().size()) {
        log("Using all PCs for score.");
    } else {
        log("Using following PCs for score: " + Arrays.toString(opcs));
    }

    // Normalize by Chi-Dist with d degrees of freedom
    double scoreNormalizer = 1.0;
    ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(opcs.length);
    try {
        scoreNormalizer = chi.inverseCumulativeProbability(normProb);
    } catch (MathException e) {
        System.err.println(e);
    }
    log("Normalizing score with chi cumulative propability: " + scoreNormalizer);

    // compute scores
    Attribute scoreAttr = AttributeFactory.createAttribute("outlier", Ontology.REAL);
    exampleSet.getExampleTable().addAttribute(scoreAttr);
    exampleSet.getAttributes().setOutlier(scoreAttr);
    for (int exNr = 0; exNr < exampleSet.size(); exNr++) {
        Example orig = exampleSet.getExample(exNr);
        Example pc = res.getExample(exNr);
        double oscore = 0.0;
        int aNr = 0;
        ctr = 0;
        for (Attribute attr : pc.getAttributes()) {
            if (ctr < opcs.length && opcs[ctr] != aNr) { // we skip this dimension
                aNr++;
                continue;
            }
            double pcscore = pc.getValue(attr);
            oscore += (pcscore * pcscore) / model.getEigenvalue(aNr);
            aNr++;
            ctr++;
        }
        orig.setValue(scoreAttr, oscore / scoreNormalizer);
    }
    exampleSetOutput.deliver(exampleSet);
}
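
The chi-squared quantile in this operator acts as a score normalizer: under a multivariate-normality assumption (not stated by the operator itself), the sum of squared, eigenvalue-scaled PC scores is roughly chi-squared distributed with one degree of freedom per retained component, so dividing by the quantile at the normal-instance probability rescales scores so that values above 1.0 point at likely outliers. A condensed, hedged sketch of that idiom; all names and numbers are illustrative, not part of the operator.

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class ScoreNormalizerSketch {
    public static void main(String[] args) {
        double normProb = 0.975;     // assumed fraction of normal instances
        int retainedComponents = 3;  // illustrative number of principal components used
        double rawScore = 9.2;       // illustrative sum of squared, eigenvalue-scaled PC scores

        double scoreNormalizer = 1.0;
        ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(retainedComponents);
        try {
            scoreNormalizer = chi.inverseCumulativeProbability(normProb);
        } catch (MathException e) {
            e.printStackTrace();
        }
        // Scores above 1.0 exceed the chi-squared quantile for the normal probability.
        System.out.println("outlier score: " + rawScore / scoreNormalizer);
    }
}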

From source file:desmoj.core.statistic.Histogram.java

/**
 * Performs Pearson's Chi-square test for the given frequencies, degrees of freedom and desired probability.
 * The frequencies are given in an array that may or may not include the under- and overflow cells.
 * On error the Chi-squared test is not performed and false is returned;
 * details on errors are written to the error message log.
 * The result is true if the null hypothesis cannot be rejected.
 *
 * @param values
 *          long[]: Array of assumed frequencies for each cell.
 * @param degFreedom
 *          int: Degrees of freedom of the test.
 * @param confidence
 *          double: (1-alpha) probability level.
 *
 * @return boolean : <code>true</code> if the null hypothesis cannot be rejected.
 *                   <code>false</code> on error or if the null hypothesis has to be rejected.
 */
public boolean chiSquareTest(long[] values, int degFreedom, double confidence) {

    // check if number of cells is valid
    if (!((values.length == this.getCells() + 2) || (values.length == this.getCells())
            || (values.length > 1))) {
        sendWarning("Attempt to perform a Chi-squared test on an invalid number of cells!  ",
                "chiSquareTest(long[], int, double) : ", "Too few or too many cells. ",
                "Make sure to have a valid number of cells " + "when calling the chiSquareTest() method.  ");
        return false;
    }

    // check if we have a reasonable amount of observations
    else if (this.getObservations() < 3) {
        sendWarning(
                "Attempt to perform a Chi-squared test on an insufficient data amount,  "
                        + "there are less than three observations!  ",
                "chiSquareTest(long[], int, double) : ", "Too few observations. ",
                "Make sure to have a sufficient amount of observations "
                        + "when calling the chiSquareTest() method.  ");
        return false;
    }

    // check if we have a reasonable probability
    else if (confidence < 0 || confidence > 1) {
        sendWarning("Attempt to perform a Chi-squared test with an illegal desired probability!  ",
                "chiSquareTest(long[], int, double) : ", "Illegal desired probability. ",
                "Make sure to have a valid desired probability "
                        + "when calling the chiSquareTest() method.  ");
        return false;
    }

    // check if degree of freedom is valid
    else if (degFreedom <= 0 || degFreedom >= values.length) {
        sendWarning("Attempt to perform a Chi-squared test with an illegal degree of freedom!  ",
                "chiSquareTest(long[], int, double) : ", "Illegal degree of freedom. ",
                "Make sure to have a valid degree of freedom " + "when calling the chiSquareTest() method.  ");
        return false;
    }

    else {
        long sumValuesObserved = 0;
        long sumValuesExpected = 0;

        // summarize expected values
        double[] expectedEntries = new double[values.length];
        for (double val : values) {
            if (val == 0) { // exit on 0 to avoid zero division
                sendWarning("Attempt to perform a Chi-squared test with an expected value of 0!  ",
                        "chiSquareTest(long[], int, double) : ", "Invalid expected value. ",
                        "Make sure to have a set of valid expected values "
                                + "when calling the chiSquareTest() method.  ");
                return false;
            } else {
                sumValuesExpected += val;
            }
        }
        // summarize observed values
        int cell;
        if (values.length == this.getCells()) { // without under- and overflow
            cell = 1;
        } else {
            cell = 0; // with under- and overflow
        }
        for (int i = cell; i < values.length + cell; i++) {
            sumValuesObserved += this.getObservationsInCell(i);
        }
        // expected frequency
        for (int i = 0; i < values.length; i++) {
            expectedEntries[i] = (double) (values[i]) / (double) (sumValuesExpected) * sumValuesObserved;
        }
        // calculation of chiSquared
        double testStat = 0;
        for (int i = 0; i < values.length; i++) {
            testStat += Math.pow((this.getObservationsInCell(cell)) - expectedEntries[i], 2)
                    / expectedEntries[i];
            cell++;
        }
        // chiSquared for degrees of freedom and  probability level
        ChiSquaredDistributionImpl chiSquared = new ChiSquaredDistributionImpl(degFreedom);

        // comparison
        boolean result = false;
        try {
            result = !(testStat > chiSquared.inverseCumulativeProbability(confidence));
        } catch (MathException e) {
            e.printStackTrace();
        }
        // a trace note does not make much sense as the test is performed after the simulation has finished.
        // sendTraceNote("result of chi-squared test for " + this.getQuotedName() +  "is being returned. ");
        return result;
    }
}
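
Isolated from the method above, the decision rule is a plain comparison of the test statistic against the chi-squared quantile at the chosen confidence level. A hedged sketch; the statistic, degrees of freedom and confidence below are made-up illustration values.

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class ChiSquareDecisionSketch {
    public static void main(String[] args) {
        double testStat = 11.07;   // illustrative Pearson chi-squared statistic
        int degFreedom = 7;        // illustrative degrees of freedom
        double confidence = 0.95;  // (1 - alpha) probability level

        ChiSquaredDistributionImpl chiSquared = new ChiSquaredDistributionImpl(degFreedom);
        boolean nullNotRejected = false;
        try {
            // Retain the null hypothesis while the statistic stays at or below the critical point.
            nullNotRejected = testStat <= chiSquared.inverseCumulativeProbability(confidence);
        } catch (MathException e) {
            e.printStackTrace();
        }
        System.out.println("null hypothesis retained: " + nullNotRejected);
    }
}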

From source file:de.dfki.madm.anomalydetection.evaluator.cluster_based.CMGOSEvaluator.java

/**
 * Main Algorithm
 * @throws OperatorException 
 */
public double[] evaluate() throws OperatorException {
    // remove small clusters
    boolean[] removed_cluster = new boolean[this.centroids.length];
    double limit = percentage * points.length / centroids.length;
    removed_cluster = this.reassignPoints(removed_cluster, limit);

    int TotalNumberOfPoints = points.length;
    int NumberOfCluster = this.centroids.length;
    int PointDimension = this.points[0].length;

    // remove clusters with less points than dimensions
    removed_cluster = this.reassignPoints(removed_cluster, PointDimension);
    int[][] remove = new int[NumberOfCluster][PointDimension];

    // assign distance limit -1 for error
    double DistanceLimit = -1;
    ChiSquaredDistributionImpl chi = new ChiSquaredDistributionImpl(points[0].length);
    try {
        DistanceLimit = chi.inverseCumulativeProbability(this.probability);
    } catch (MathException e) {
        System.out.println(e);
    }

    /* compute anomaly score */
    double[] result = new double[TotalNumberOfPoints];

    int[] workBelongsToCluster = this.belongsToCluster.clone();
    int[] workClusterSize = this.clusterSize.clone();

    double[] DistanceLimitPerCluster = new double[NumberOfCluster];
    Arrays.fill(DistanceLimitPerCluster, DistanceLimit);

    this.CovariancematrixPerCluster = new CovarianceMatrix[NumberOfCluster];

    // in case of fastMCD, make sure we don't remove any outliers and recompute
    // sanity check from user interface
    if (this.red == METHOD_COV_MCD)
        this.removeRuns = 0;

    for (int rem = 0; rem <= this.removeRuns; rem++) {

        // Associate instances to a cluster
        double[][][] ClusterWithPointsAssociation = new double[NumberOfCluster][][];
        int[] nextId = new int[NumberOfCluster];
        for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
            ClusterWithPointsAssociation[ClusterId] = new double[workClusterSize[ClusterId]][PointDimension];
        }

        for (int PointId = 0; PointId < TotalNumberOfPoints; PointId++) {
            int ClusterId = workBelongsToCluster[PointId];
            if (ClusterId < NumberOfCluster) {
                ClusterWithPointsAssociation[ClusterId][nextId[ClusterId]] = this.points[PointId];
                nextId[ClusterId]++;
            }
        }

        // Subtract mean from all
        if (rem == 0) {
            for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
                double[] erw = new double[PointDimension];

                for (int PointId = 0; PointId < ClusterWithPointsAssociation[ClusterId].length; PointId++) {
                    for (int PointAttribute = 0; PointAttribute < ClusterWithPointsAssociation[ClusterId][PointId].length; PointAttribute++) {
                        erw[PointAttribute] += ClusterWithPointsAssociation[ClusterId][PointId][PointAttribute];
                    }
                }
                for (int j1 = 0; j1 < erw.length; j1++) {
                    erw[j1] = 1.0 / ClusterWithPointsAssociation[ClusterId].length * erw[j1];
                }

                for (int PointId = 0; PointId < ClusterWithPointsAssociation[ClusterId].length; PointId++) {
                    for (int j1 = 0; j1 < ClusterWithPointsAssociation[ClusterId][PointId].length; j1++) {
                        ClusterWithPointsAssociation[ClusterId][PointId][j1] -= erw[j1];
                    }
                }

            }
        }

        // Calculate covariance for each cluster
        for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
            if (workClusterSize[ClusterId] > 0) {
                double[][] data = null;
                // use all data instances
                if (this.red == METHOD_COV_MCD || this.cov_sampling == -1
                        || this.cov_sampling > ClusterWithPointsAssociation[ClusterId].length) {
                    data = ClusterWithPointsAssociation[ClusterId];
                }
                // sample data
                else {
                    data = new double[this.cov_sampling][ClusterWithPointsAssociation[ClusterId][0].length];
                    int i = 0;
                    for (Integer index : generator.nextIntSetWithRange(0,
                            ClusterWithPointsAssociation[ClusterId].length, this.cov_sampling)) {
                        data[i] = ClusterWithPointsAssociation[ClusterId][index];
                        i++;
                    }
                }
                // in the case of MCD, do it
                if (this.red == METHOD_COV_MCD) {
                    // we compute h from the normal probability
                    if (this.h == -1)
                        this.h = (int) Math.ceil(this.probability * (float) data.length);
                    CovariancematrixPerCluster[ClusterId] = fastMDC(data, this.h);
                }
                // Regularization and Reduction
                else {
                    if (CovariancematrixPerCluster[ClusterId] == null || rem < this.removeRuns) {
                        boolean change = false;
                        int count = 0;
                        // Reduction Method
                        if (this.red == METHOD_COV_REDUCTION) {
                            do {
                                change = false;
                                int ind = -1;
                                // look for attribute with only one value
                                for (int i = 0; i < data[0].length; i++) {
                                    change = true;
                                    ind = i;
                                    for (int j = 0; j < data.length; j++) {
                                        if (data[j][ind] != data[0][ind]) {
                                            change = false;
                                            ind = -1;
                                            break;
                                        }
                                    }
                                    if (change)
                                        break;
                                }
                                if (change) {
                                    // store which attribute to remove in which cluster
                                    remove[ClusterId][ind + count] = 1;
                                    count++;
                                    double[][] dataNew = new double[data.length][data[0].length - 1];
                                    for (int i = 0; i < data.length; i++) {
                                        System.arraycopy(data[i], 0, dataNew[i], 0, ind);
                                        System.arraycopy(data[i], ind + 1, dataNew[i], ind,
                                                data[0].length - (ind + 1));
                                    }
                                    data = dataNew;
                                }
                            } while (change);

                            // calculate a new distance limit using the new number of dimensions
                            chi = new ChiSquaredDistributionImpl(data[0].length);
                            try {
                                DistanceLimitPerCluster[ClusterId] = chi
                                        .inverseCumulativeProbability(this.probability);
                            } catch (MathException e) {
                                System.out.println(e);
                            }

                        }
                        CovariancematrixPerCluster[ClusterId] = new CovarianceMatrix(data, numberOfThreads);
                    }
                }
            }
        }

        // REGULARIZATION
        // S is the sum of the per-cluster covariance matrices (QDA)
        double[][] S = null;
        boolean thereisone = false;
        if (this.red == METHOD_COV_REGULARIZE) {
            int id = 0;
            for (boolean b : removed_cluster) {
                if (!b) {
                    thereisone = true;
                    break;
                }
                id++;
            }
            if (!thereisone) {
                throw new OperatorException(
                        "No cluster left. This is a problem. Try not to remove small clusters or reduce the number of clusters.");
            }
            S = new double[CovariancematrixPerCluster[id]
                    .getCovMat().length][CovariancematrixPerCluster[id].getCovMat()[0].length];
            for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
                if (!removed_cluster[ClusterId] && CovariancematrixPerCluster[ClusterId] != null) {
                    double[][] d = CovariancematrixPerCluster[ClusterId].getCovMat();
                    for (int i = 0; i < d.length; i++) {
                        for (int j = 0; j < d[i].length; j++) {
                            S[i][j] += d[i][j];
                        }
                    }
                }
            }
        }

        // reset Point-association
        if (rem == this.removeRuns) {
            workClusterSize = this.clusterSize.clone();
            workBelongsToCluster = this.belongsToCluster.clone();
        }
        for (int ClusterId = 0; ClusterId < NumberOfCluster; ClusterId++) {
            if (workClusterSize[ClusterId] > 0) {
                Matrix mh = new Matrix(CovariancematrixPerCluster[ClusterId].getCovMat());
                if (this.red == METHOD_COV_REDUCTION && mh.det() == 0) {
                    CovariancematrixPerCluster[ClusterId].addMinimum();
                    mh = new Matrix(CovariancematrixPerCluster[ClusterId].getCovMat());
                } else if (this.red == METHOD_COV_REGULARIZE) {
                    Matrix mS = new Matrix(S);
                    mS = mS.times(this.regularizedLambda / this.points.length);
                    mh = mh.times((1.0 - this.regularizedLambda));
                    mh = mh.plus(mS);
                }
                // This shouldn't happen ...
                if (mh.det() == 0) {
                    CovariancematrixPerCluster[ClusterId].addMinimum();
                    mh = new Matrix(CovariancematrixPerCluster[ClusterId].getCovMat());
                }

                mh = mh.inverse();

                for (int PointId = 0; PointId < points.length; PointId++) {
                    if (workBelongsToCluster[PointId] == ClusterId) {

                        int sum = 0;
                        for (int i : remove[ClusterId])
                            sum += i;

                        double[] point = new double[points[PointId].length - sum];

                        int count = 0;
                        for (int ind = 0; ind < remove[ClusterId].length; ind++) {
                            if (remove[ClusterId][ind] == 1)
                                count++;
                            int newid = ind - count;
                            if (newid < 0)
                                continue;
                            point[newid] = this.points[PointId][newid];
                        }

                        double mahaDist;
                        if (this.red == 0)
                            mahaDist = mahalanobisDistance(point, mh);
                        else
                            mahaDist = mahalanobisDistance(this.points[PointId], mh);
                        result[PointId] = mahaDist / DistanceLimit;

                        // remove association for minimum covariance
                        // determinant
                        if (rem != this.removeRuns && mahaDist > DistanceLimitPerCluster[ClusterId]) {
                            workBelongsToCluster[PointId] = NumberOfCluster;
                            workClusterSize[ClusterId]--;
                        }
                    }
                }
            }
        }
    }

    return result;
}
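
In this evaluator the quantile is used as a limit on Mahalanobis distances, with the degrees of freedom tracking the data dimensionality, which is why the limit is recomputed after constant attributes are dropped. A hedged sketch of that pattern; the helper name, dimensions and probability are illustrative only.

import org.apache.commons.math.MathException;
import org.apache.commons.math.distribution.ChiSquaredDistributionImpl;

public class DistanceLimitSketch {
    // Returns the chi-squared quantile for the given dimensionality, or -1 on error,
    // mirroring the error convention of the evaluator above.
    static double distanceLimit(int dimensions, double probability) {
        double limit = -1;
        try {
            limit = new ChiSquaredDistributionImpl(dimensions).inverseCumulativeProbability(probability);
        } catch (MathException e) {
            System.out.println(e);
        }
        return limit;
    }

    public static void main(String[] args) {
        double p = 0.975;                           // assumed normal-instance probability
        double fullLimit = distanceLimit(10, p);    // limit for 10-dimensional points
        double reducedLimit = distanceLimit(8, p);  // recomputed after dropping two constant attributes
        System.out.println(fullLimit + " vs. " + reducedLimit);
    }
}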

From source file:org.apache.mahout.freqtermsets.fpgrowth.FPGrowth.java

/**
 * Internal TopKFrequentPattern Generation algorithm, which represents the
 * A's as integers and transforms features to use only integers
 *
 * @param transactions
 *          Transaction database Iterator
 * @param attributeFrequency
 *          array representing the Frequency of the corresponding
 *          attribute id
 * @param minSupport
 *          minimum support of the pattern to be mined
 * @param k
 *          Max value of the Size of the Max-Heap in which Patterns are
 *          held
 * @param featureSetSize
 *          number of features
 * @param returnFeatures
 *          the id's of the features for which Top K patterns have to be
 *          mined
 * @param topKPatternsOutputCollector
 *          the outputCollector which transforms the given Pattern in
 *          integer format to the corresponding A Format
 */
private void generateTopKFrequentPatterns(
        // Iterator<Pair<int[], Long>> transactions,
        TransactionTree cTree, OpenObjectIntHashMap<A> attributeIdMapping, long[] attributeFrequency,
        long minSupport, int k, int featureSetSize, Collection<Integer> returnFeatures,
        TopKPatternsOutputConverter<A> topKPatternsOutputCollector, StatusUpdater updater) throws IOException {
    // YA: BONSAAAAAAAII {
    // FPTree tree = new FPTree(featureSetSize);
    FPTree tree = null;
    boolean change = true;
    int pruneIters = 0;
    IntArrayList pruneByContingencyCount = new IntArrayList();
    IntArrayList pruneBySpreadCount = new IntArrayList();

    while (change) {
        pruneByContingencyCount.add(0);
        pruneBySpreadCount.add(0);

        change = false;
        tree = new FPTree(featureSetSize);
        OpenIntLongHashMap[] childJointFreq;
        long[] sumChildSupport;
        if (BONSAI_PRUNE) {
            childJointFreq = new OpenIntLongHashMap[featureSetSize];
            sumChildSupport = new long[featureSetSize];
        }
        double supportGrandTotal = 0;
        // } YA: BONSAAAAAAAII

        for (int i = 0; i < featureSetSize; i++) {
            tree.addHeaderCount(i, attributeFrequency[i]);

            // YA: BONSAAAAAAAII {
            if (attributeFrequency[i] < 0) {
                continue; // this is an attribute not satisfying the
                // monotone constraint
            }
            if (BONSAI_PRUNE) {
                childJointFreq[i] = new OpenIntLongHashMap();
                supportGrandTotal += attributeFrequency[i];
            }
            // } YA: Bonsai
        }

        // Constructing initial FPTree from the list of transactions
        // YA Bonsai : To pass the tree itself the iterator now would work
        // only with ints.. the A type argument is
        // not checked in the constructor. TODO: remove the type argument and
        // force using ints only
        Iterator<Pair<int[], Long>> transactions = new IntTransactionIterator(cTree.iterator(),
                attributeIdMapping);

        int nodecount = 0;
        // int attribcount = 0;
        int i = 0;
        while (transactions.hasNext()) {
            Pair<int[], Long> transaction = transactions.next();
            Arrays.sort(transaction.getFirst());
            // attribcount += transaction.length;
            // YA: Bonsai {
            // nodecount += treeAddCount(tree, transaction.getFirst(),
            // transaction.getSecond(), minSupport, attributeFrequency);
            int temp = FPTree.ROOTNODEID;
            boolean addCountMode = true;
            for (int attribute : transaction.getFirst()) {
                if (attributeFrequency[attribute] < 0) {
                    continue; // this is an attribute not satisfying the
                    // monotone constraint
                }
                if (attributeFrequency[attribute] < minSupport) {
                    break;
                }
                if (BONSAI_PRUNE && tree.attribute(temp) != -1) { // Root node
                    childJointFreq[tree.attribute(temp)].put(attribute,
                            childJointFreq[tree.attribute(temp)].get(attribute) + transaction.getSecond());
                    sumChildSupport[tree.attribute(temp)] += transaction.getSecond();
                }
                int child;
                if (addCountMode) {
                    child = tree.childWithAttribute(temp, attribute);
                    if (child == -1) {
                        addCountMode = false;
                    } else {
                        tree.addCount(child, transaction.getSecond());
                        temp = child;
                    }
                }
                if (!addCountMode) {
                    child = tree.createNode(temp, attribute, transaction.getSecond());
                    temp = child;
                    nodecount++;
                }
            }
            // } YA Bonsai
            i++;
            if (i % 10000 == 0) {
                log.info("FPTree Building: Read {} Transactions", i);
            }
        }

        log.info("Number of Nodes in the FP Tree: {}", nodecount);

        // YA: BONSAAAAAAAII {
        if (BONSAI_PRUNE) {
            if (log.isTraceEnabled())
                log.info("Bonsai prunining tree: {}", tree.toString());

            for (int a = 0; a < tree.getHeaderTableCount(); ++a) {
                int attr = tree.getAttributeAtIndex(a);

                if (attributeFrequency[attr] < 0) {
                    continue; // this is an attribute not satisfying the
                    // monotone constraint
                }
                if (attributeFrequency[attr] < minSupport) {
                    break;
                }
                // if (sumChildSupport[attr] < attributeFrequency[attr]) {
                // // the case of . (full stop) as the next child
                // childJointFreq[attr]
                // .put(-1,
                // (long) (attributeFrequency[attr] - sumChildSupport[attr]));
                // }
                float numChildren = childJointFreq[attr].size();

                // if (numChildren < LEAST_NUM_CHILDREN_TO_VOTE_FOR_NOISE) {
                // continue;
                // }
                if (log.isTraceEnabled()) {
                    log.trace("Voting for noisiness of attribute {} with number of children: {}", attr,
                            numChildren);
                    log.trace("Attribute support: {} - Total Children support: {}", attributeFrequency[attr],
                            sumChildSupport[attr]);
                }
                // EMD and the such.. the threshold isn't easy to define, and it
                // also doesn't take into account the weights of children.
                // // double uniformProb = 1.0 / numChildren;
                // // double uniformProb = sumChildSupport[attr] /
                // supportGrandTotal;
                // double uniformFreq = attributeFrequency[attr] / numChildren;
                // IntArrayList childAttrArr = childJointFreq[attr].keys();
                // // IntArrayList childAttrArr = new IntArrayList();
                // // childJointFreq[attr].keysSortedByValue(childAttrArr);
                // double totalDifference = 0;
                // double sumOfWeights = 0;
                // // double emd = 0;
                // for (int c = childAttrArr.size() - 1; c >=0 ; --c) {
                // int childAttr = childAttrArr.get(c);
                // double childJF = childJointFreq[attr].get(childAttr);
                // double childWeight = attributeFrequency[childAttr];
                // totalDifference += childWeight * Math.abs(childJF -
                // uniformFreq);
                // sumOfWeights += childWeight;
                //
                // // double jointProb = childJF /
                // // supportGrandTotal;
                // // double childProb = attributeFrequency[childAttr] /
                // // supportGrandTotal;
                // // double childConditional = childJF /
                // attributeFrequency[attr];
                // // emd = childConditional + emd - uniformProb;
                // // emd = childJF + emd - uniformFreq;
                // // totalDifference += Math.abs(emd);
                // }
                // // Probability (D > observed ) = QKS Ne + 0.12 + 0.11/ Ne D
                // // double pNotUniform = totalDifference / attrSupport;
                // // double threshold = (numChildren * (numChildren - 1) * 1.0)
                // // / (2.0 * attributeFrequency[attr]);
                // double weightedDiff = totalDifference / sumOfWeights;
                // double threshold = sumOfWeights / 2.0; // each child can be
                // up to
                // // 1 over or below the
                // // uniform freq
                // boolean noise = weightedDiff < threshold;
                // log.info("EMD: {} - Threshold: {}", weightedDiff, threshold);
                // ///////////////////////////////////
                // Log odds.. this is my hartala, and it needs to be shifted
                // according to the number of children
                // // // if there is one child then the prob of random choice
                // // will be
                // // // 1, so anything would be
                // // // noise
                // // // and if there are few then the probability that this is
                // // // actually noise declines
                // // if (numChildren >= LEAST_NUM_CHILDREN_TO_VOTE_FOR_NOISE)
                // // {
                // // log.info(
                // //
                // "Voting for noisiness of attribute {} with number of children: {}",
                // // currentAttribute, numChildren);
                // // log.info(
                // // "Attribute support: {} - Total Children support: {}",
                // // attrSupport, sumOfChildSupport);
                // // int noiseVotes = 0;
                // // double randomSelectionLogOdds = 1.0 / numChildren;
                // // randomSelectionLogOdds = Math.log(randomSelectionLogOdds
                // // / (1 - randomSelectionLogOdds));
                // // randomSelectionLogOdds =
                // // Math.abs(randomSelectionLogOdds);
                // //
                // // IntArrayList childAttrArr = childJointFreq.keys();
                // // for (int c = 0; c < childAttrArr.size(); ++c) {
                // // double childConditional = 1.0
                // // * childJointFreq.get(childAttrArr.get(c))
                // // / sumOfChildSupport; // attrSupport;
                // // double childLogOdds = Math.log(childConditional
                // // / (1 - childConditional));
                // // if (Math.abs(childLogOdds) <= randomSelectionLogOdds) {
                // // // probability of the child given me is different
                // // // than
                // // // probability of choosing the
                // // // child randomly
                // // // from among my children.. using absolute log odds
                // // // because they are symmetric
                // // ++noiseVotes;
                // // }
                // // }
                // // log.info("Noisy if below: {} - Noise votes: {}",
                // // randomSelectionLogOdds, noiseVotes);
                // // noise = noiseVotes == numChildren;
                // ////////////////////////////////////////////////////

                // // Kullback-liebler divergence from the uniform distribution
                // double randomChild = 1.0 / numChildren;
                // IntArrayList childAttrArr = childJointFreq[attr].keys();
                //
                // double klDivergence = 0;
                // for (int c = 0; c < childAttrArr.size(); ++c) {
                // double childConditional = 1.0
                // * childJointFreq[attr].get(childAttrArr.get(c))
                // / attributeFrequency[attr];
                // if (childConditional == 0) {
                // continue; // a7a!
                // }
                // klDivergence += childConditional
                // * Math.log(childConditional / randomChild);
                // }
                //
                // boolean noise = Math.abs(klDivergence) < 0.05;
                // log.info("KL-Divergence: {} - Noise less than: {}",
                // klDivergence, 0.05);
                // //////////////////////////////////////
                // Pair wise metric with different children
                SummaryStatistics metricSummary = new SummaryStatistics();
                // double[] metric = new double[(int) numChildren];

                // SummaryStatistics spreadSummary = new SummaryStatistics();
                // double uniformSpread = attributeFrequency[attr] /
                // numChildren;
                double goodnessOfFit = 0.0;
                // If I don't take the . into account: sumChildSupport[attr] /
                // numChildren;

                double sumOfWeights = 0;
                IntArrayList childAttrArr = childJointFreq[attr].keys();
                for (int c = 0; c < childAttrArr.size(); ++c) {
                    int childAttr = childAttrArr.get(c);
                    double[][] contingencyTable = new double[2][2];
                    if (childAttr == -1) {
                        // this is meaningless, as yuleq will just be 1
                        contingencyTable[1][1] = childJointFreq[attr].get(childAttr);
                        contingencyTable[1][0] = sumChildSupport[attr];
                        // equals attributeFrequency[attr] -
                        // contingencyTable[1][1];
                        contingencyTable[0][1] = 0;
                        contingencyTable[0][0] = supportGrandTotal - attributeFrequency[attr];
                    } else {
                        contingencyTable[1][1] = childJointFreq[attr].get(childAttr);
                        contingencyTable[1][0] = attributeFrequency[attr] - contingencyTable[1][1];
                        contingencyTable[0][1] = attributeFrequency[childAttr] - contingencyTable[1][1];
                        contingencyTable[0][0] = supportGrandTotal - attributeFrequency[attr]
                                - attributeFrequency[childAttr] + contingencyTable[1][1];
                        // because of the meaninglessness of yuleq in case of . }
                        double ad = contingencyTable[0][0] * contingencyTable[1][1];
                        double bc = contingencyTable[0][1] * contingencyTable[1][0];
                        double yuleq = (ad - bc) / (ad + bc);
                        double weight = attributeFrequency[childAttr];
                        sumOfWeights += weight;
                        metricSummary.addValue(Math.abs(yuleq * weight));
                        // metricSummary.addValue(yuleq * yuleq * weight);
                    }
                    // spreadSummary.addValue(Math.abs(uniformSpread
                    // - contingencyTable[1][1])
                    // / numChildren);
                    // spreadSummary.addValue(contingencyTable[1][1]); // *
                    // weight
                    goodnessOfFit += contingencyTable[1][1] * contingencyTable[1][1];
                }
                // double weightedquadraticMean =
                // Math.sqrt(metricSummary.getSum() / sumOfWeights);
                double weightedMean = (metricSummary.getSum() / sumOfWeights);

                boolean noise = false;
                // if (weightedMean < 0.5) {
                // pruneByContingencyCount.set(pruneIters, pruneByContingencyCount.get(pruneIters) + 1);
                // noise = true;
                // } else if (weightedMean < 0.95) {
                if (numChildren > 1) {
                    double n = sumChildSupport[attr]; // attributeFrequency[attr];
                    goodnessOfFit /= (n / numChildren);
                    goodnessOfFit -= n;
                    ChiSquaredDistributionImpl chisqDist = new ChiSquaredDistributionImpl(numChildren - 1);
                    double criticalPoint = -1;
                    try {
                        criticalPoint = chisqDist.inverseCumulativeProbability(1.0 - SIGNIFICANCE / 2.0);
                    } catch (MathException e) {
                        log.error(e.getMessage(), e);
                    }
                    if (goodnessOfFit < criticalPoint) {
                        pruneBySpreadCount.set(pruneIters, pruneBySpreadCount.get(pruneIters) + 1);
                        noise = true;
                    }
                    // // double spreadCentraltendency = (spreadSummary.getMax()
                    // -
                    // // spreadSummary.getMin()) / 2.0;
                    // // spreadSummary.getMean();
                    // // double uniformSpread = sumChildSupport[attr] /
                    // // numChildren;
                    //
                    // // noise = Math.abs(spreadCentraltendency -
                    // uniformSpread) <
                    // // 1e-4;
                    //
                    // double spreadCentraltendency = spreadSummary.getMean();
                    // // (spreadSummary.getMax() -
                    // // spreadSummary.getMin()) / 2.0;
                    // if(spreadCentraltendency < 1e-6){
                    // noise = true;
                    // }
                    //
                    // if (!noise && numChildren > 0) {
                    // // see if the difference is statistically significant
                    // double spreadCI = getConfidenceIntervalHalfWidth(
                    // spreadSummary, SIGNIFICANCE);
                    // spreadCentraltendency -= spreadCI;
                    // if (spreadCentraltendency < 0) {
                    // noise = true;
                    // }
                    // // // noise if the CI contains the uniform spread
                    // // threshold
                    // // if (spreadCentraltendency > uniformSpread) {
                    // // noise = (spreadCentraltendency - spreadCI) <
                    // // uniformSpread;
                    // // } else {
                    // // noise = (spreadCentraltendency + spreadCI) >
                    // // uniformSpread;
                    // // }
                    // }
                }
                change |= noise;

                if (noise) {
                    if (log.isTraceEnabled())
                        log.info("Pruning attribute {} with child joint freq {}", attr, childJointFreq[attr]);
                    returnFeatures.remove(attr);
                    attributeFrequency[attr] = -1;

                }
            }
        }
        ++pruneIters;
    }
    if (log.isTraceEnabled()) {
        log.info("Pruned tree: {}", tree.toString());
        log.info("Prune by contingency: {} - Prune by spread: {}", pruneByContingencyCount.toString(),
                pruneBySpreadCount.toString());
    }
    // } YA: Bonsai
    fpGrowth(tree, minSupport, k, returnFeatures, topKPatternsOutputCollector, updater);
}
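
One detail of the pruning step above is worth spelling out: because every child is expected to receive the same count, expected = n / numChildren, and the observed counts sum to n, the textbook Pearson sum of (observed - expected)^2 / expected collapses to (sum of observed^2) / expected - n, which is exactly what the two lines dividing goodnessOfFit by (n / numChildren) and then subtracting n compute before the comparison against the chi-squared critical point. A small sketch with made-up counts verifying that identity:

public class GoodnessOfFitIdentitySketch {
    public static void main(String[] args) {
        // Made-up child counts; n is their sum and each child is expected to receive n / k.
        double[] observed = { 12, 7, 9, 4 };
        double n = 0;
        for (double o : observed) {
            n += o;
        }
        double expected = n / observed.length;

        // Textbook Pearson statistic: sum of (observed - expected)^2 / expected.
        double pearson = 0;
        for (double o : observed) {
            pearson += (o - expected) * (o - expected) / expected;
        }

        // Shortcut used in the pruning code: sum of observed^2, divided by expected, minus n.
        double shortcut = 0;
        for (double o : observed) {
            shortcut += o * o;
        }
        shortcut = shortcut / expected - n;

        System.out.println(pearson + " == " + shortcut); // equal up to floating-point rounding
    }
}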