org.iobserve.analysis.behavior.clustering.hierarchical.GapStatisticMethod.java Source code

Java tutorial

Introduction

Here is the source code for org.iobserve.analysis.behavior.clustering.hierarchical.GapStatisticMethod.java

Source

/***************************************************************************
 * Copyright (C) 2018 iObserve Project (https://www.iobserve-devops.net)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ***************************************************************************/

package org.iobserve.analysis.behavior.clustering.hierarchical;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadLocalRandom;

import org.eclipse.net4j.util.collection.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import weka.clusterers.HierarchicalClusterer;
import weka.core.DistanceFunction;
import weka.core.Instance;
import weka.core.Instances;

/**
 * Use the Gap Statistic Method for selecting a "good" number of clusters for a given input data
 * set, and cluster this data accordingly.
 *
 * <p>
 * The within-cluster dispersion of the input data is compared, for every candidate number of
 * clusters, with the mean dispersion of randomly generated reference data sets whose attribute
 * values lie within the observed value range of the input data.
 * </p>
 *
 * @author SL
 * @since 0.0.3
 */
public class GapStatisticMethod implements IClusterSelectionMethods {

    private static final Logger LOGGER = LoggerFactory.getLogger(GapStatisticMethod.class);
    /** Number of random reference data sets the input data is compared with. */
    private static final int NUM_REF_DATA_SETS = 3;

    private final HierarchicalClusterer hierarchicalClusterer;
    private final Instances instances;
    /** Smallest observed value per attribute; set by {@link #findBoundaryValues()}. */
    private double[] minAttributeValues;
    /** Largest observed value per attribute; set by {@link #findBoundaryValues()}. */
    private double[] maxAttributeValues;

    /**
     * Constructor.
     *
     * @param hierarchicalClusterer
     *            Clusterer that performs hierarchical clustering.
     * @param instances
     *            Input data.
     */
    public GapStatisticMethod(final HierarchicalClusterer hierarchicalClusterer, final Instances instances) {
        this.hierarchicalClusterer = hierarchicalClusterer;
        this.instances = instances;
    }

    /**
     * Runs the gap statistic on the input data and clusters it with the selected number of
     * clusters.
     *
     * @return Clustering result of the input data; an empty map if no input data was given or the
     *         final clustering failed.
     */
    @Override
    public Map<Integer, List<Pair<Instance, Double>>> analyze() {

        GapStatisticMethod.LOGGER.info("Starting GapStatisticMethod");
        Map<Integer, List<Pair<Instance, Double>>> clusteringResults = new HashMap<>(); // NOPMD
        if (this.instances != null) {

            // Calculate sum of square error for each number of clusters.
            final int maxNumberOfClusters = this.instances.numInstances();
            final List<Double> essList = this.calcESSList(this.instances, maxNumberOfClusters);

            /*
             * Generate reference data sets with random attribute values. The min and max value
             * of each attribute are the boundaries of the possible random values.
             */
            this.findBoundaryValues();

            // Contains an ESS list for every reference data set.
            final List<List<Double>> essListForEachUniformDataset = new ArrayList<>();
            for (int i = 0; i < GapStatisticMethod.NUM_REF_DATA_SETS; i++) {
                final Instances uniformDistrDataSet = this.createRandomInstancesWithBoundaries();
                essListForEachUniformDataset.add(this.calcESSList(uniformDistrDataSet, maxNumberOfClusters));
            }

            // Compute the estimated gap statistic and cluster with the selected number.
            clusteringResults = this.gapStatistics(essList, essListForEachUniformDataset);
        }
        GapStatisticMethod.LOGGER.info("GapStatisticMethod done.");

        return clusteringResults;
    }

    /**
     * Calculate the Gap Statistic for each number of clusters and cluster the input data with the
     * selected number of clusters.
     *
     * @param essList
     *            List with sum of square errors for each clustering of the input data.
     * @param essListForEachUniformDataset
     *            For every reference data set, the list with sum of square errors for each
     *            clustering of that reference data.
     * @return Clustering result of input data with a "good" amount of clusters.
     */
    private Map<Integer, List<Pair<Instance, Double>>> gapStatistics(final List<Double> essList,
            final List<List<Double>> essListForEachUniformDataset) {

        // Calculate log of each ESS for calculating the Gap Statistic.
        final List<Double> logESS = GapStatisticMethod.logOfEach(essList);
        final List<List<Double>> logESSRef = new ArrayList<>();
        for (final List<Double> essListUniform : essListForEachUniformDataset) {
            logESSRef.add(GapStatisticMethod.logOfEach(essListUniform));
        }

        // Calculate the Gap Statistic for each number of clusters.
        final int numberOfReferences = logESSRef.size();
        final List<Double> gapStatistics = new ArrayList<>();
        final List<Double> wkbs = new ArrayList<>();
        for (int i = 0; i < logESS.size(); i++) {
            // Mean within-dispersion of the reference data; index i stands for i + 1 clusters.
            double wkb = 0.0;
            for (final List<Double> referenceLogs : logESSRef) {
                wkb += referenceLogs.get(i);
            }
            wkb /= numberOfReferences;
            wkbs.add(wkb);
            gapStatistics.add(wkb - logESS.get(i));
        }

        // Calculate the standard deviation.
        final List<Double> sks = this.calcStandardDeviation(logESSRef, wkbs);

        // Find good number of clusters.
        final int goodNumberOfClusters = this.findGoodNumberOfClusters(gapStatistics, sks);

        // Cluster instances with the "good" number of clusters.
        Map<Integer, List<Pair<Instance, Double>>> clusteringResults = new HashMap<>();
        try {
            this.hierarchicalClusterer.setNumClusters(goodNumberOfClusters);
            this.hierarchicalClusterer.buildClusterer(this.instances);
            clusteringResults = ClusteringResultsBuilder.buildClusteringResults(this.instances,
                    this.hierarchicalClusterer);
        } catch (final Exception e) { // NOPMD NOCS api dependency
            GapStatisticMethod.LOGGER.error("Clustering at GapStatisticMethod failed.", e);
        }

        return clusteringResults;
    }

    /**
     * Takes the natural logarithm of every entry. An entry of exactly 0 is mapped to 0 instead of
     * negative infinity, so that a dispersion of zero (e.g. only single-element clusters) does
     * not turn the gap and its standard deviation into infinite or NaN values. The same rule is
     * applied to input and reference data.
     *
     * @param values
     *            Values to take the logarithm of.
     * @return New list with the logarithm of each value, in the same order.
     */
    private static List<Double> logOfEach(final List<Double> values) {
        final List<Double> logs = new ArrayList<>(values.size());
        for (final double value : values) {
            logs.add(value == 0.0 ? 0.0 : Math.log(value));
        }
        return logs;
    }

    /**
     * Calculate, for each number of clusters, the standard deviation of the reference
     * dispersions, scaled by sqrt(1 + 1/B) where B is the number of reference data sets.
     *
     * @param bWkbs
     *            For every reference data set, the log dispersion for each number of clusters.
     * @param wkbs
     *            Mean log dispersion over all reference data sets for each number of clusters.
     * @return List of scaled standard deviations for each number of clusters.
     */
    private List<Double> calcStandardDeviation(final List<List<Double>> bWkbs, final List<Double> wkbs) {
        final List<Double> sks = new ArrayList<>(wkbs.size());
        final int numberOfReferences = bWkbs.size();
        // Floating point division on purpose: 1 / B with integers would be 0.
        final double scale = Math.sqrt(1.0 + (1.0 / numberOfReferences));

        for (int i = 0; i < wkbs.size(); i++) {
            final double mean = wkbs.get(i);
            double sumOfSquares = 0.0;
            for (final List<Double> referenceLogs : bWkbs) {
                final double deviation = referenceLogs.get(i) - mean;
                sumOfSquares += deviation * deviation;
            }
            final double sd = Math.sqrt(sumOfSquares / numberOfReferences);
            sks.add(sd * scale);
        }

        return sks;
    }

    /**
     * Find the smallest number of clusters k whose gap is at least the gap of k + 1 minus the
     * standard deviation of k + 1. Index i of both lists stands for i + 1 clusters.
     *
     * @param gapStatistics
     *            List of gap statistic for each number of clusters.
     * @param sks
     *            List of standard deviation for each number of clusters.
     * @return The first number of clusters that meets the criterion. If none does, the largest
     *         evaluated number of clusters (at least 1) is returned, because every smaller
     *         number was just shown not to meet the criterion.
     */
    private int findGoodNumberOfClusters(final List<Double> gapStatistics, final List<Double> sks) {
        int goodNumberOfClusters = Math.max(1, gapStatistics.size());

        for (int k = 0; k < (gapStatistics.size() - 1); k++) {
            final double gapK = gapStatistics.get(k);
            final double gapKPlus1 = gapStatistics.get(k + 1);

            // NaN entries (failed clusterings) never satisfy the comparison and are skipped.
            if (gapK >= (gapKPlus1 - sks.get(k + 1))) {
                goodNumberOfClusters = k + 1;
                break;
            }
        }
        return goodNumberOfClusters;
    }

    /**
     * Calculate the sum-of-square error for each number of clusters from 1 to
     * maxNumberOfClusters.
     *
     * @param instanceList
     *            Instances which are to be clustered.
     * @param maxNumberOfClusters
     *            Max cluster number.
     * @return List with exactly maxNumberOfClusters entries; the entry at index i is the
     *         sum-of-square error for i + 1 clusters, or NaN if that clustering failed.
     */
    private List<Double> calcESSList(final Instances instanceList, final int maxNumberOfClusters) {
        final List<Double> ess = new ArrayList<>(maxNumberOfClusters);
        for (int numberOfClusters = 1; numberOfClusters <= maxNumberOfClusters; numberOfClusters++) {
            double s = 0.0;
            try {
                this.hierarchicalClusterer.setNumClusters(numberOfClusters);
                this.hierarchicalClusterer.buildClusterer(instanceList);

                /*
                 * The clusterer was built on instanceList, so both the assignments and the ESS
                 * have to be computed on that same data set.
                 */
                final List<List<Integer>> assignments = this.buildClusterAssignmentsList(instanceList,
                        numberOfClusters);

                // Sum up the error sum-of-squares of all clusters (empty clusters add 0).
                for (final List<Integer> cluster : assignments) {
                    s += this.calcESS(instanceList, cluster);
                }
            } catch (final Exception e) { // NOPMD NOCS api dependency
                GapStatisticMethod.LOGGER.error("Hierarchical clustering failed.", e);
                // Placeholder keeps the list index aligned with the number of clusters.
                s = Double.NaN;
            }
            ess.add(s);
        }
        return ess;
    }

    /**
     * Creates a copy of the input instances in which every attribute value is replaced by a
     * random whole number between the truncated min and max boundary value of that attribute
     * (both inclusive).
     *
     * @return Instances with random attribute values in range of the boundary values.
     */
    private Instances createRandomInstancesWithBoundaries() {
        final Instances randomInstances = new Instances(this.instances);
        final ThreadLocalRandom random = ThreadLocalRandom.current();
        for (int i = 0; i < randomInstances.numInstances(); i++) {
            final Instance randomInstance = randomInstances.instance(i);
            for (int j = 0; j < randomInstances.numAttributes(); j++) {
                final long lower = (long) this.minAttributeValues[j];
                final long upperExclusive = (long) this.maxAttributeValues[j] + 1;
                /*
                 * nextLong rejects an empty range. That happens when no boundary was found for
                 * an attribute (the boundaries are still +/- infinity); the copied value is
                 * kept in that case.
                 */
                if (lower < upperExclusive) {
                    randomInstance.setValue(j, random.nextLong(lower, upperExclusive));
                }
            }
        }
        return randomInstances;
    }

    /**
     * Finds min and max values for each attribute of the input instances and stores them in
     * minAttributeValues and maxAttributeValues. Attributes for which no value compares against
     * the initial boundaries keep positive / negative infinity.
     */
    private void findBoundaryValues() {
        final int numAttributes = this.instances.numAttributes();
        final double[] minBoundaryValues = new double[numAttributes];
        final double[] maxBoundaryValues = new double[numAttributes];
        for (int j = 0; j < numAttributes; j++) {
            minBoundaryValues[j] = Double.POSITIVE_INFINITY;
            maxBoundaryValues[j] = Double.NEGATIVE_INFINITY;
        }

        for (int i = 0; i < this.instances.numInstances(); i++) {
            final Instance instance = this.instances.instance(i);
            for (int j = 0; j < numAttributes; j++) {
                final double attributeValue = instance.value(j);
                if (attributeValue < minBoundaryValues[j]) {
                    minBoundaryValues[j] = attributeValue;
                }
                if (attributeValue >= maxBoundaryValues[j]) {
                    maxBoundaryValues[j] = attributeValue;
                }
            }
        }
        this.minAttributeValues = minBoundaryValues;
        this.maxAttributeValues = maxBoundaryValues;
    }

    // From HierarchicalClusterer
    /**
     * Calculates the error sum-of-squares (ESS) for a given cluster of the input data.
     *
     * @param cluster
     *            Indices of the input instances that form the cluster.
     * @return ESS; 0 for clusters with less than two instances.
     **/
    public double calcESS(final List<Integer> cluster) {
        return this.calcESS(this.instances, cluster);
    }

    /**
     * Calculates the error sum-of-squares (ESS) for a cluster of the given data set: the mean
     * distance, measured with the clusterer's distance function, between the instances of the
     * cluster and their centroid.
     *
     * @param data
     *            Data set the cluster indices refer to.
     * @param cluster
     *            Indices of the instances in data that form the cluster.
     * @return ESS; 0 for clusters with less than two instances.
     */
    private double calcESS(final Instances data, final List<Integer> cluster) {
        if (cluster.size() <= 1) {
            return 0.0;
        }
        final DistanceFunction distanceFunction = this.hierarchicalClusterer.getDistanceFunction();
        final int numAttributes = data.numAttributes();

        // Average value of each attribute over all instances of the cluster.
        final double[] meanAttValues = new double[numAttributes];
        for (final int index : cluster) {
            final Instance instance = data.instance(index);
            for (int j = 0; j < numAttributes; j++) {
                meanAttValues[j] += instance.value(j);
            }
        }
        for (int j = 0; j < numAttributes; j++) {
            meanAttValues[j] /= cluster.size();
        }

        /*
         * Create a centroid of this cluster by copying one of its instances and overwriting
         * its attributes with the average attribute values.
         */
        final Instance centroid = (Instance) data.instance(cluster.get(0)).copy();
        for (int j = 0; j < numAttributes; j++) {
            centroid.setValue(j, meanAttValues[j]);
        }

        // Sum up distances of each data point in cluster to centroid to get ESS.
        double clusterESS = 0.0;
        for (final int index : cluster) {
            clusterESS += distanceFunction.distance(centroid, data.instance(index));
        }
        return clusterESS / cluster.size();
    }

    /**
     * Returns a list whose i-th entry contains the indices of all instances that the current
     * clusterer assigns to cluster i.
     *
     * @param data
     *            Data set whose instances are assigned to clusters.
     * @param numberOfClusters
     *            number of total clusters of this clustering
     * @return List that contains clusters and their assigned instance indices. Instances whose
     *         assignment failed are logged and left out.
     */
    private List<List<Integer>> buildClusterAssignmentsList(final Instances data, final int numberOfClusters) {
        // Start with one empty index list per cluster.
        final List<List<Integer>> assignments = new ArrayList<>(numberOfClusters);
        for (int j = 0; j < numberOfClusters; j++) {
            assignments.add(new ArrayList<Integer>());
        }

        for (int s = 0; s < data.numInstances(); s++) {
            try {
                final int assignedCluster = this.hierarchicalClusterer.clusterInstance(data.instance(s));
                assignments.get(assignedCluster).add(s);
            } catch (final Exception e) { // NOPMD NOCS api dependency
                GapStatisticMethod.LOGGER.error("Clustering at GapStatisticMethod failed.", e);
            }
        }
        return assignments;
    }

    /**
     * Logs the instance-to-cluster assignments for debugging.
     *
     * @param assignments
     *            List of clusters and their assigned instances.
     */
    private void printAssignmentString(final List<? extends List<Integer>> assignments) {
        final StringBuilder assignmentString = new StringBuilder();
        for (final List<Integer> v : assignments) {
            assignmentString.append(v);
        }
        GapStatisticMethod.LOGGER.info("Assignments: {}\n", assignmentString);
    }
}