Example usage for org.apache.mahout.math.random WeightedThing getValue

Introduction

On this page you can find example usage for org.apache.mahout.math.random WeightedThing getValue.

Prototype

public T getValue()

Source Link

Usage

From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License:Apache License

/**
 * Examines the datapoints and updates cluster centers to be the centroid of the nearest datapoints.  To
 * compute a new center for cluster c_i, we average all points that are closer than d_i * trimFraction
 * where d_i is
 *
 * d_i = min_j \sqrt ||c_j - c_i||^2
 *
 * By ignoring distant points, the centroids converge more quickly to a good approximation of the
 * optimal k-means solution (given good starting points).
 *
 * @param datapoints the points to cluster.
 */
private void iterativeAssignment(List<? extends WeightedVector> datapoints) {
    DistanceMeasure metric = centroids.getDistanceMeasure();
    // nearestNeighborDistances.get(c) is the distance between cluster c and the cluster returned
    // by searching the centroids for c itself. It is rebuilt at the start of every pass.
    List<Double> nearestNeighborDistances = Lists.newArrayListWithExpectedSize(numClusters);
    // assignments.get(p) is the index of the cluster that point p was last assigned to. Every
    // point starts at -1, which is not a valid cluster, so the first pass always registers a change.
    List<Integer> assignments = Lists.newArrayList(Collections.nCopies(datapoints.size(), -1));

    int iteration = 0;
    boolean assignmentsChanged = true;
    // Keep iterating until a full pass changes no assignment or the iteration budget runs out.
    while (assignmentsChanged && iteration < maxNumIterations) {
        assignmentsChanged = false;

        // Recompute, for each cluster, the distance to its neighbor; this sets the scale of the
        // trimming threshold applied to datapoints below.
        nearestNeighborDistances.clear();
        for (Vector center : centroids) {
            // NOTE(review): the 'true' flag presumably excludes the query itself from the
            // result - confirm against the searcher's searchFirst contract.
            WeightedThing<Vector> neighbor = centroids.searchFirst(center, true);
            Vector neighborCenter = neighbor.getValue();
            nearestNeighborDistances.add(metric.distance(center, neighborCenter));
        }

        // Start the next generation of centroids as clones of the current ones with zero weight;
        // they are mutated while the datapoints are scanned, so the live centroids stay intact.
        List<Centroid> updatedCentroids = Lists.newArrayList();
        for (Vector current : centroids) {
            Centroid copy = (Centroid) current.clone();
            copy.setWeight(0);
            updatedCentroids.add(copy);
        }

        // Scan the datapoints, recording assignments and folding nearby points into the clones.
        for (int pointIndex = 0; pointIndex < datapoints.size(); ++pointIndex) {
            WeightedVector point = datapoints.get(pointIndex);
            // The search result's value is the nearest centroid, its weight is read as the distance.
            WeightedThing<Vector> nearest = centroids.searchFirst(point, false);
            WeightedVector nearestCentroid = (WeightedVector) nearest.getValue();
            int clusterIndex = nearestCentroid.getIndex();
            double distanceToCluster = nearest.getWeight();

            // Record a reassignment; any reassignment forces another pass.
            if (clusterAssignmentDiffers(assignments.get(pointIndex), clusterIndex)) {
                assignmentsChanged = true;
                assignments.set(pointIndex, clusterIndex);
            }

            // Points at or beyond the trimmed radius do not contribute to the new centroid.
            // NOTE(review): this lookup assumes a centroid's getIndex() equals its position in
            // the iteration order used to fill nearestNeighborDistances - confirm.
            double trimThreshold = trimFraction * nearestNeighborDistances.get(clusterIndex);
            if (distanceToCluster < trimThreshold) {
                updatedCentroids.get(clusterIndex).update(point);
            }
        }

        // Swap the freshly computed centroids into the searcher.
        centroids.clear();
        centroids.addAll(updatedCentroids);
        iteration++;
    }

    if (!correctWeights) {
        return;
    }

    // Recompute centroid weights from scratch: zero them all, then add each datapoint's weight
    // to its nearest centroid (no trimming is applied here).
    for (Vector v : centroids) {
        ((Centroid) v).setWeight(0);
    }
    for (WeightedVector point : datapoints) {
        WeightedThing<Vector> nearest = centroids.searchFirst(point, false);
        Centroid owner = (Centroid) nearest.getValue();
        double accumulated = owner.getWeight() + point.getWeight();
        owner.setWeight(accumulated);
    }
}

// Returns true when the stored assignment is not the given cluster index.
private static boolean clusterAssignmentDiffers(int storedIndex, int clusterIndex) {
    return storedIndex != clusterIndex;
}