Example usage for org.apache.mahout.clustering ClusteringUtils totalClusterCost

Introduction

On this page you can find example usage for org.apache.mahout.clustering ClusteringUtils.totalClusterCost.

Prototype

public static double totalClusterCost(Iterable<? extends Vector> datapoints, Searcher centroids) 

Document

Adds up the distances from each point to its closest cluster and returns the sum.
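Before the full source example below, here is a minimal, self-contained sketch of calling totalClusterCost directly. It assumes Mahout's BruteSearch (org.apache.mahout.math.neighborhood) and EuclideanDistanceMeasure (org.apache.mahout.common.distance) as the centroid searcher and distance measure; the class name, data points, and centroid coordinates are made up for illustration.

import java.util.List;

import com.google.common.collect.Lists;

import org.apache.mahout.clustering.ClusteringUtils;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.UpdatableSearcher;

public class TotalClusterCostExample {
    public static void main(String[] args) {
        // Two toy 2-dimensional data points (made up for illustration).
        List<Vector> datapoints = Lists.newArrayList(
                (Vector) new DenseVector(new double[] {0.0, 0.0}),
                new DenseVector(new double[] {4.0, 0.0}));

        // A Searcher holding the cluster centroids; BruteSearch does exact nearest-neighbour lookups.
        UpdatableSearcher centroids = new BruteSearch(new EuclideanDistanceMeasure());
        centroids.add(new DenseVector(new double[] {1.0, 0.0}));
        centroids.add(new DenseVector(new double[] {3.0, 0.0}));

        // Sum of distances from each point to its nearest centroid: 1.0 + 1.0 = 2.0.
        double cost = ClusteringUtils.totalClusterCost(datapoints, centroids);
        System.out.println("Total cluster cost: " + cost);
    }
}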

Usage

From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License: Apache License

/**
 * Clusters the datapoints in the list doing either random seeding of the centroids or k-means++.
 *
 * @param datapoints the points to be clustered.
 * @return an UpdatableSearcher with the resulting clusters.
 */
public UpdatableSearcher cluster(List<? extends WeightedVector> datapoints) {
    Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> trainTestSplit = splitTrainTest(
            datapoints);
    List<Vector> bestCentroids = Lists.newArrayList();
    double cost = Double.POSITIVE_INFINITY;
    double bestCost = Double.POSITIVE_INFINITY;
    for (int i = 0; i < numRuns; ++i) {
        centroids.clear();
        if (kMeansPlusPlusInit) {
            // Use k-means++ to set initial centroids.
            initializeSeedsKMeansPlusPlus(trainTestSplit.getFirst());
        } else {
            // Randomly select the initial centroids.
            initializeSeedsRandomly(trainTestSplit.getFirst());
        }
        // Do k-means iterations with trimmed mean computation (aka ball k-means).
        if (numRuns > 1) {
            // If the clustering is successful (there are no zero-weight centroids).
            iterativeAssignment(trainTestSplit.getFirst());
            // Compute the cost of the clustering and possibly save the centroids.
            cost = ClusteringUtils.totalClusterCost(splitTrainTest ? datapoints : trainTestSplit.getSecond(),
                    centroids);
            if (cost < bestCost) {
                bestCost = cost;
                bestCentroids.clear();
                Iterables.addAll(bestCentroids, centroids);
            }
        } else {
            // If there is only going to be one run, the cost doesn't need to be computed, so we just return the clustering.
            iterativeAssignment(datapoints);
            return centroids;
        }
    }
    if (bestCost == Double.POSITIVE_INFINITY) {
        throw new RuntimeException("No valid clustering was found");
    }
    if (cost != bestCost) {
        centroids.clear();
        centroids.addAll(bestCentroids);
    }
    if (correctWeights) {
        for (WeightedVector testDatapoint : trainTestSplit.getSecond()) {
            WeightedVector closest = (WeightedVector) centroids.searchFirst(testDatapoint, false).getValue();
            closest.setWeight(closest.getWeight() + testDatapoint.getWeight());
        }
    }
    return centroids;
}
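
In this example, totalClusterCost serves as the model-selection criterion across the numRuns restarts: after each run, the resulting centroids are scored against either the full data set or the held-out test split (depending on splitTrainTest), and the centroids with the lowest total cost are kept as the final clustering.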