Example usage for org.apache.mahout.clustering ClusteringUtils totalClusterCost

Introduction

On this page you can find example usage for org.apache.mahout.clustering ClusteringUtils.totalClusterCost.

Prototype

public static double totalClusterCost(Iterable<? extends Vector> datapoints, Searcher centroids) 

Document

Adds up the distances from each point to its closest cluster and returns the sum.
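Before the full source example below, here is a minimal, self-contained sketch of calling totalClusterCost directly. It assumes Mahout's BruteSearch (org.apache.mahout.math.neighborhood) and EuclideanDistanceMeasure (org.apache.mahout.common.distance) as the centroid searcher and distance measure; the class name, data points, and centroid coordinates are made up for illustration.

import java.util.List;

import com.google.common.collect.Lists;

import org.apache.mahout.clustering.ClusteringUtils;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.UpdatableSearcher;

public class TotalClusterCostExample {
    public static void main(String[] args) {
        // Two toy 2-dimensional data points (made up for illustration).
        List<Vector> datapoints = Lists.newArrayList(
                (Vector) new DenseVector(new double[] {0.0, 0.0}),
                new DenseVector(new double[] {4.0, 0.0}));

        // A Searcher holding the cluster centroids; BruteSearch does exact nearest-neighbour lookups.
        UpdatableSearcher centroids = new BruteSearch(new EuclideanDistanceMeasure());
        centroids.add(new DenseVector(new double[] {1.0, 0.0}));
        centroids.add(new DenseVector(new double[] {3.0, 0.0}));

        // Sum of distances from each point to its nearest centroid: 1.0 + 1.0 = 2.0.
        double cost = ClusteringUtils.totalClusterCost(datapoints, centroids);
        System.out.println("Total cluster cost: " + cost);
    }
}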

Usage

From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License: Apache License

/**
 * Clusters the datapoints in the list doing either random seeding of the centroids or k-means++.
 *
 * @param datapoints the points to be clustered.
 * @return an UpdatableSearcher with the resulting clusters.
 */
public UpdatableSearcher cluster(List<? extends WeightedVector> datapoints) {
    Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> trainTestSplit = splitTrainTest(
            datapoints);
    List<Vector> bestCentroids = Lists.newArrayList();
    double cost = Double.POSITIVE_INFINITY;
    double bestCost = Double.POSITIVE_INFINITY;
    for (int i = 0; i < numRuns; ++i) {
        centroids.clear();
        if (kMeansPlusPlusInit) {
            // Use k-means++ to set initial centroids.
            initializeSeedsKMeansPlusPlus(trainTestSplit.getFirst());
        } else {
            // Randomly select the initial centroids.
            initializeSeedsRandomly(trainTestSplit.getFirst());
        }
        // Do k-means iterations with trimmed mean computation (aka ball k-means).
        if (numRuns > 1) {
            // If the clustering is successful (there are no zero-weight centroids).
            iterativeAssignment(trainTestSplit.getFirst());
            // Compute the cost of the clustering and possibly save the centroids.
            cost = ClusteringUtils.totalClusterCost(splitTrainTest ? datapoints : trainTestSplit.getSecond(),
                    centroids);
            if (cost < bestCost) {
                bestCost = cost;
                bestCentroids.clear();
                Iterables.addAll(bestCentroids, centroids);
            }
        } else {
            // If there is only going to be one run, the cost doesn't need to be computed, so we just return the clustering.
            iterativeAssignment(datapoints);
            return centroids;
        }
    }
    if (bestCost == Double.POSITIVE_INFINITY) {
        throw new RuntimeException("No valid clustering was found");
    }
    if (cost != bestCost) {
        centroids.clear();
        centroids.addAll(bestCentroids);
    }
    if (correctWeights) {
        for (WeightedVector testDatapoint : trainTestSplit.getSecond()) {
            WeightedVector closest = (WeightedVector) centroids.searchFirst(testDatapoint, false).getValue();
            closest.setWeight(closest.getWeight() + testDatapoint.getWeight());
        }
    }
    return centroids;
}
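
In this example, totalClusterCost serves as the model-selection criterion across the numRuns restarts: after each run, the resulting centroids are scored against either the full data set or the held-out test split (depending on splitTrainTest), and the centroids with the lowest total cost are kept as the final clustering.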