List of usage examples for org.apache.mahout.clustering.ClusteringUtils.totalClusterCost
public static double totalClusterCost(Iterable<? extends Vector> datapoints, Searcher centroids)
From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java
License: Apache License
/**
 * Runs ball k-means over the given points and returns the resulting centroids.
 *
 * <p>Every run clears the current centroids, seeds them from the first part of the train/test
 * split (k-means++ when {@code kMeansPlusPlusInit} is set, random selection otherwise) and then
 * refines them with {@code iterativeAssignment}. When more than one run is configured, each run
 * is scored with {@code ClusteringUtils.totalClusterCost} and the cheapest centroids are kept.
 * When exactly one run is configured, the refinement is done on all of {@code datapoints} and
 * the centroids are returned immediately, without scoring and without the weight correction
 * performed at the end of the multi-run path.
 *
 * @param datapoints the points to be clustered.
 * @return the searcher holding the final centroids.
 * @throws RuntimeException if the best cost is still positive infinity after the loop, which
 *     also covers the case where the loop body never executes.
 */
public UpdatableSearcher cluster(List<? extends WeightedVector> datapoints) {
    Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> split = splitTrainTest(datapoints);
    List<? extends WeightedVector> trainingPoints = split.getFirst();
    List<? extends WeightedVector> heldOutPoints = split.getSecond();

    List<Vector> winners = Lists.newArrayList();
    double lowestCost = Double.POSITIVE_INFINITY;
    double latestCost = Double.POSITIVE_INFINITY;

    for (int run = 0; run < numRuns; run++) {
        // Start every run from a fresh set of seeds.
        centroids.clear();
        if (!kMeansPlusPlusInit) {
            // Pick the initial centroids at random.
            initializeSeedsRandomly(trainingPoints);
        } else {
            // Pick the initial centroids with k-means++.
            initializeSeedsKMeansPlusPlus(trainingPoints);
        }

        if (numRuns <= 1) {
            // A single run has nothing to be compared against, so no cost is computed and the
            // clustering is returned as soon as it has been refined.
            iterativeAssignment(datapoints);
            return centroids;
        }

        // k-means iterations with trimmed mean computation (aka ball k-means).
        iterativeAssignment(trainingPoints);

        // Score this run and remember its centroids if it is the cheapest so far.
        // NOTE(review): without a split the cost is taken over the second half of the split;
        // confirm against splitTrainTest(...) that this half is non-empty in that case.
        List<? extends WeightedVector> scoringPoints;
        if (splitTrainTest) {
            scoringPoints = datapoints;
        } else {
            scoringPoints = heldOutPoints;
        }
        latestCost = ClusteringUtils.totalClusterCost(scoringPoints, centroids);
        if (latestCost < lowestCost) {
            lowestCost = latestCost;
            winners.clear();
            Iterables.addAll(winners, centroids);
        }
    }

    if (lowestCost == Double.POSITIVE_INFINITY) {
        throw new RuntimeException("No valid clustering was found");
    }

    // The searcher still holds the centroids of the last run; swap in the best ones if the
    // last run was not the cheapest.
    if (latestCost != lowestCost) {
        centroids.clear();
        centroids.addAll(winners);
    }

    if (!correctWeights) {
        return centroids;
    }

    // Add the weight of every point in the second half of the split to its closest centroid.
    for (WeightedVector heldOut : heldOutPoints) {
        WeightedVector nearest = (WeightedVector) centroids.searchFirst(heldOut, false).getValue();
        double combinedWeight = nearest.getWeight() + heldOut.getWeight();
        nearest.setWeight(combinedWeight);
    }
    return centroids;
}