List of usage examples for org.apache.mahout.math.random WeightedThing getValue
public T getValue()
From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java
License:Apache License
/** * Examines the datapoints and updates cluster centers to be the centroid of the nearest datapoints points. To * compute a new center for cluster c_i, we average all points that are closer than d_i * trimFraction * where d_i is//from ww w.jav a 2s.c o m * * d_i = min_j \sqrt ||c_j - c_i||^2 * * By ignoring distant points, the centroids converge more quickly to a good approximation of the * optimal k-means solution (given good starting points). * * @param datapoints the points to cluster. */ private void iterativeAssignment(List<? extends WeightedVector> datapoints) { DistanceMeasure distanceMeasure = centroids.getDistanceMeasure(); // closestClusterDistances.get(i) is the distance from the i'th cluster to its closest // neighboring cluster. List<Double> closestClusterDistances = Lists.newArrayListWithExpectedSize(numClusters); // clusterAssignments[i] == j means that the i'th point is assigned to the j'th cluster. When // these don't change, we are done. // Each point is assigned to the invalid "-1" cluster initially. List<Integer> clusterAssignments = Lists.newArrayList(Collections.nCopies(datapoints.size(), -1)); boolean changed = true; for (int i = 0; changed && i < maxNumIterations; i++) { changed = false; // We compute what the distance between each cluster and its closest neighbor is to set a // proportional distance threshold for points that should be involved in calculating the // centroid. closestClusterDistances.clear(); for (Vector center : centroids) { // If a centroid has no points assigned to it, the clustering failed. Vector closestOtherCluster = centroids.searchFirst(center, true).getValue(); closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster)); } // Copies the current cluster centroids to newClusters and sets their weights to 0. This is // so we calculate the new centroids as we go through the datapoints. List<Centroid> newCentroids = Lists.newArrayList(); for (Vector centroid : centroids) { // need a deep copy because we will mutate these values Centroid newCentroid = (Centroid) centroid.clone(); newCentroid.setWeight(0); newCentroids.add(newCentroid); } // Pass over the datapoints computing new centroids. for (int j = 0; j < datapoints.size(); ++j) { WeightedVector datapoint = datapoints.get(j); // Get the closest cluster this point belongs to. WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false); int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex(); double closestDistance = closestPair.getWeight(); // Update its cluster assignment if necessary. if (closestIndex != clusterAssignments.get(j)) { changed = true; clusterAssignments.set(j, closestIndex); } // Only update if the datapoints point is near enough. What this means is that the weight // of outliers is NOT taken into account and the final weights of the centroids will // reflect this (it will be less or equal to the initial sum of the weights). if (closestDistance < trimFraction * closestClusterDistances.get(closestIndex)) { newCentroids.get(closestIndex).update(datapoint); } } // Add the new centers back into searcher. centroids.clear(); centroids.addAll(newCentroids); } if (correctWeights) { for (Vector v : centroids) { ((Centroid) v).setWeight(0); } for (WeightedVector datapoint : datapoints) { Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue(); closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight()); } } }