List of usage examples for org.apache.mahout.math.random.WeightedThing.getWeight()
public double getWeight()
From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java
License:Apache License
/** * Examines the datapoints and updates cluster centers to be the centroid of the nearest datapoints points. To * compute a new center for cluster c_i, we average all points that are closer than d_i * trimFraction * where d_i is/*from w ww .j a va 2 s.c om*/ * * d_i = min_j \sqrt ||c_j - c_i||^2 * * By ignoring distant points, the centroids converge more quickly to a good approximation of the * optimal k-means solution (given good starting points). * * @param datapoints the points to cluster. */ private void iterativeAssignment(List<? extends WeightedVector> datapoints) { DistanceMeasure distanceMeasure = centroids.getDistanceMeasure(); // closestClusterDistances.get(i) is the distance from the i'th cluster to its closest // neighboring cluster. List<Double> closestClusterDistances = Lists.newArrayListWithExpectedSize(numClusters); // clusterAssignments[i] == j means that the i'th point is assigned to the j'th cluster. When // these don't change, we are done. // Each point is assigned to the invalid "-1" cluster initially. List<Integer> clusterAssignments = Lists.newArrayList(Collections.nCopies(datapoints.size(), -1)); boolean changed = true; for (int i = 0; changed && i < maxNumIterations; i++) { changed = false; // We compute what the distance between each cluster and its closest neighbor is to set a // proportional distance threshold for points that should be involved in calculating the // centroid. closestClusterDistances.clear(); for (Vector center : centroids) { // If a centroid has no points assigned to it, the clustering failed. Vector closestOtherCluster = centroids.searchFirst(center, true).getValue(); closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster)); } // Copies the current cluster centroids to newClusters and sets their weights to 0. This is // so we calculate the new centroids as we go through the datapoints. 
List<Centroid> newCentroids = Lists.newArrayList(); for (Vector centroid : centroids) { // need a deep copy because we will mutate these values Centroid newCentroid = (Centroid) centroid.clone(); newCentroid.setWeight(0); newCentroids.add(newCentroid); } // Pass over the datapoints computing new centroids. for (int j = 0; j < datapoints.size(); ++j) { WeightedVector datapoint = datapoints.get(j); // Get the closest cluster this point belongs to. WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false); int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex(); double closestDistance = closestPair.getWeight(); // Update its cluster assignment if necessary. if (closestIndex != clusterAssignments.get(j)) { changed = true; clusterAssignments.set(j, closestIndex); } // Only update if the datapoints point is near enough. What this means is that the weight // of outliers is NOT taken into account and the final weights of the centroids will // reflect this (it will be less or equal to the initial sum of the weights). if (closestDistance < trimFraction * closestClusterDistances.get(closestIndex)) { newCentroids.get(closestIndex).update(datapoint); } } // Add the new centers back into searcher. centroids.clear(); centroids.addAll(newCentroids); } if (correctWeights) { for (Vector v : centroids) { ((Centroid) v).setWeight(0); } for (WeightedVector datapoint : datapoints) { Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue(); closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight()); } } }
From source file:zx.soft.mahout.knn.search.AbstractSearchTest.java
License:Apache License
@Test public void testOrdering() { Matrix queries = new DenseMatrix(100, 20); MultiNormal gen = new MultiNormal(20); for (int i = 0; i < 100; i++) { queries.viewRow(i).assign(gen.sample()); }/*from w w w . ja v a2s . co m*/ Searcher s = getSearch(20); // s.setSearchSize(200); s.addAllMatrixSlices(testData()); for (MatrixSlice query : queries) { List<WeightedThing<Vector>> r = s.search(query.vector(), 200); double x = 0; for (WeightedThing<Vector> thing : r) { assertTrue("Scores must be monotonic increasing", thing.getWeight() > x); x = thing.getWeight(); } } }