Example usage for org.apache.mahout.math WeightedVector setWeight

Introduction

On this page you can find example usage of org.apache.mahout.math WeightedVector setWeight.

Prototype

public void setWeight(double newWeight)

Source Link

Usage

From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License: Apache License

/**
 * Clusters the given datapoints and returns the searcher holding the resulting centroids.
 *
 * <p>The datapoints are first split in two via {@code splitTrainTest(datapoints)}. The method then
 * performs up to {@code numRuns} runs. Every run clears {@code centroids} and seeds them from the
 * first part of the split, using k-means++ when {@code kMeansPlusPlusInit} is set and random
 * seeding otherwise.
 *
 * <ul>
 *   <li>With exactly one run, {@code iterativeAssignment} is applied to <em>all</em> datapoints and
 *       the centroids are returned immediately; no cost is computed and no weight correction is
 *       applied.</li>
 *   <li>With more than one run, {@code iterativeAssignment} is applied to the first part of the
 *       split, the run is scored with {@code ClusteringUtils.totalClusterCost}, and the centroids
 *       of the lowest-cost run are kept.</li>
 * </ul>
 *
 * <p>When {@code correctWeights} is set, each point in the second part of the split adds its weight
 * to the centroid returned for it by {@code centroids.searchFirst}.
 *
 * @param datapoints the points to be clustered.
 * @return the {@code centroids} searcher, holding the centroids of the selected clustering.
 * @throws RuntimeException if no run produced a cost below positive infinity (this includes the
 *         case where the run loop never executes because {@code numRuns} is less than one).
 */
public UpdatableSearcher cluster(List<? extends WeightedVector> datapoints) {
    Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> split = splitTrainTest(
            datapoints);
    List<? extends WeightedVector> trainPoints = split.getFirst();
    List<? extends WeightedVector> testPoints = split.getSecond();

    List<Vector> winningCentroids = Lists.newArrayList();
    double lastCost = Double.POSITIVE_INFINITY;
    double lowestCost = Double.POSITIVE_INFINITY;

    for (int run = 0; run < numRuns; run++) {
        // Every run starts from an empty searcher and a fresh set of seeds.
        centroids.clear();
        if (!kMeansPlusPlusInit) {
            initializeSeedsRandomly(trainPoints);
        } else {
            initializeSeedsKMeansPlusPlus(trainPoints);
        }

        if (numRuns <= 1) {
            // Single run: there is nothing to compare against, so skip the cost computation,
            // refine the centroids on the full input and hand them back right away.
            iterativeAssignment(datapoints);
            return centroids;
        }

        // Multiple runs: refine on the first part of the split (ball k-means iterations).
        iterativeAssignment(trainPoints);

        // Score this run.
        // NOTE(review): the full input is scored when splitTrainTest is set and the second part
        // of the split otherwise -- confirm against the field's meaning that this is not inverted.
        List<? extends WeightedVector> scoringPoints;
        if (splitTrainTest) {
            scoringPoints = datapoints;
        } else {
            scoringPoints = testPoints;
        }
        lastCost = ClusteringUtils.totalClusterCost(scoringPoints, centroids);

        // Remember the centroids of the cheapest run seen so far.
        if (lastCost < lowestCost) {
            lowestCost = lastCost;
            winningCentroids.clear();
            Iterables.addAll(winningCentroids, centroids);
        }
    }

    if (lowestCost == Double.POSITIVE_INFINITY) {
        throw new RuntimeException("No valid clustering was found");
    }

    // The searcher still holds the final run's centroids; swap in the saved ones unless the
    // final run was itself the cheapest.
    boolean finalRunWasBest = lastCost == lowestCost;
    if (!finalRunWasBest) {
        centroids.clear();
        centroids.addAll(winningCentroids);
    }

    if (!correctWeights) {
        return centroids;
    }

    // Fold the weight of every point in the second part of the split into its closest centroid.
    for (WeightedVector testPoint : testPoints) {
        WeightedVector nearest = (WeightedVector) centroids.searchFirst(testPoint, false).getValue();
        double combinedWeight = nearest.getWeight() + testPoint.getWeight();
        nearest.setWeight(combinedWeight);
    }
    return centroids;
}