Example usage for org.apache.mahout.math WeightedVector getWeight

List of usage examples for org.apache.mahout.math WeightedVector getWeight

Introduction

On this page you can find example usages of org.apache.mahout.math.WeightedVector.getWeight().

Prototype

public double getWeight() 
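
Before the full examples, a minimal sketch of the accessor in isolation. It assumes Mahout's three-argument WeightedVector constructor (vector, weight, index); the values are illustrative only:

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.WeightedVector;

// Wrap a dense vector with a weight of 2.5 and an index of 0, then read the weight back.
WeightedVector v = new WeightedVector(new DenseVector(new double[] {1, 2, 3}), 2.5, 0);
double w = v.getWeight(); // 2.5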

Usage

From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License: Apache License

/**
 * Clusters the datapoints in the list doing either random seeding of the centroids or k-means++.
 *
 * @param datapoints the points to be clustered.
 * @return an UpdatableSearcher with the resulting clusters.
 */
public UpdatableSearcher cluster(List<? extends WeightedVector> datapoints) {
    Pair<List<? extends WeightedVector>, List<? extends WeightedVector>> trainTestSplit = splitTrainTest(
            datapoints);
    List<Vector> bestCentroids = Lists.newArrayList();
    double cost = Double.POSITIVE_INFINITY;
    double bestCost = Double.POSITIVE_INFINITY;
    for (int i = 0; i < numRuns; ++i) {
        centroids.clear();
        if (kMeansPlusPlusInit) {
            // Use k-means++ to set initial centroids.
            initializeSeedsKMeansPlusPlus(trainTestSplit.getFirst());
        } else {
            // Randomly select the initial centroids.
            initializeSeedsRandomly(trainTestSplit.getFirst());
        }
        // Do k-means iterations with trimmed mean computation (aka ball k-means).
        if (numRuns > 1) {
            // If the clustering is successful (there are no zero-weight centroids).
            iterativeAssignment(trainTestSplit.getFirst());
            // Compute the cost of the clustering and possibly save the centroids.
            cost = ClusteringUtils.totalClusterCost(splitTrainTest ? datapoints : trainTestSplit.getSecond(),
                    centroids);
            if (cost < bestCost) {
                bestCost = cost;
                bestCentroids.clear();
                Iterables.addAll(bestCentroids, centroids);
            }
        } else {
            // If there is only going to be one run, the cost doesn't need to be computed, so we just return the clustering.
            iterativeAssignment(datapoints);
            return centroids;
        }
    }
    if (bestCost == Double.POSITIVE_INFINITY) {
        throw new RuntimeException("No valid clustering was found");
    }
    if (cost != bestCost) {
        centroids.clear();
        centroids.addAll(bestCentroids);
    }
    if (correctWeights) {
        for (WeightedVector testDatapoint : trainTestSplit.getSecond()) {
            WeightedVector closest = (WeightedVector) centroids.searchFirst(testDatapoint, false).getValue();
            closest.setWeight(closest.getWeight() + testDatapoint.getWeight());
        }
    }
    return centroids;
}
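
A hedged usage sketch for the method above. The constructor arguments mirror Mahout's BallKMeans (a searcher to hold the centroids, the number of clusters, and the maximum iteration count); treat them as assumptions about this Stratosphere port rather than a confirmed API:

import java.util.List;

import com.google.common.collect.Lists;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.WeightedVector;
import org.apache.mahout.math.neighborhood.BruteSearch;
import org.apache.mahout.math.neighborhood.UpdatableSearcher;

// Build 100 unit-weight points on a 10x10 grid and cluster them into 5 balls.
List<WeightedVector> datapoints = Lists.newArrayList();
for (int i = 0; i < 100; i++) {
    datapoints.add(new WeightedVector(new DenseVector(new double[] {i % 10, i / 10}), 1, i));
}
// Assumed constructor: (UpdatableSearcher for centroids, numClusters, maxNumIterations).
BallKMeans clusterer = new BallKMeans(new BruteSearch(new SquaredEuclideanDistanceMeasure()), 5, 20);
UpdatableSearcher clusters = clusterer.cluster(datapoints);
for (Vector centroid : clusters) {
    System.out.println(centroid);
}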

From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License: Apache License

/**
 * Selects some of the original points randomly, with probability proportional to their weights. This is much
 * less sophisticated than the kmeans++ approach, but it is faster.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 */
private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) {
    int numDatapoints = datapoints.size();
    double totalWeight = 0;
    for (WeightedVector datapoint : datapoints) {
        totalWeight += datapoint.getWeight();
    }
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < numDatapoints; ++i) {
        seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight);
    }
    for (int i = 0; i < numClusters; ++i) {
        int sample = seedSelector.sample();
        seedSelector.delete(sample);
        Centroid centroid = new Centroid(datapoints.get(sample));
        centroid.setIndex(i);
        centroids.add(centroid);
    }
}
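
The weighted sampling above can be exercised on its own. A minimal sketch, using only the Multinomial operations already appearing in this file (add, sample, delete):

import org.apache.mahout.math.random.Multinomial;

// Three candidate indices with weights 0.5, 0.3, and 0.2; draw two distinct seeds.
Multinomial<Integer> selector = new Multinomial<Integer>();
selector.add(0, 0.5);
selector.add(1, 0.3);
selector.add(2, 0.2);
int first = selector.sample();  // drawn with probability proportional to its weight
selector.delete(first);         // sampling without replacement, as in the seeding loop
int second = selector.sample(); // drawn from the renormalized remainder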

From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License: Apache License

/**
 * Selects some of the original points according to the k-means++ algorithm.  The basic idea is that
 * points are selected with probability proportional to their squared distance from the nearest
 * already-selected point.  In this version, points have weights which multiply their likelihood of
 * being selected.  This is the same as if there were as many copies of the same point as indicated by the weight.
 *
 * This is pretty expensive, but it vastly improves the quality and convergence of the k-means algorithm.
 * The basic idea can be made much faster by only processing a random subset of the original points.
 * In the context of streaming k-means, the total number of possible seeds will be about k log n so this
 * selection will cost O(k^2 (log n)^2) which isn't much worse than the random sampling idea.  At
 * n = 10^9, the cost of this initialization will be about 10x worse than a reasonable random sampling
 * implementation.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 */
private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) {
    Preconditions.checkArgument(datapoints.size() > 1,
            "Must have at least two datapoints to cluster sensibly");
    Preconditions.checkArgument(datapoints.size() >= numClusters,
            String.format("Must have at least as many datapoints [%d] as clusters [%d]", datapoints.size(), numClusters));
    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
    Centroid center = new Centroid(datapoints.iterator().next());
    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
        center.update(row);
    }

    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance of the datapoints
    // from it; this accelerates seed selection.
    double deltaX = 0;
    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
    for (WeightedVector row : datapoints) {
        deltaX += distanceMeasure.distance(row, center);
    }

    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = \sum_{c_2} || c_1 - c_2 ||^2 / \sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // This can be simplified to:
    //
    // p(c_1) = (\Delta_1^2(X) + n || c_1 - c ||^2) / (2 n \Delta_1^2(X))
    //
    // where c = \sum_x x / n and \Delta_1^2(X) = \sum_x || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.

    // Multinomial distribution of vector indices for the selection seeds. These correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
        double selectionProbability = deltaX
                + datapoints.size() * distanceMeasure.distance(datapoints.get(i), center);
        seedSelector.add(i, selectionProbability);
    }

    int selected = random.nextInt(datapoints.size());
    Centroid c_1 = new Centroid(datapoints.get(selected).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection.  Initial weights are
    // set to the distance from c_1, scaled by 2 log(1 + weight) so heavier points are more likely picks.
    for (int i = 0; i < datapoints.size(); ++i) {
        WeightedVector row = datapoints.get(i);
        double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight());
        seedSelector.set(i, w);
    }

    // From here, seeds are selected with probability proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // When we only have c_1, these distances have already been set above; as we select each new
    // seed, we update the minimum distances.
    centroids.add(c_1);
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
        // Select according to weights.
        int seedIndex = seedSelector.sample();
        Centroid nextSeed = new Centroid(datapoints.get(seedIndex));
        nextSeed.setIndex(clusterIndex++);
        centroids.add(nextSeed);
        // Don't select this one again.
        seedSelector.delete(seedIndex);
        // Re-weight everything according to the minimum distance to a seed.
        for (int currSeedIndex : seedSelector) {
            WeightedVector curr = datapoints.get(currSeedIndex);
            double newWeight = nextSeed.getWeight() * distanceMeasure.distance(nextSeed, curr);
            if (newWeight < seedSelector.getWeight(currSeedIndex)) {
                seedSelector.set(currSeedIndex, newWeight);
            }
        }
    }
}

From source file: eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License: Apache License

/**
 * Examines the datapoints and updates cluster centers to be the centroid of the nearest datapoints.  To
 * compute a new center for cluster c_i, we average all points that are closer than d_i * trimFraction,
 * where d_i is the distance from c_i to its nearest neighboring cluster center:
 *
 * d_i = min_{j != i} || c_j - c_i ||
 *
 * By ignoring distant points, the centroids converge more quickly to a good approximation of the
 * optimal k-means solution (given good starting points).
 *
 * @param datapoints the points to cluster.
 */
private void iterativeAssignment(List<? extends WeightedVector> datapoints) {
    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
    // closestClusterDistances.get(i) is the distance from the i'th cluster to its closest
    // neighboring cluster.
    List<Double> closestClusterDistances = Lists.newArrayListWithExpectedSize(numClusters);
    // clusterAssignments[i] == j means that the i'th point is assigned to the j'th cluster. When
    // these don't change, we are done.
    // Each point is assigned to the invalid "-1" cluster initially.
    List<Integer> clusterAssignments = Lists.newArrayList(Collections.nCopies(datapoints.size(), -1));

    boolean changed = true;
    for (int i = 0; changed && i < maxNumIterations; i++) {
        changed = false;
        // We compute what the distance between each cluster and its closest neighbor is to set a
        // proportional distance threshold for points that should be involved in calculating the
        // centroid.
        closestClusterDistances.clear();

        for (Vector center : centroids) {
            // If a centroid has no points assigned to it, the clustering failed.
            Vector closestOtherCluster = centroids.searchFirst(center, true).getValue();
            closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster));
        }

        // Copies the current cluster centroids to newCentroids and sets their weights to 0. This is
        // so we can calculate the new centroids as we go through the datapoints.
        List<Centroid> newCentroids = Lists.newArrayList();
        for (Vector centroid : centroids) {
            // need a deep copy because we will mutate these values
            Centroid newCentroid = (Centroid) centroid.clone();
            newCentroid.setWeight(0);
            newCentroids.add(newCentroid);
        }

        // Pass over the datapoints computing new centroids.
        for (int j = 0; j < datapoints.size(); ++j) {
            WeightedVector datapoint = datapoints.get(j);
            // Get the closest cluster this point belongs to.
            WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false);
            int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex();
            double closestDistance = closestPair.getWeight();
            // Update its cluster assignment if necessary.
            if (closestIndex != clusterAssignments.get(j)) {
                changed = true;
                clusterAssignments.set(j, closestIndex);
            }
            // Only update if the datapoint is near enough. This means that the weight
            // of outliers is NOT taken into account, and the final weights of the centroids will
            // reflect this (their sum will be less than or equal to the initial sum of the weights).
            if (closestDistance < trimFraction * closestClusterDistances.get(closestIndex)) {
                newCentroids.get(closestIndex).update(datapoint);
            }
        }
        // Add the new centers back into the searcher.
        centroids.clear();
        centroids.addAll(newCentroids);
    }

    if (correctWeights) {
        for (Vector v : centroids) {
            ((Centroid) v).setWeight(0);
        }
        for (WeightedVector datapoint : datapoints) {
            Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue();
            closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight());
        }
    }
}
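
The trim rule in the assignment loop is easy to check by hand. A sketch with hypothetical numbers (trimFraction = 0.9 is an assumption for illustration, not a value taken from this class):

// Cluster i's nearest neighboring cluster is 4.0 away; a point 3.5 from cluster i
// contributes to the new centroid because 3.5 < 0.9 * 4.0 = 3.6.
double trimFraction = 0.9;
double closestClusterDistance = 4.0;
double closestDistance = 3.5;
boolean contributes = closestDistance < trimFraction * closestClusterDistance; // true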

From source file: io.ssc.relationdiscovery.SVD.java

License: Open Source License

public Matrix projectRowsOntoFeatureSpace() {

    SparseRowMatrix projection = new SparseRowMatrix(A.numRows(), rank);

    for (int patternIndex = 0; patternIndex < A.numRows(); patternIndex++) {

        Vector patternOccurrences = A.viewRow(patternIndex);

        for (int r = 0; r < rank; r++) {
            WeightedVector singularVector = singularVectors.get(r);
            double weight = singularVector.getWeight() * patternOccurrences.dot(singularVector);
            projection.setQuick(patternIndex, r, weight);
        }
    }
    return projection;
}
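
The per-entry computation above scales each dot product by the singular vector's weight, which this class appears to use to carry the singular value. A minimal sketch of one projected entry, with illustrative numbers:

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.WeightedVector;

// A row of A and one right singular vector whose weight (3.0 here) stands in for
// the singular value; the projected entry is weight * (row . singularVector).
Vector row = new DenseVector(new double[] {1, 0, 2});
WeightedVector singularVector =
        new WeightedVector(new DenseVector(new double[] {0.5, 0.5, 0.0}), 3.0, 0);
double entry = singularVector.getWeight() * row.dot(singularVector); // 3.0 * 0.5 = 1.5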

From source file: org.mpei.knn.lsh.tools.HashedVector.java

License: Apache License

public HashedVector(WeightedVector v, Matrix projection, long mask) {
    super(v.getVector(), v.getWeight(), v.getIndex());
    this.hash = mask;
}
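
A hedged usage sketch for this constructor. The weight and index are copied from the wrapped vector; since this particular constructor ignores the projection argument in favor of the precomputed mask, a null Matrix stands in here for illustration only:

import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.WeightedVector;

// Wrap an existing weighted vector; a real caller would pass its LSH projection matrix.
WeightedVector v = new WeightedVector(new DenseVector(new double[] {1, 2, 3}), 2.0, 7);
HashedVector hashed = new HashedVector(v, (Matrix) null, 0x2fL);
double w = hashed.getWeight(); // 2.0, inherited from the wrapped vector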