Example usage for org.apache.mahout.common.distance DistanceMeasure distance

Introduction

This page collects example usages of the distance method of org.apache.mahout.common.distance.DistanceMeasure.

Prototype

double distance(Vector v1, Vector v2);

Source Link

Document

Returns the distance metric applied to the arguments

Usage

From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License:Apache License

/**
 * Fills the {@code centroids} structure with {@code numClusters} seeds picked from the given points,
 * following the k-means++ idea: a point is more likely to become a seed the farther it is from every
 * seed chosen so far.  Point weights scale that likelihood, which is equivalent to having as many
 * copies of a point as its weight indicates.
 *
 * This seeding is fairly expensive, but it greatly improves the quality and convergence of k-means.
 * It could be made much faster by only processing a random subset of the original points.  In the
 * context of streaming k-means, the total number of possible seeds will be about k log n, so the
 * selection costs O(k^2 (log n)^2), which is not much worse than random sampling.  At n = 10^9 the
 * cost of this initialization will be about 10x worse than a reasonable random sampling
 * implementation.
 *
 * The only result of this method is its side effect on the centroids structure.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 *                   At least two points, and at least {@code numClusters} points, are required.
 */
private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) {
    final int numPoints = datapoints.size();
    Preconditions.checkArgument(numPoints > 1,
            "Must have at least two datapoints points to cluster " + "sensibly");
    Preconditions.checkArgument(numPoints >= numClusters,
            String.format("Must have more datapoints [%d] than clusters [%d]", numPoints, numClusters));

    // Overall centroid c of the data: start from the first point and fold in every other one.
    Centroid mean = new Centroid(datapoints.get(0));
    for (int i = 1; i < numPoints; i++) {
        WeightedVector point = datapoints.get(i);
        mean.update(point);
    }

    // \Delta_1^2(X): the summed distance of every point to the overall centroid.  Having it up front
    // speeds up seed selection.
    final DistanceMeasure metric = centroids.getDistanceMeasure();
    double totalDistance = 0;
    for (WeightedVector point : datapoints) {
        totalDistance += metric.distance(point, mean);
    }

    // The first seed c_1 (and conceptually the second, c_2) is meant to be chosen as in 2-means
    // clustering, so that the chance of picking the pair is proportional to || c_1 - c_2 ||^2:
    //
    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // which simplifies to
    //
    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
    //
    // with c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2.
    //
    // Every later seed c_i (c_2 included) is drawn from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.

    // Sampler over indices into the datapoints list, initially weighted by the (unnormalized)
    // first-seed probability above.
    Multinomial<Integer> sampler = new Multinomial<Integer>();
    for (int i = 0; i < numPoints; i++) {
        double distanceToMean = metric.distance(datapoints.get(i), mean);
        sampler.add(i, totalDistance + numPoints * distanceToMean);
    }

    // NOTE(review): the first seed is drawn uniformly here; the weights just loaded into the sampler
    // are overwritten below without ever being sampled.  Confirm whether that is intended.
    final int firstPick = random.nextInt(numPoints);
    Centroid firstSeed = new Centroid(datapoints.get(firstPick).clone());
    firstSeed.setIndex(0);

    // Re-weight every index by its distance to the first seed, scaled by 2 * log(1 + point weight).
    for (int i = 0; i < numPoints; i++) {
        WeightedVector point = datapoints.get(i);
        double weight = metric.distance(firstSeed, point) * 2 * Math.log(1 + point.getWeight());
        sampler.set(i, weight);
    }

    // From now on seeds are drawn with probability proportional to
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // With only c_1 present those values are already in the sampler; each newly chosen seed can
    // only lower them.
    centroids.add(firstSeed);
    for (int nextIndex = 1; centroids.size() < numClusters; nextIndex++) {
        // Draw the next seed according to the current weights.
        int pick = sampler.sample();
        Centroid seed = new Centroid(datapoints.get(pick));
        seed.setIndex(nextIndex);
        centroids.add(seed);
        // A point that became a seed must not be drawn again.
        sampler.delete(pick);
        // Lower the weight of every remaining candidate that is closer to the new seed.
        for (int candidate : sampler) {
            WeightedVector point = datapoints.get(candidate);
            double reweighted = seed.getWeight() * metric.distance(seed, point);
            if (reweighted < sampler.getWeight(candidate)) {
                sampler.set(candidate, reweighted);
            }
        }
    }
}

From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License:Apache License

/**
 * Examines the datapoints and updates cluster centers to be the centroid of the nearest datapoints points.  To
 * compute a new center for cluster c_i, we average all points that are closer than d_i * trimFraction
 * where d_i is/*from  ww w .j a  va 2s  .  co m*/
 *
 * d_i = min_j \sqrt ||c_j - c_i||^2
 *
 * By ignoring distant points, the centroids converge more quickly to a good approximation of the
 * optimal k-means solution (given good starting points).
 *
 * @param datapoints the points to cluster.
 */
private void iterativeAssignment(List<? extends WeightedVector> datapoints) {
    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
    // closestClusterDistances.get(i) is the distance from the i'th cluster to its closest
    // neighboring cluster.
    List<Double> closestClusterDistances = Lists.newArrayListWithExpectedSize(numClusters);
    // clusterAssignments[i] == j means that the i'th point is assigned to the j'th cluster. When
    // these don't change, we are done.
    // Each point is assigned to the invalid "-1" cluster initially.
    List<Integer> clusterAssignments = Lists.newArrayList(Collections.nCopies(datapoints.size(), -1));

    boolean changed = true;
    for (int i = 0; changed && i < maxNumIterations; i++) {
        changed = false;
        // We compute what the distance between each cluster and its closest neighbor is to set a
        // proportional distance threshold for points that should be involved in calculating the
        // centroid.
        closestClusterDistances.clear();

        for (Vector center : centroids) {
            // If a centroid has no points assigned to it, the clustering failed.

            Vector closestOtherCluster = centroids.searchFirst(center, true).getValue();
            closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster));
        }

        // Copies the current cluster centroids to newClusters and sets their weights to 0. This is
        // so we calculate the new centroids as we go through the datapoints.
        List<Centroid> newCentroids = Lists.newArrayList();
        for (Vector centroid : centroids) {
            // need a deep copy because we will mutate these values
            Centroid newCentroid = (Centroid) centroid.clone();
            newCentroid.setWeight(0);
            newCentroids.add(newCentroid);
        }

        // Pass over the datapoints computing new centroids.
        for (int j = 0; j < datapoints.size(); ++j) {
            WeightedVector datapoint = datapoints.get(j);
            // Get the closest cluster this point belongs to.
            WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false);
            int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex();
            double closestDistance = closestPair.getWeight();
            // Update its cluster assignment if necessary.
            if (closestIndex != clusterAssignments.get(j)) {
                changed = true;
                clusterAssignments.set(j, closestIndex);
            }
            // Only update if the datapoints point is near enough. What this means is that the weight
            // of outliers is NOT taken into account and the final weights of the centroids will
            // reflect this (it will be less or equal to the initial sum of the weights).
            if (closestDistance < trimFraction * closestClusterDistances.get(closestIndex)) {
                newCentroids.get(closestIndex).update(datapoint);
            }
        }
        // Add the new centers back into searcher.
        centroids.clear();
        centroids.addAll(newCentroids);
    }

    if (correctWeights) {
        for (Vector v : centroids) {
            ((Centroid) v).setWeight(0);
        }
        for (WeightedVector datapoint : datapoints) {
            Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue();
            closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight());
        }
    }
}

From source file:org.conan.mymahout.clustering.display.DisplaySpectralKMeans.java

License:Apache License

/**
 * Demo entry point: regenerates the sample data, writes a pairwise affinity file built from
 * Manhattan distances between all samples, runs the spectral k-means driver on it and finally
 * constructs the display object.
 *
 * @param args command-line arguments; not read by this method.
 * @throws Exception if any file-system operation or the clustering job fails.
 */
public static void main(String[] args) throws Exception {
    final DistanceMeasure metric = new ManhattanDistanceMeasure();
    final Path samplesPath = new Path(SAMPLES);
    final Path outputPath = new Path(OUTPUT);
    final Path tempPath = new Path(TEMP);
    final Configuration conf = new Configuration();

    // Start from a clean slate: remove results of earlier runs.
    HadoopUtil.delete(conf, samplesPath);
    HadoopUtil.delete(conf, outputPath);

    // Fixed seed so that the generated samples are reproducible.
    RandomUtils.useTestSeed();
    DisplayClustering.generateSamples();
    writeSampleData(samplesPath);

    final Path affinitiesPath = new Path(outputPath, AFFINITIES);
    final FileSystem fileSystem = FileSystem.get(outputPath.toUri(), conf);
    if (!fileSystem.exists(outputPath)) {
        fileSystem.mkdirs(outputPath);
    }

    // Emit one "row,column,distance" line for every ordered pair of samples.
    Writer out = null;
    try {
        out = Files.newWriter(new File(affinitiesPath.toString()), Charsets.UTF_8);
        final int sampleCount = SAMPLE_DATA.size();
        for (int row = 0; row < sampleCount; row++) {
            for (int col = 0; col < sampleCount; col++) {
                double affinity = metric.distance(SAMPLE_DATA.get(row).get(), SAMPLE_DATA.get(col).get());
                out.write(row + "," + col + ',' + affinity + '\n');
            }
        }
    } finally {
        // swallowIOException = false: a failure while closing is propagated to the caller.
        Closeables.close(out, false);
    }

    final int maxIter = 10;
    final double convergenceDelta = 0.001;
    SpectralKMeansDriver.run(new Configuration(), affinitiesPath, outputPath, SAMPLE_DATA.size(), 3, metric,
            convergenceDelta, maxIter, tempPath, false);
    new DisplaySpectralKMeans();
}