Example usage for org.apache.mahout.math.random Multinomial Multinomial

List of usage examples for org.apache.mahout.math.random Multinomial Multinomial

Introduction

On this page you can find example usages of the no-argument constructor Multinomial() of the class org.apache.mahout.math.random.Multinomial.

Prototype

public Multinomial() 

Source Link

Usage

From source file:com.mapr.synth.samplers.ForeignKeySampler.java

License:Apache License

/**
 * Rebuilds {@code base} as a fresh distribution over the indices {@code 0 .. size - 1}, where
 * index {@code i} is added with weight {@code (i + 1)^(-skew)}.
 */
private void setup() {
    base = new Multinomial<>();
    int index = 0;
    while (index < size) {
        // the 1.0 forces floating point arithmetic and keeps index 0 away from a zero base
        double weight = Math.pow(index + 1.0, -skew);
        base.add(index, weight);
        index++;
    }
}

From source file:com.mapr.synth.samplers.NameSampler.java

License:Apache License

/**
 * Creates a name sampler and, if the {@code first} distribution has not been installed yet,
 * fills the {@code first} and {@code last} name distributions from the built-in resource files.
 *
 * Lines starting with {@code #} are skipped.  Every other line is split on whitespace and is
 * read as a name followed by a weight.
 *
 * @throws RuntimeException if one of the resource files cannot be read.
 */
public NameSampler() {
    try {
        // Only the caller that installs the first-name table goes on to load the data.
        if (!first.compareAndSet(null, new Multinomial<String>())) {
            return;
        }
        Preconditions.checkState(last.getAndSet(new Multinomial<String>()) == null);

        Splitter columns = Splitter.on(CharMatcher.WHITESPACE).omitEmptyStrings().trimResults();

        for (String resourceName : ImmutableList.of("dist.male.first", "dist.female.first")) {
            for (String line : Resources.readLines(Resources.getResource(resourceName), Charsets.UTF_8)) {
                if (line.startsWith("#")) {
                    continue;
                }
                Iterator<String> fields = columns.split(line).iterator();
                String name = initialCap(fields.next());
                double weight = Double.parseDouble(fields.next());
                double previous = first.get().getWeight(name);
                if (previous == 0) {
                    first.get().add(name, weight);
                } else {
                    // some first names appear more than once, so accumulate instead of adding again
                    first.get().set(name, previous + weight);
                }
            }
        }

        for (String line : Resources.readLines(Resources.getResource("dist.all.last"), Charsets.UTF_8)) {
            if (line.startsWith("#")) {
                continue;
            }
            Iterator<String> fields = columns.split(line).iterator();
            String name = initialCap(fields.next());
            double weight = Double.parseDouble(fields.next());
            last.get().add(name, weight);
        }
    } catch (IOException e) {
        throw new RuntimeException("Couldn't read built-in resource file", e);
    }
}

From source file:com.mapr.synth.samplers.StringSampler.java

License:Apache License

/**
 * Loads the distribution from a tab-separated resource, unless a distribution is already set.
 *
 * Lines starting with {@code #} are skipped.  On every other line the first field is passed
 * through {@code translate} and used as the value, and the second field is parsed as its weight.
 *
 * @param resourceName name of the resource to read.
 * @throws RuntimeException if the resource cannot be read.
 */
protected void readDistribution(String resourceName) {
    try {
        // An already-installed distribution is left untouched.
        if (!distribution.compareAndSet(null, new Multinomial<String>())) {
            return;
        }
        Splitter fieldSplitter = Splitter.on("\t").trimResults();
        for (String line : Resources.readLines(Resources.getResource(resourceName), Charsets.UTF_8)) {
            if (line.startsWith("#")) {
                continue;
            }
            Iterator<String> fields = fieldSplitter.split(line).iterator();
            String value = translate(fields.next());
            double weight = Double.parseDouble(fields.next());
            distribution.get().add(value, weight);
        }
    } catch (IOException e) {
        throw new RuntimeException("Couldn't read built-in resource file", e);
    }
}

From source file:com.mapr.synth.samplers.StringSampler.java

License:Apache License

/**
 * Adds every entry of the given map to the distribution, creating the distribution first if
 * none has been set.
 *
 * @param dist maps each value to its weight; a weight may be any object whose
 *             {@code toString()} parses as a double.  Must not be empty.
 */
public void setDist(Map<String, ?> dist) {
    Preconditions.checkArgument(!dist.isEmpty());
    distribution.compareAndSet(null, new Multinomial<String>());
    for (Map.Entry<String, ?> entry : dist.entrySet()) {
        double weight = Double.parseDouble(entry.getValue().toString());
        distribution.get().add(entry.getKey(), weight);
    }
}

From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License:Apache License

/**
 * Selects some of the original points randomly with probability proportional to their weights. This is much
 * less sophisticated than the kmeans++ approach, but it is faster.
 *
 * Points are drawn without replacement: each sampled index is removed from the selector before the next draw.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 * @throws IllegalArgumentException if there are fewer datapoints than clusters to seed.
 */
private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) {
    int numDatapoints = datapoints.size();
    // Every seed consumes one datapoint, so reject the request up front instead of failing part way
    // through with some centroids already added.
    Preconditions.checkArgument(numDatapoints >= numClusters,
            String.format("Must have at least as many datapoints [%d] as clusters [%d]", numDatapoints,
                    numClusters));

    double totalWeight = 0;
    for (WeightedVector datapoint : datapoints) {
        totalWeight += datapoint.getWeight();
    }

    // Selector over the indices of datapoints, each weighted by its share of the total weight.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < numDatapoints; ++i) {
        seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight);
    }

    for (int i = 0; i < numClusters; ++i) {
        int sample = seedSelector.sample();
        // Don't select this one again.
        seedSelector.delete(sample);
        Centroid centroid = new Centroid(datapoints.get(sample));
        centroid.setIndex(i);
        centroids.add(centroid);
    }
}

From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License:Apache License

/**
 * Selects some of the original points according to the k-means++ algorithm.  The basic idea is that
 * points are selected with probability proportional to their distance from any selected point.  In
 * this version, points have weights which multiply their likelihood of being selected.  This is the
 * same as if there were as many copies of the same point as indicated by the weight.
 *
 * This is pretty expensive, but it vastly improves the quality and convergences of the k-means algorithm.
 * The basic idea can be made much faster by only processing a random subset of the original points.
 * In the context of streaming k-means, the total number of possible seeds will be about k log n so this
 * selection will cost O(k^2 (log n)^2) which isn't much worse than the random sampling idea.  At
 * n = 10^9, the cost of this initialization will be about 10x worse than a reasonable random sampling
 * implementation.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 * @throws IllegalArgumentException if there are fewer than two datapoints, or fewer datapoints than clusters.
 */
private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) {
    Preconditions.checkArgument(datapoints.size() > 1,
            "Must have at least two datapoints points to cluster " + "sensibly");
    // NOTE(review): the message says "more ... than" but the condition also accepts an equal count.
    Preconditions.checkArgument(datapoints.size() >= numClusters,
            String.format("Must have more datapoints [%d] than clusters [%d]", datapoints.size(), numClusters));
    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
    Centroid center = new Centroid(datapoints.iterator().next());
    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
        center.update(row);
    }

    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints
    // this accelerates seed selection.
    // NOTE(review): the distances are summed as returned by distanceMeasure; whether they are already
    // squared depends on the configured measure -- confirm.
    double deltaX = 0;
    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
    for (WeightedVector row : datapoints) {
        deltaX += distanceMeasure.distance(row, center);
    }

    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // This can be simplified to:
    //
    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
    //
    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.

    // Multinomial distribution of vector indices for the selection seeds. These correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
        double selectionProbability = deltaX
                + datapoints.size() * distanceMeasure.distance(datapoints.get(i), center);
        seedSelector.add(i, selectionProbability);
    }

    // NOTE(review): c_1 is drawn uniformly from random here, not from seedSelector, and every weight
    // added above is replaced by the set() loop below before seedSelector is first sampled.  The
    // p(c_1) weights therefore appear to be unused -- confirm whether seedSelector.sample() was intended.
    int selected = random.nextInt(datapoints.size());
    Centroid c_1 = new Centroid(datapoints.get(selected).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection.  Initial weights are
    // set to the squared distance from c_1
    // NOTE(review): the value actually stored is distance * 2 * log(1 + weight of the row), which is
    // not the plain squared distance described above -- confirm which is intended.
    for (int i = 0; i < datapoints.size(); ++i) {
        WeightedVector row = datapoints.get(i);
        double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight());
        seedSelector.set(i, w);
    }

    // From here, seeds are selected with probability proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // when we only have c_1, we have already set these distances and as we select each new
    // seed, we update the minimum distances.
    centroids.add(c_1);
    // c_1 took index 0, so the remaining seeds are numbered from 1.
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
        // Select according to weights.
        int seedIndex = seedSelector.sample();
        Centroid nextSeed = new Centroid(datapoints.get(seedIndex));
        nextSeed.setIndex(clusterIndex++);
        centroids.add(nextSeed);
        // Don't select this one again.
        seedSelector.delete(seedIndex);
        // Re-weight everything according to the minimum distance to a seed.
        // Weights only ever decrease here: a candidate is updated only when the new seed gives a smaller value.
        for (int currSeedIndex : seedSelector) {
            WeightedVector curr = datapoints.get(currSeedIndex);
            // NOTE(review): the distance is scaled by the weight of the new seed, not of the candidate point -- confirm.
            double newWeight = nextSeed.getWeight() * distanceMeasure.distance(nextSeed, curr);
            if (newWeight < seedSelector.getWeight(currSeedIndex)) {
                seedSelector.set(currSeedIndex, newWeight);
            }
        }
    }
}