Example usage of the org.apache.mahout.math.random.Multinomial constructor Multinomial()

Introduction

On this page you can find example usages of the no-argument constructor Multinomial() of the class org.apache.mahout.math.random.Multinomial.

Prototype

public Multinomial()

Source Link

Usage

From source file:com.mapr.synth.samplers.ForeignKeySampler.java

License:Apache License

/**
 * Assigns a new, empty multinomial to {@code base} and fills it with the integers
 * {@code 0 .. size - 1}, where index {@code i} receives the weight {@code (i + 1)^(-skew)}.
 */
private void setup() {
    base = new Multinomial<>();
    int index = 0;
    while (index < size) {
        // The +1.0 keeps the base of the power strictly positive for index 0.
        double weight = Math.pow(index + 1.0, -skew);
        base.add(index, weight);
        index++;
    }
}

From source file:com.mapr.synth.samplers.NameSampler.java

License:Apache License

/**
 * Creates a name sampler. If {@code first} is still unset, this call installs new multinomials in
 * {@code first} and {@code last} and fills them from the built-in resource files; otherwise it
 * does nothing.
 *
 * First names come from {@code dist.male.first} and {@code dist.female.first}, last names from
 * {@code dist.all.last}. Lines starting with {@code #} are skipped. Every other line is split on
 * whitespace: the first token (passed through {@code initialCap}) is the name, the second is its weight.
 *
 * @throws IllegalStateException if {@code last} was already set while {@code first} was not
 * @throws RuntimeException if a resource file cannot be read
 */
public NameSampler() {
    try {
        // Only the caller that wins this swap loads the tables.
        if (!first.compareAndSet(null, new Multinomial<String>())) {
            return;
        }
        Preconditions.checkState(last.getAndSet(new Multinomial<String>()) == null);

        Splitter fieldSplitter = Splitter.on(CharMatcher.WHITESPACE).omitEmptyStrings().trimResults();

        for (String resource : ImmutableList.of("dist.male.first", "dist.female.first")) {
            for (String row : Resources.readLines(Resources.getResource(resource), Charsets.UTF_8)) {
                if (row.startsWith("#")) {
                    continue;
                }
                Iterator<String> fields = fieldSplitter.split(row).iterator();
                String firstName = initialCap(fields.next());
                double weight = Double.parseDouble(fields.next());
                double existing = first.get().getWeight(firstName);
                if (existing != 0) {
                    // A first name can occur more than once across the files, so accumulate its weight.
                    first.get().set(firstName, existing + weight);
                } else {
                    first.get().add(firstName, weight);
                }
            }
        }

        for (String row : Resources.readLines(Resources.getResource("dist.all.last"), Charsets.UTF_8)) {
            if (row.startsWith("#")) {
                continue;
            }
            Iterator<String> fields = fieldSplitter.split(row).iterator();
            String lastName = initialCap(fields.next());
            double weight = Double.parseDouble(fields.next());
            last.get().add(lastName, weight);
        }
    } catch (IOException e) {
        throw new RuntimeException("Couldn't read built-in resource file", e);
    }
}

From source file:com.mapr.synth.samplers.StringSampler.java

License:Apache License

/**
 * Loads the distribution from a resource file, but only if {@code distribution} is still unset.
 *
 * Lines starting with {@code #} are skipped. Every other line is split on tab characters: the
 * first field (passed through {@code translate}) is the value, the second is its weight.
 *
 * @param resourceName name of the resource to read, resolved via {@code Resources.getResource}
 * @throws RuntimeException if the resource cannot be read
 */
protected void readDistribution(String resourceName) {
    try {
        // If a distribution is already installed there is nothing to load.
        if (!distribution.compareAndSet(null, new Multinomial<String>())) {
            return;
        }
        Splitter tabSplitter = Splitter.on("\t").trimResults();
        for (String row : Resources.readLines(Resources.getResource(resourceName), Charsets.UTF_8)) {
            if (row.startsWith("#")) {
                continue;
            }
            Iterator<String> fields = tabSplitter.split(row).iterator();
            String value = translate(fields.next());
            double weight = Double.parseDouble(fields.next());
            distribution.get().add(value, weight);
        }
    } catch (IOException e) {
        throw new RuntimeException("Couldn't read built-in resource file", e);
    }
}

From source file:com.mapr.synth.samplers.StringSampler.java

License:Apache License

/**
 * Adds every entry of {@code dist} to the distribution, installing a new multinomial first if
 * {@code distribution} is still unset.
 *
 * @param dist non-empty map from value to weight; each weight is converted with
 *             {@code toString()} followed by {@link Double#parseDouble(String)}
 * @throws IllegalArgumentException if {@code dist} is empty
 * @throws NumberFormatException if a weight's string form is not a parsable double
 */
public void setDist(Map<String, ?> dist) {
    Preconditions.checkArgument(!dist.isEmpty());
    distribution.compareAndSet(null, new Multinomial<String>());
    // Iterate entries directly instead of looking each key up again with dist.get(key).
    for (Map.Entry<String, ?> entry : dist.entrySet()) {
        distribution.get().add(entry.getKey(), Double.parseDouble(entry.getValue().toString()));
    }
}

From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License:Apache License

/**
 * Picks {@code numClusters} of the original points at random, without replacement, with
 * probability proportional to their weights. This is much less sophisticated than the
 * k-means++ approach, but it is faster.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 */
private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) {
    // Sum of all weights; each point's weight is divided by it below.
    double weightSum = 0;
    for (WeightedVector point : datapoints) {
        weightSum += point.getWeight();
    }

    // Maps a position in datapoints to that point's share of the total weight.
    Multinomial<Integer> picker = new Multinomial<Integer>();
    int position = 0;
    for (WeightedVector point : datapoints) {
        picker.add(position, point.getWeight() / weightSum);
        position++;
    }

    int clusterIndex = 0;
    while (clusterIndex < numClusters) {
        int chosen = picker.sample();
        // Remove the chosen position so the same point is not picked twice.
        picker.delete(chosen);
        Centroid seed = new Centroid(datapoints.get(chosen));
        seed.setIndex(clusterIndex);
        centroids.add(seed);
        clusterIndex++;
    }
}

From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java

License:Apache License

/**
 * Selects some of the original points according to the k-means++ algorithm.  The basic idea is that
 * points are selected with probability proportional to their distance from any selected point.  In
 * this version, points have weights which multiply their likelihood of being selected.  This is the
 * same as if there were as many copies of the same point as indicated by the weight.
 *
 * This is pretty expensive, but it vastly improves the quality and convergences of the k-means algorithm.
 * The basic idea can be made much faster by only processing a random subset of the original points.
 * In the context of streaming k-means, the total number of possible seeds will be about k log n so this
 * selection will cost O(k^2 (log n)^2) which isn't much worse than the random sampling idea.  At
 * n = 10^9, the cost of this initialization will be about 10x worse than a reasonable random sampling
 * implementation.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from.  These datapoints should be WeightedVectors of some kind.
 * @throws IllegalArgumentException if there are fewer than two datapoints, or fewer datapoints than
 *                                  {@code numClusters}.
 */
private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) {
    Preconditions.checkArgument(datapoints.size() > 1,
            "Must have at least two datapoints to cluster sensibly");
    // The template form lets Guava build the message only when the check actually fails.
    Preconditions.checkArgument(datapoints.size() >= numClusters,
            "Must have at least as many datapoints [%s] as clusters [%s]", datapoints.size(), numClusters);
    // Compute the centroid of all of the datapoints.  This is then used to compute the squared radius of the datapoints.
    Centroid center = new Centroid(datapoints.iterator().next());
    for (WeightedVector row : Iterables.skip(datapoints, 1)) {
        center.update(row);
    }

    // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints
    // this accelerates seed selection.
    double deltaX = 0;
    DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
    for (WeightedVector row : datapoints) {
        deltaX += distanceMeasure.distance(row, center);
    }

    // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that
    // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2.  This is done
    // by first selecting c_1 with probability:
    //
    // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2
    //
    // This can be simplified to:
    //
    // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X))
    //
    // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2
    //
    // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability
    // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2.

    // Multinomial distribution of vector indices for the selection seeds. These correspond to
    // the indices of the vectors in the original datapoints list.
    Multinomial<Integer> seedSelector = new Multinomial<Integer>();
    for (int i = 0; i < datapoints.size(); ++i) {
        double selectionProbability = deltaX
                + datapoints.size() * distanceMeasure.distance(datapoints.get(i), center);
        seedSelector.add(i, selectionProbability);
    }

    // NOTE(review): c_1 is drawn uniformly from `random`, not from seedSelector, and every weight added
    // above is overwritten by the set() calls below before the first sample(). Confirm whether c_1 was
    // meant to be chosen with seedSelector.sample().
    int selected = random.nextInt(datapoints.size());
    Centroid c_1 = new Centroid(datapoints.get(selected).clone());
    c_1.setIndex(0);
    // Construct a set of weighted things which can be used for random selection.  Initial weights are
    // set to the distance from c_1 (as reported by distanceMeasure) times 2 * ln(1 + weight of the point).
    for (int i = 0; i < datapoints.size(); ++i) {
        WeightedVector row = datapoints.get(i);
        double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight());
        seedSelector.set(i, w);
    }

    // From here, seeds are selected with probability proportional to:
    //
    // r_i = min_{c_j} || x_i - c_j ||^2
    //
    // when we only have c_1, we have already set these distances and as we select each new
    // seed, we update the minimum distances.
    centroids.add(c_1);
    int clusterIndex = 1;
    while (centroids.size() < numClusters) {
        // Select according to weights.
        int seedIndex = seedSelector.sample();
        Centroid nextSeed = new Centroid(datapoints.get(seedIndex));
        nextSeed.setIndex(clusterIndex++);
        centroids.add(nextSeed);
        // Don't select this one again.
        seedSelector.delete(seedIndex);
        // Re-weight everything according to the minimum distance to a seed.
        // NOTE(review): the candidate weight is scaled by the new seed's weight, whereas the initial
        // weights above were scaled by each point's own weight -- confirm this asymmetry is intended.
        for (int currSeedIndex : seedSelector) {
            WeightedVector curr = datapoints.get(currSeedIndex);
            double newWeight = nextSeed.getWeight() * distanceMeasure.distance(nextSeed, curr);
            if (newWeight < seedSelector.getWeight(currSeedIndex)) {
                seedSelector.set(currSeedIndex, newWeight);
            }
        }
    }
}