Usage examples for org.apache.mahout.math.random.Multinomial — constructor: public Multinomial()
From source file:com.mapr.synth.samplers.ForeignKeySampler.java
License:Apache License
private void setup() { base = new Multinomial<>(); for (int i = 0; i < size; i++) { base.add(i, Math.pow(i + 1.0, -skew)); }/*from w w w. j a va 2 s.com*/ }
From source file:com.mapr.synth.samplers.NameSampler.java
License:Apache License
public NameSampler() { try {//from w ww. j av a 2 s .c om if (first.compareAndSet(null, new Multinomial<String>())) { Preconditions.checkState(last.getAndSet(new Multinomial<String>()) == null); Splitter onTab = Splitter.on(CharMatcher.WHITESPACE).omitEmptyStrings().trimResults(); for (String resourceName : ImmutableList.of("dist.male.first", "dist.female.first")) { for (String line : Resources.readLines(Resources.getResource(resourceName), Charsets.UTF_8)) { if (!line.startsWith("#")) { Iterator<String> parts = onTab.split(line).iterator(); String name = initialCap(parts.next()); double weight = Double.parseDouble(parts.next()); if (first.get().getWeight(name) == 0) { first.get().add(name, weight); } else { // do this instead of add because some first names may appear more than once first.get().set(name, first.get().getWeight(name) + weight); } } } } for (String line : Resources.readLines(Resources.getResource("dist.all.last"), Charsets.UTF_8)) { if (!line.startsWith("#")) { Iterator<String> parts = onTab.split(line).iterator(); String name = initialCap(parts.next()); double weight = Double.parseDouble(parts.next()); last.get().add(name, weight); } } } } catch (IOException e) { throw new RuntimeException("Couldn't read built-in resource file", e); } }
From source file:com.mapr.synth.samplers.StringSampler.java
License:Apache License
protected void readDistribution(String resourceName) { try {/* w w w .j av a2 s.com*/ if (distribution.compareAndSet(null, new Multinomial<String>())) { Splitter onTab = Splitter.on("\t").trimResults(); for (String line : Resources.readLines(Resources.getResource(resourceName), Charsets.UTF_8)) { if (!line.startsWith("#")) { Iterator<String> parts = onTab.split(line).iterator(); String name = translate(parts.next()); double weight = Double.parseDouble(parts.next()); distribution.get().add(name, weight); } } } } catch (IOException e) { throw new RuntimeException("Couldn't read built-in resource file", e); } }
From source file:com.mapr.synth.samplers.StringSampler.java
License:Apache License
public void setDist(Map<String, ?> dist) { Preconditions.checkArgument(dist.size() > 0); distribution.compareAndSet(null, new Multinomial<String>()); for (String key : dist.keySet()) { distribution.get().add(key, Double.parseDouble(dist.get(key).toString())); }/*from w ww. j a va 2 s .co m*/ }
From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java
License:Apache License
/** * Selects some of the original points randomly with probability proportional to their weights. This is much * less sophisticated than the kmeans++ approach, however it is faster and coupled with * * The side effect of this method is to fill the centroids structure itself. * * @param datapoints The datapoints to select from. These datapoints should be WeightedVectors of some kind. *//*from www .j a v a 2 s. c om*/ private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) { int numDatapoints = datapoints.size(); double totalWeight = 0; for (WeightedVector datapoint : datapoints) { totalWeight += datapoint.getWeight(); } Multinomial<Integer> seedSelector = new Multinomial<Integer>(); for (int i = 0; i < numDatapoints; ++i) { seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight); } for (int i = 0; i < numClusters; ++i) { int sample = seedSelector.sample(); seedSelector.delete(sample); Centroid centroid = new Centroid(datapoints.get(sample)); centroid.setIndex(i); centroids.add(centroid); } }
From source file:eu.stratosphere.library.clustering.DistributedOnePassKMeans.BallKMeans.java
License:Apache License
/** * Selects some of the original points according to the k-means++ algorithm. The basic idea is that * points are selected with probability proportional to their distance from any selected point. In * this version, points have weights which multiply their likelihood of being selected. This is the * same as if there were as many copies of the same point as indicated by the weight. * * This is pretty expensive, but it vastly improves the quality and convergences of the k-means algorithm. * The basic idea can be made much faster by only processing a random subset of the original points. * In the context of streaming k-means, the total number of possible seeds will be about k log n so this * selection will cost O(k^2 (log n)^2) which isn't much worse than the random sampling idea. At * n = 10^9, the cost of this initialization will be about 10x worse than a reasonable random sampling * implementation.// www . ja va 2s.c om * * The side effect of this method is to fill the centroids structure itself. * * @param datapoints The datapoints to select from. These datapoints should be WeightedVectors of some kind. */ private void initializeSeedsKMeansPlusPlus(List<? extends WeightedVector> datapoints) { Preconditions.checkArgument(datapoints.size() > 1, "Must have at least two datapoints points to cluster " + "sensibly"); Preconditions.checkArgument(datapoints.size() >= numClusters, String.format("Must have more datapoints [%d] than clusters [%d]", datapoints.size(), numClusters)); // Compute the centroid of all of the datapoints. This is then used to compute the squared radius of the datapoints. Centroid center = new Centroid(datapoints.iterator().next()); for (WeightedVector row : Iterables.skip(datapoints, 1)) { center.update(row); } // Given the centroid, we can compute \Delta_1^2(X), the total squared distance for the datapoints // this accelerates seed selection. 
double deltaX = 0; DistanceMeasure distanceMeasure = centroids.getDistanceMeasure(); for (WeightedVector row : datapoints) { deltaX += distanceMeasure.distance(row, center); } // Find the first seed c_1 (and conceptually the second, c_2) as might be done in the 2-means clustering so that // the probability of selecting c_1 and c_2 is proportional to || c_1 - c_2 ||^2. This is done // by first selecting c_1 with probability: // // p(c_1) = sum_{c_1} || c_1 - c_2 ||^2 \over sum_{c_1, c_2} || c_1 - c_2 ||^2 // // This can be simplified to: // // p(c_1) = \Delta_1^2(X) + n || c_1 - c ||^2 / (2 n \Delta_1^2(X)) // // where c = \sum x / n and \Delta_1^2(X) = sum || x - c ||^2 // // All subsequent seeds c_i (including c_2) can then be selected from the remaining points with probability // proportional to Pr(c_i == x_j) = min_{m < i} || c_m - x_j ||^2. // Multinomial distribution of vector indices for the selection seeds. These correspond to // the indices of the vectors in the original datapoints list. Multinomial<Integer> seedSelector = new Multinomial<Integer>(); for (int i = 0; i < datapoints.size(); ++i) { double selectionProbability = deltaX + datapoints.size() * distanceMeasure.distance(datapoints.get(i), center); seedSelector.add(i, selectionProbability); } int selected = random.nextInt(datapoints.size()); Centroid c_1 = new Centroid(datapoints.get(selected).clone()); c_1.setIndex(0); // Construct a set of weighted things which can be used for random selection. Initial weights are // set to the squared distance from c_1 for (int i = 0; i < datapoints.size(); ++i) { WeightedVector row = datapoints.get(i); double w = distanceMeasure.distance(c_1, row) * 2 * Math.log(1 + row.getWeight()); seedSelector.set(i, w); } // From here, seeds are selected with probability proportional to: // // r_i = min_{c_j} || x_i - c_j ||^2 // // when we only have c_1, we have already set these distances and as we select each new // seed, we update the minimum distances. 
centroids.add(c_1); int clusterIndex = 1; while (centroids.size() < numClusters) { // Select according to weights. int seedIndex = seedSelector.sample(); Centroid nextSeed = new Centroid(datapoints.get(seedIndex)); nextSeed.setIndex(clusterIndex++); centroids.add(nextSeed); // Don't select this one again. seedSelector.delete(seedIndex); // Re-weight everything according to the minimum distance to a seed. for (int currSeedIndex : seedSelector) { WeightedVector curr = datapoints.get(currSeedIndex); double newWeight = nextSeed.getWeight() * distanceMeasure.distance(nextSeed, curr); if (newWeight < seedSelector.getWeight(currSeedIndex)) { seedSelector.set(currSeedIndex, newWeight); } } } }