List of usage examples for org.apache.commons.math3.ml.clustering CentroidCluster getCenter
public Clusterable getCenter()
From source file:clustering.KMeans.java
public static void main(String[] args) throws UnknownHostException { if (args.length != 1) { System.out.println("Usage : KMeans <nrClusters>"); System.exit(-1);// w w w .j a v a 2 s.co m } int kClusters = Integer.parseInt(args[0]); ArrayList<Artist> artists = new ArrayList<Artist>(); DBHelper dbHelper = DBHelper.getInstance(); DBCursor result = dbHelper.findArtistsWithFBandTW(); while (result.hasNext()) { DBObject currentArtist = result.next(); artists.add(Artist.fromDBObject(currentArtist)); } //System.out.println(artists.size()); KMeansPlusPlusClusterer<Artist> clusterer = new KMeansPlusPlusClusterer<Artist>(kClusters); List<CentroidCluster<Artist>> clusters = clusterer.cluster(artists); //System.out.println(clusters.size()); dbHelper.emptyClusterCenters(); for (CentroidCluster<Artist> cluster : clusters) { double[] center = cluster.getCenter().getPoint(); ObjectId centerId = dbHelper.insertClusterCenter(center[0], center[1], center[2]); List<Artist> artC = cluster.getPoints(); for (Artist artist : artC) { dbHelper.updateMatrixRowCluster(artist.getDBObject(), centerId); //System.out.print("("+artist.fb_likes+","+artist.twitter_followers+","+artist.album_count+") "); } } }
From source file:indexer.DocClusterer.java
public String getClusterVecs() throws Exception { StringBuffer buff = new StringBuffer(); List<CentroidCluster<WordVec>> clusters = clusterWords(dvec.getWordMap(), numClusters); if (clusters == null) return ""; int i = 0;//ww w.j a v a2 s .co m for (CentroidCluster<WordVec> c : clusters) { //List<WordVec> thisClusterPoints = c.getPoints(); //WordVec clusterCenter = WordVecs.getCentroid(thisClusterPoints); Clusterable clusterCenter = c.getCenter(); WordVec clusterWordVec = new WordVec("Cluster_" + i, clusterCenter.getPoint()); //clusterCenter.setWord("Cluster_" + numClusters); buff.append(clusterWordVec.toString()).append(":"); i++; } return buff.toString(); }
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
/** * Uses an existing index, where each and every document should have a set of local features. A number of * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words * (the cluster means). For all images a histogram on the visual words is created and added to the documents. * Pre-existing histograms are deleted, so this method can be used for re-indexing. * * @throws java.io.IOException/* www . j a va2 s . co m*/ */ public void index() throws IOException { df.setMaximumFractionDigits(3); // find the documents for building the vocabulary: HashSet<Integer> docIDs = selectVocabularyDocs(); System.out.println("Using " + docIDs.size() + " documents to build the vocabulary."); KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15); // fill the KMeans object: LinkedList<DoublePoint> features = new LinkedList<DoublePoint>(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) { int nextDoc = iterator.next(); if (reader.hasDeletions() && !liveDocs.get(nextDoc)) continue; // if it is deleted, just ignore it. Document d = reader.document(nextDoc); // features.clear(); IndexableField[] fields = d.getFields(localFeatureFieldName); String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]; for (int j = 0; j < fields.length; j++) { LireFeature f = getFeatureInstance(); f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); // copy the data over to new array ... double[] feat = new double[f.getDoubleHistogram().length]; System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length); features.add(new DoublePoint(f.getDoubleHistogram())); } } if (features.size() < numClusters) { // this cannot work. You need more data points than clusters. 
throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in " + numClusters + ". Try to use less clusters or more images."); } // do the clustering: System.out.println("Number of local features: " + df.format(features.size())); System.out.println("Starting clustering ..."); List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features); // TODO: Serializing clusters to a file on the disk ... System.out.println("Clustering finished, " + clusterList.size() + " clusters found"); clusters = new LinkedList<double[]>(); for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) { CentroidCluster<DoublePoint> centroidCluster = iterator.next(); clusters.add(centroidCluster.getCenter().getPoint()); } System.out.println("Creating histograms ..."); int[] tmpHist = new int[numClusters]; IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d); // careful: copy reader to RAM for faster access when reading ... // reader = IndexReader.open(new RAMDirectory(reader.directory()), true); LireFeature f = getFeatureInstance(); for (int i = 0; i < reader.maxDoc(); i++) { try { if (reader.hasDeletions() && !liveDocs.get(i)) continue; for (int j = 0; j < tmpHist.length; j++) { tmpHist[j] = 0; } Document d = reader.document(i); IndexableField[] fields = d.getFields(localFeatureFieldName); // remove the fields if they are already there ... 
d.removeField(visualWordsFieldName); d.removeField(localFeatureHistFieldName); // find the appropriate cluster for each feature: for (int j = 0; j < fields.length; j++) { f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); tmpHist[clusterForFeature(f, clusters)]++; } // System.out.println(Arrays.toString(tmpHist)); d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(normalize(tmpHist)))); quantize(tmpHist); d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES)); // remove local features to save some space if requested: if (DELETE_LOCAL_FEATURES) { d.removeFields(localFeatureFieldName); } // now write the new one. we use the identifier to update ;) iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d); } catch (IOException e) { e.printStackTrace(); } } iw.commit(); // this one does the "old" commit(), it removes the deleted local features. iw.forceMerge(1); iw.close(); System.out.println("Finished."); }
From source file:Clustering.technique.KMeansPlusPlusClusterer.java
/**
 * Finds which of the given clusters has its center closest to a point, as measured by this
 * class's {@code distance} helper.
 *
 * @param clusters the {@link Cluster}s to search, visited in iteration order
 * @param point the point to find the nearest {@link Cluster} for
 * @return the zero-based position (in iteration order) of the nearest {@link Cluster}; the
 *         first one wins on ties, and 0 is returned when no distance is below
 *         {@code Double.MAX_VALUE}
 */
private int getNearestCluster(final Collection<CentroidCluster<T>> clusters, final T point) {
    int nearestIndex = 0;
    double smallestSoFar = Double.MAX_VALUE;
    int position = -1;
    for (final CentroidCluster<T> candidate : clusters) {
        position++;
        final double d = distance(point, candidate.getCenter());
        // Written as a negated "<" so that a NaN distance is skipped, never selected.
        if (!(d < smallestSoFar)) {
            continue;
        }
        smallestSoFar = d;
        nearestIndex = position;
    }
    return nearestIndex;
}
From source file:Clustering.technique.KMeansPlusPlusClusterer.java
/** * Get the point farthest to its cluster center * * @param clusters the {@link Cluster}s to search * @return point farthest to its cluster center * @throws ConvergenceException if clusters are all empty *//*from w w w.ja v a2 s . com*/ private T getFarthestPoint(final Collection<CentroidCluster<T>> clusters) throws ConvergenceException { double maxDistance = Double.NEGATIVE_INFINITY; Cluster<T> selectedCluster = null; int selectedPoint = -1; for (final CentroidCluster<T> cluster : clusters) { // get the farthest point final Clusterable center = cluster.getCenter(); final List<T> points = cluster.getPoints(); for (int i = 0; i < points.size(); ++i) { final double distance = distance(points.get(i), center); if (distance > maxDistance) { maxDistance = distance; selectedCluster = cluster; selectedPoint = i; } } } // did we find at least one non-empty cluster ? if (selectedCluster == null) { throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); } return selectedCluster.getPoints().remove(selectedPoint); }
From source file:Clustering.technique.KMeansPlusPlusClusterer.java
/** * Get a random point from the {@link Cluster} with the largest distance variance. * * @param clusters the {@link Cluster}s to search * @return a random point from the selected cluster * @throws ConvergenceException if clusters are all empty *///from w w w.j a va 2 s .co m private T getPointFromLargestVarianceCluster(final Collection<CentroidCluster<T>> clusters) throws ConvergenceException { double maxVariance = Double.NEGATIVE_INFINITY; Cluster<T> selected = null; for (final CentroidCluster<T> cluster : clusters) { if (!cluster.getPoints().isEmpty()) { // compute the distance variance of the current cluster final Clusterable center = cluster.getCenter(); final Variance stat = new Variance(); for (final T point : cluster.getPoints()) { stat.increment(distance(point, center)); } final double variance = stat.getResult(); // select the cluster with the largest variance if (variance > maxVariance) { maxVariance = variance; selected = cluster; } } } // did we find at least one non-empty cluster ? if (selected == null) { throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); } // extract a random point from the cluster final List<T> selectedPoints = selected.getPoints(); return selectedPoints.remove(random.nextInt(selectedPoints.size())); }
From source file:Clustering.technique.KMeansPlusPlusClusterer.java
/** * Runs the K-means++ clustering algorithm. * * @param points the points to cluster// ww w .j a va2 s . co m * @return a list of clusters containing the points * @throws MathIllegalArgumentException if the data points are null or the number * of clusters is larger than the number of data points * @throws ConvergenceException if an empty cluster is encountered and the * {@link #emptyStrategy} is set to {@code ERROR} */ public List<CentroidCluster<T>> cluster(final Collection<T> points) throws MathIllegalArgumentException, ConvergenceException { // sanity checks MathUtils.checkNotNull(points); // number of clusters has to be smaller or equal the number of data points if (points.size() < k) { throw new NumberIsTooSmallException(points.size(), k, false); } // create the initial clusters List<CentroidCluster<T>> clusters = chooseInitialCenters(points); // create an array containing the latest assignment of a point to a cluster // no need to initialize the array, as it will be filled with the first assignment int[] assignments = new int[points.size()]; assignPointsToClusters(clusters, points, assignments); // iterate through updating the centers until we're done final int max = (maxIterations < 0) ? 
Integer.MAX_VALUE : maxIterations; for (int count = 0; count < max; count++) { boolean emptyCluster = false; List<CentroidCluster<T>> newClusters = new ArrayList<CentroidCluster<T>>(); for (final CentroidCluster<T> cluster : clusters) { final Clusterable newCenter; if (cluster.getPoints().isEmpty()) { switch (emptyStrategy) { case LARGEST_VARIANCE: newCenter = getPointFromLargestVarianceCluster(clusters); break; case LARGEST_POINTS_NUMBER: newCenter = getPointFromLargestNumberCluster(clusters); break; case FARTHEST_POINT: newCenter = getFarthestPoint(clusters); break; default: throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); } emptyCluster = true; } else { newCenter = centroidOf(cluster.getCenter(), cluster.getPoints(), cluster.getCenter().getPoint().length); } newClusters.add(new CentroidCluster<T>(newCenter)); } int changes = assignPointsToClusters(newClusters, points, assignments); clusters = newClusters; // if there were no more changes in the point-to-cluster assignment // and there are no empty clusters left, return the current clusters if (changes == 0 && !emptyCluster) { return clusters; } } return clusters; }
From source file:KMeansRecommender.MyKMeansPlusPlusClusterer.java
/** * Runs the K-means++ clustering algorithm. * * @param points the points to cluster/* w ww. ja va2 s . c o m*/ * @return a list of clusters containing the points * @throws MathIllegalArgumentException if the data points are null or the number * of clusters is larger than the number of data points * @throws ConvergenceException if an empty cluster is encountered and the * {@link #emptyStrategy} is set to {@code ERROR} */ public List<CentroidCluster<T>> cluster(final Collection<T> points) throws MathIllegalArgumentException, ConvergenceException { // sanity checks MathUtils.checkNotNull(points); // number of clusters has to be smaller or equal the number of data points if (points.size() < k) { throw new NumberIsTooSmallException(points.size(), k, false); } // create the initial clusters List<CentroidCluster<T>> clusters = chooseInitialCenters(points); // create an array containing the latest assignment of a point to a cluster // no need to initialize the array, as it will be filled with the first assignment int[] assignments = new int[points.size()]; assignPointsToClusters(clusters, points, assignments); // iterate through updating the centers until we're done int finalchange = 0; final int max = (maxIterations < 0) ? 
Integer.MAX_VALUE : maxIterations; for (int count = 0; count < max; count++) { boolean emptyCluster = false; List<CentroidCluster<T>> newClusters = new ArrayList<CentroidCluster<T>>(); for (final CentroidCluster<T> cluster : clusters) { final Clusterable newCenter; if (cluster.getPoints().isEmpty()) { switch (emptyStrategy) { case LARGEST_VARIANCE: newCenter = getPointFromLargestVarianceCluster(clusters); break; case LARGEST_POINTS_NUMBER: newCenter = getPointFromLargestNumberCluster(clusters); break; case FARTHEST_POINT: newCenter = getFarthestPoint(clusters); break; default: throw new ConvergenceException(LocalizedFormats.EMPTY_CLUSTER_IN_K_MEANS); } emptyCluster = true; } else { newCenter = centroidOf(cluster.getPoints(), cluster.getCenter().getPoint().length); } newClusters.add(new CentroidCluster<T>(newCenter)); } int changes = assignPointsToClusters(newClusters, points, assignments); clusters = newClusters; finalchange = changes; //for test // if there were no more changes in the point-to-cluster assignment // and there are no empty clusters left, return the current clusters if (changes == 0 && !emptyCluster) { //System.out.println("iteration time: " + count + ", changes : 0"); //for test return clusters; } } //System.out.println("iteration time: " + max + ", changes : " + finalchange); //for test return clusters; }
From source file:ec.coevolve.MultiPopCoevolutionaryEvaluatorExtra.java
protected Individual[] behaviourElite(EvolutionState state, int subpop) { // Generate the dataset ArrayList<IndividualClusterable> points = new ArrayList<IndividualClusterable>(); if (novelChampionsOrigin == NovelChampionsOrigin.halloffame) { for (int i = 0; i < hallOfFame[subpop].size(); i++) { points.add(new IndividualClusterable(hallOfFame[subpop].get(i), i)); }// w w w. j av a2 s . co m } else if (novelChampionsOrigin == NovelChampionsOrigin.archive) { for (ArchiveEntry ae : archives[subpop]) { points.add(new IndividualClusterable(ae.getIndividual(), ae.getGeneration())); } } // Cap -- only use the individuals with the highest fitness scores if (novelChampionsCap > 0) { // calculate the percentile DescriptiveStatistics ds = new DescriptiveStatistics(); for (IndividualClusterable ic : points) { ds.addValue(ic.getFitness()); } double percentile = ds.getPercentile(novelChampionsCap); // remove those below the percentile Iterator<IndividualClusterable> iter = points.iterator(); while (iter.hasNext()) { IndividualClusterable next = iter.next(); if (next.getFitness() < percentile) { iter.remove(); } } } // Check if there are enough points for clustering if (points.size() <= novelChampions) { Individual[] elite = new Individual[points.size()]; for (int i = 0; i < elite.length; i++) { elite[i] = points.get(i).getIndividual(); } return elite; } // Do the k-means clustering KMeansPlusPlusClusterer<IndividualClusterable> clusterer = new KMeansPlusPlusClusterer<IndividualClusterable>( novelChampions, 100); List<CentroidCluster<IndividualClusterable>> clusters = clusterer.cluster(points); // Return one from each cluster Individual[] elite = new Individual[novelChampions]; for (int i = 0; i < clusters.size(); i++) { CentroidCluster<IndividualClusterable> cluster = clusters.get(i); List<IndividualClusterable> clusterPoints = cluster.getPoints(); if (novelChampionsMode == NovelChampionsMode.random) { int randIndex = state.random[0].nextInt(clusterPoints.size()); elite[i] = 
clusterPoints.get(randIndex).getIndividual(); } else if (novelChampionsMode == NovelChampionsMode.last) { IndividualClusterable oldest = null; for (IndividualClusterable ic : clusterPoints) { if (oldest == null || ic.age > oldest.age) { oldest = ic; } } elite[i] = oldest.getIndividual(); } else if (novelChampionsMode == NovelChampionsMode.centroid) { DistanceMeasure dm = clusterer.getDistanceMeasure(); double[] centroid = cluster.getCenter().getPoint(); IndividualClusterable closest = null; double closestDist = Double.MAX_VALUE; for (IndividualClusterable ic : clusterPoints) { double dist = dm.compute(centroid, ic.getPoint()); if (dist < closestDist) { closestDist = dist; closest = ic; } } elite[i] = closest.getIndividual(); } else if (novelChampionsMode == NovelChampionsMode.best) { IndividualClusterable best = null; float highestFit = Float.NEGATIVE_INFINITY; for (IndividualClusterable ic : clusterPoints) { if (ic.getFitness() > highestFit) { best = ic; highestFit = ic.getFitness(); } } elite[i] = best.getIndividual(); } } return elite; }
From source file:org.apache.solr.client.solrj.io.eval.GetCentroidsEvaluator.java
/**
 * Extracts the cluster centroids from a clustering result as a matrix.
 *
 * <p>Row {@code i} of the returned matrix is the center point of the {@code i}-th cluster of
 * the tuple; the column labels are copied from the tuple.
 *
 * @param value the clustering result; must be a {@code KmeansEvaluator.ClusterTuple}
 * @return a {@code Matrix} holding one centroid per row
 * @throws IOException if {@code value} is {@code null} or not a clustering result
 */
@Override
public Object doWork(Object value) throws IOException {
    if (!(value instanceof KmeansEvaluator.ClusterTuple)) {
        // null also fails instanceof, so it must not be dereferenced for the message.
        String foundType = (value == null) ? "null" : value.getClass().getSimpleName();
        throw new IOException(String.format(Locale.ROOT,
                "Invalid expression %s - found type %s for value, expecting a clustering result",
                toExpression(constructingFactory), foundType));
    }

    KmeansEvaluator.ClusterTuple clusterTuple = (KmeansEvaluator.ClusterTuple) value;
    List<CentroidCluster<KmeansEvaluator.ClusterPoint>> clusters = clusterTuple.getClusters();
    double[][] data = new double[clusters.size()][];
    for (int i = 0; i < clusters.size(); i++) {
        CentroidCluster<KmeansEvaluator.ClusterPoint> centroidCluster = clusters.get(i);
        Clusterable clusterable = centroidCluster.getCenter();
        data[i] = clusterable.getPoint();
    }
    Matrix centroids = new Matrix(data);
    centroids.setColumnLabels(clusterTuple.getColumnLabels());
    return centroids;
}