List of usage examples for org.apache.commons.math3.ml.clustering KMeansPlusPlusClusterer cluster
@Override public List<CentroidCluster<T>> cluster(final Collection<T> points) throws MathIllegalArgumentException, ConvergenceException
From source file:clustering.KMeans.java
public static void main(String[] args) throws UnknownHostException { if (args.length != 1) { System.out.println("Usage : KMeans <nrClusters>"); System.exit(-1);//from w ww.ja v a 2 s .c om } int kClusters = Integer.parseInt(args[0]); ArrayList<Artist> artists = new ArrayList<Artist>(); DBHelper dbHelper = DBHelper.getInstance(); DBCursor result = dbHelper.findArtistsWithFBandTW(); while (result.hasNext()) { DBObject currentArtist = result.next(); artists.add(Artist.fromDBObject(currentArtist)); } //System.out.println(artists.size()); KMeansPlusPlusClusterer<Artist> clusterer = new KMeansPlusPlusClusterer<Artist>(kClusters); List<CentroidCluster<Artist>> clusters = clusterer.cluster(artists); //System.out.println(clusters.size()); dbHelper.emptyClusterCenters(); for (CentroidCluster<Artist> cluster : clusters) { double[] center = cluster.getCenter().getPoint(); ObjectId centerId = dbHelper.insertClusterCenter(center[0], center[1], center[2]); List<Artist> artC = cluster.getPoints(); for (Artist artist : artC) { dbHelper.updateMatrixRowCluster(artist.getDBObject(), centerId); //System.out.print("("+artist.fb_likes+","+artist.twitter_followers+","+artist.album_count+") "); } } }
From source file:edu.byu.nlp.data.app.AnnotationStream2Annotators.java
/** * This returns a set of clustered annotator parameters. Averaging them yields the centroid of the cluster. * Note that Annotator.clusterAssignment properties are change IN PLACE. */// w w w. j a v a 2 s . c om private static void assignKMeansClusters(List<Annotator> annotators, int k, int maxIterations, RandomGenerator rnd) { Preconditions.checkNotNull(annotators); Preconditions.checkArgument(annotators.size() > 0); KMeansPlusPlusClusterer<Annotator> clusterer = new KMeansPlusPlusClusterer<>(k, maxIterations, new EuclideanDistance(), rnd); List<CentroidCluster<Annotator>> clusterCentroids = clusterer.cluster(annotators); for (int c = 0; c < clusterCentroids.size(); c++) { for (Annotator annotator : clusterCentroids.get(c).getPoints()) { // note: we don't return the centroid point here because averaging the points in the cluster // yields precisely the centroid point. // stick this annotator in this location in the confusions annotator.clusterAssignment = c; } } }
From source file:bigdataproject.KMeansKFinder.java
public int find(double epsilon) { double oldAvDist = 0.0; for (int k = 2; k < numSamples; k++) { KMeansPlusPlusClusterer kmeans = new KMeansPlusPlusClusterer(k, 1000, new EuclideanDistance()); List<Cluster<DoublePoint>> clusterList = kmeans.cluster(list); double[] avDistances = new double[k]; int index = 0; for (Cluster<DoublePoint> c : clusterList) { List cluster = c.getPoints(); int size = cluster.size(); double[] centroid = getCentroid(cluster); double distanceSum = 0.0; for (Object p : cluster) { DoublePoint point = (DoublePoint) p; double[] pointDouble = point.getPoint(); EuclideanDistance dist = new EuclideanDistance(); distanceSum += dist.compute(centroid, pointDouble); }//from w w w .ja v a2 s. com avDistances[index] = distanceSum / size; index++; } double avDistSum = 0.0; for (int i = 0; i < avDistances.length; i++) { avDistSum += avDistances[i]; } double newAvDist = avDistSum / avDistances.length; double difference = Math.abs(newAvDist - oldAvDist); if (difference >= epsilon) { oldAvDist = newAvDist; } else return k - 1; } return 0; }
From source file:indexer.DocClusterer.java
/**
 * Clusters the word vectors of the current document with k-means++.
 *
 * @param wvecMap     word vectors keyed by word; only the values are clustered
 * @param numClusters requested number of clusters; capped at the number of word vectors
 * @return the clusters, or {@code null} when {@code wvecMap} holds no vectors
 * @throws Exception declared by the original signature
 */
public List<CentroidCluster<WordVec>> clusterWords(HashMap<String, WordVec> wvecMap, int numClusters)
        throws Exception {
    System.out.println("Clustering document: " + dvec.getDocId());

    List<WordVec> vectors = new ArrayList<>(wvecMap.values());
    if (vectors.isEmpty()) {
        return null;
    }

    int k = Math.min(numClusters, vectors.size());
    return new KMeansPlusPlusClusterer<WordVec>(k).cluster(vectors);
}
From source file:msi.gaml.operators.Stats.java
@operator(value = "kmeans", can_be_const = false, type = IType.LIST, category = { IOperatorCategory.STATISTICAL }, concept = { IConcept.STATISTIC, IConcept.CLUSTERING }) @doc(value = "returns the list of clusters (list of instance indices) computed with the kmeans++ algorithm from the first operand data according to the number of clusters to split the data into (k) and the maximum number of iterations to run the algorithm for (If negative, no maximum will be used) (maxIt). Usage: kmeans(data,k,maxit)", special_cases = "if the lengths of two vectors in the right-hand aren't equal, returns 0", examples = { @example(value = "kmeans ([[2,4,5], [3,8,2], [1,1,3], [4,3,4]],2,10)", isExecutable = false) }) public static GamaList<GamaList> KMeansPlusplusApache(final IScope scope, final GamaList data, final Integer k, final Integer maxIt) throws GamaRuntimeException { final MersenneTwister rand = new MersenneTwister(scope.getRandom().getSeed().longValue()); final List<DoublePoint> instances = new ArrayList<>(); for (int i = 0; i < data.size(); i++) { final GamaList d = (GamaList) data.get(i); final double point[] = new double[d.size()]; for (int j = 0; j < d.size(); j++) { point[j] = Cast.asFloat(scope, d.get(j)); }// ww w . j a va 2 s.c om instances.add(new Instance(i, point)); } final KMeansPlusPlusClusterer<DoublePoint> kmeans = new KMeansPlusPlusClusterer<>(k, maxIt, new EuclideanDistance(), rand); final List<CentroidCluster<DoublePoint>> clusters = kmeans.cluster(instances); final GamaList results = (GamaList) GamaListFactory.create(); for (final Cluster<DoublePoint> cl : clusters) { final GamaList clG = (GamaList) GamaListFactory.create(); for (final DoublePoint pt : cl.getPoints()) { clG.addValue(scope, ((Instance) pt).getId()); } results.addValue(scope, clG); } return results; }
From source file:gedi.atac.Atac.java
public static void testInPeaks(GenomicRegionStorage<? extends AlignedReadsData> storage, String contrasts, String peakFile, String rmq, String compOut, String bicOut, String out, boolean randomizeContrasts) throws IOException { DiskGenomicNumericBuilder clusterRmq = new DiskGenomicNumericBuilder(rmq); LineIterator it = new LineOrientedFile(peakFile).lineIterator(); LineOrientedFile o = new LineOrientedFile(out); o.startWriting();// w w w.jav a2s.c o m o.writef("%s\tComponents\tp.value\n", it.next()); int offset = 4; ContrastMapping contr = new ContrastMapping(); ExtendedIterator<String> coit = new LineOrientedFile(contrasts).lineIterator(); if (randomizeContrasts) { String[] ca = coit.toArray(new String[0]); ArrayUtils.shuffleSlice(ca, 0, ca.length); coit = FunctorUtils.arrayIterator(ca); } coit.forEachRemaining( l -> contr.addMapping(contr.getNumOriginalConditions(), contr.getMappedIndexOrNext(l), l)); LineOrientedFile co = new LineOrientedFile(compOut); co.startWriting(); co.writef("Peak\tComponent"); for (int i = 0; i < contr.getNumMergedConditions(); i++) co.writef("\t%s", contr.getMappedName(i)); co.writeLine(); LineOrientedFile bico = new LineOrientedFile(bicOut); bico.startWriting(); bico.writef("Peak\tk\tBIC\n"); Progress pr = new ConsoleProgress(); pr.init(); int peakCount = (int) new LineOrientedFile(peakFile).lineIterator().count() - 1; pr.setCount(peakCount); while (it.hasNext()) { String line = it.next(); ImmutableReferenceGenomicRegion<Object> peak = ImmutableReferenceGenomicRegion .parse(StringUtils.splitField(line, '\t', 0)); pr.setDescription(peak.toString()); pr.incrementProgress(); HashMap<FixedDoublePoint, Integer> pToPos = new HashMap<FixedDoublePoint, Integer>(); FixedDoublePoint[] m = new FixedDoublePoint[peak.getRegion().getTotalLength()]; for (int i = 0; i < m.length; i++) { m[i] = new FixedDoublePoint(new double[contr.getNumMergedConditions()]); pToPos.put(m[i], peak.getRegion().map(i)); } Consumer<MutableReferenceGenomicRegion<? 
extends AlignedReadsData>> adder = new Consumer<MutableReferenceGenomicRegion<? extends AlignedReadsData>>() { @Override public void accept(MutableReferenceGenomicRegion<? extends AlignedReadsData> mrgr) { try { int start = GenomicRegionPosition.Start.position(mrgr.getReference(), mrgr.getRegion(), offset); if (peak.getRegion().contains(start)) addDownsampled(contr, m[peak.getRegion().induce(start)].getPoint(), mrgr.getData().getTotalCountsForConditions(ReadCountMode.All)); int stop = GenomicRegionPosition.Stop.position(mrgr.getReference(), mrgr.getRegion(), -offset); if (peak.getRegion().contains(stop)) addDownsampled(contr, m[peak.getRegion().induce(stop)].getPoint(), mrgr.getData().getTotalCountsForConditions(ReadCountMode.All)); } catch (Exception e) { throw new RuntimeException(e); } } private void addDownsampled(ContrastMapping contr, double[] re, double[] c) { double max = ArrayUtils.max(c); if (max > 0) ArrayUtils.mult(c, 1 / max); for (int i = 0; i < c.length; i++) if (contr.getMappedIndex(i) > -1) re[contr.getMappedIndex(i)] += c[i]; } }; storage.iterateIntersectingMutableReferenceGenomicRegions(peak.getReference().toPlusStrand(), peak.getRegion()).forEachRemaining(adder); storage.iterateIntersectingMutableReferenceGenomicRegions(peak.getReference().toMinusStrand(), peak.getRegion()).forEachRemaining(adder); // double[] total = new double[cond]; // for (int i=0; i<m.length; i++) // for (int j=0; j<cond; j++) // total[j]+=m[i].getPoint()[j]; // ArrayUtils.normalize(total); // // double ll = 0; // for (int i=0; i<m.length; i++) // ll+=ddirichlet1(m[i].getPoint(), total); // DoubleArrayList ll = new DoubleArrayList(); ll.add(0); DoubleArrayList bic = new DoubleArrayList(); bic.add(0); ArrayList<FixedDoublePoint> list = new ArrayList<FixedDoublePoint>(); for (FixedDoublePoint p : m) if (ArrayUtils.sum(p.getPoint()) > 0) list.add(p); List<CentroidCluster<FixedDoublePoint>> ocl = null; double op = 0; for (int k = 1; k < Math.min(list.size(), 50); k++) { 
KMeansPlusPlusClusterer<FixedDoublePoint> kmeans = new KMeansPlusPlusClusterer<FixedDoublePoint>(k); List<CentroidCluster<FixedDoublePoint>> cl = kmeans.cluster(list); double cll = 0; for (CentroidCluster<FixedDoublePoint> c : cl) { double[] total = new double[contr.getNumMergedConditions()]; Arrays.fill(total, 1); for (FixedDoublePoint p : c.getPoints()) for (int j = 0; j < contr.getNumMergedConditions(); j++) total[j] += p.getPoint()[j]; ArrayUtils.normalize(total); for (FixedDoublePoint p : c.getPoints()) cll += ddirichlet1(p.getPoint(), total); } // LLR test double LLR = 2 * cll - 2 * ll.getLastDouble(); double p = 1 - new ChiSquaredDistribution(contr.getNumMergedConditions() - 1).cumulativeProbability(LLR); bic.add(-2 * cll + 2 * (contr.getNumMergedConditions() - 1) * k); bico.writef("%s\t%d\t%.1f\n", peak.toLocationString(), k, bic.getLastDouble()); // bonferroni correction p = p * peakCount; if (p > 0.01) { if (ocl.size() > 1) { for (int i = 0; i < ocl.size(); i++) { co.writef("%s\t%d", peak.toLocationString(), i); double[] total = new double[contr.getNumMergedConditions()]; Arrays.fill(total, 1); for (FixedDoublePoint pp : ocl.get(i).getPoints()) { clusterRmq.addValue(peak.getReference(), pToPos.get(pp).intValue(), (byte) i); for (int j = 0; j < contr.getNumMergedConditions(); j++) total[j] += pp.getPoint()[j]; } ArrayUtils.normalize(total); for (int c = 0; c < contr.getNumMergedConditions(); c++) co.writef("\t%.4f", total[c]); co.writeLine(); } } break; } ll.add(cll); ocl = cl; op = p; } o.writef("%s\t%d\t%.4g\n", line, ll.size() - 1, ll.size() == 2 ? Double.NaN : op); } pr.finish(); o.finishWriting(); co.finishWriting(); clusterRmq.build(); }
From source file:bigdataproject.MainJFrame.java
private void jButton1ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton1ActionPerformed this.jLabel8.setText(""); ReadDataSet read = new ReadDataSet(); read.readFromFile();//from w w w .j ava 2 s. c om read.filter(); matrix = read.getMatrix(); PCA pca = new PCA(matrix); double[][] matrix2DPCA = pca.reduceDimensions(); BlockRealMatrix pcaMatrix = new BlockRealMatrix(matrix2DPCA); BlockRealMatrix pcaMatrixTranspose = pcaMatrix.transpose(); List<DoublePoint> list = read.getCollection(read.getHashMap(pcaMatrixTranspose.getData())); List<Cluster<DoublePoint>> clusterList; if (kMeans) { int k; if (this.jCheckBox1.isSelected()) { KMeansKFinder kFinder = new KMeansKFinder(list); k = kFinder.find(0.15); } else k = (int) this.jSpinner1.getValue(); KMeansPlusPlusClusterer kmeans = new KMeansPlusPlusClusterer(k, 1000, new EuclideanDistance()); clusterList = kmeans.cluster(list); } else { int minPts; double eps; if (this.jCheckBox2.isSelected()) { minPts = 6; //KDistances dist = new KDistances(pcaMatrixTranspose.getData()); //dist.calculateDistances(); //dist.getKSortedNearestNeighbors(minPts); //dist.printKdistances(); eps = 1.0; } else { minPts = (int) this.jSpinner2.getValue(); try { eps = Double.parseDouble(this.jTextField1.getText()); } catch (NumberFormatException e) { this.jLabel8.setText("Wrong eps Value"); return; } } DBSCANClusterer dbscan = new DBSCANClusterer(eps, minPts); clusterList = dbscan.cluster(list); } final ScatterPlot demo = new ScatterPlot("Big Data Clustering Project", matrix2DPCA, clusterList); demo.pack(); RefineryUtilities.centerFrameOnScreen(demo); demo.setVisible(true); }
From source file:ec.coevolve.MultiPopCoevolutionaryEvaluatorExtra.java
protected Individual[] behaviourElite(EvolutionState state, int subpop) { // Generate the dataset ArrayList<IndividualClusterable> points = new ArrayList<IndividualClusterable>(); if (novelChampionsOrigin == NovelChampionsOrigin.halloffame) { for (int i = 0; i < hallOfFame[subpop].size(); i++) { points.add(new IndividualClusterable(hallOfFame[subpop].get(i), i)); }// w w w .ja v a2 s . c o m } else if (novelChampionsOrigin == NovelChampionsOrigin.archive) { for (ArchiveEntry ae : archives[subpop]) { points.add(new IndividualClusterable(ae.getIndividual(), ae.getGeneration())); } } // Cap -- only use the individuals with the highest fitness scores if (novelChampionsCap > 0) { // calculate the percentile DescriptiveStatistics ds = new DescriptiveStatistics(); for (IndividualClusterable ic : points) { ds.addValue(ic.getFitness()); } double percentile = ds.getPercentile(novelChampionsCap); // remove those below the percentile Iterator<IndividualClusterable> iter = points.iterator(); while (iter.hasNext()) { IndividualClusterable next = iter.next(); if (next.getFitness() < percentile) { iter.remove(); } } } // Check if there are enough points for clustering if (points.size() <= novelChampions) { Individual[] elite = new Individual[points.size()]; for (int i = 0; i < elite.length; i++) { elite[i] = points.get(i).getIndividual(); } return elite; } // Do the k-means clustering KMeansPlusPlusClusterer<IndividualClusterable> clusterer = new KMeansPlusPlusClusterer<IndividualClusterable>( novelChampions, 100); List<CentroidCluster<IndividualClusterable>> clusters = clusterer.cluster(points); // Return one from each cluster Individual[] elite = new Individual[novelChampions]; for (int i = 0; i < clusters.size(); i++) { CentroidCluster<IndividualClusterable> cluster = clusters.get(i); List<IndividualClusterable> clusterPoints = cluster.getPoints(); if (novelChampionsMode == NovelChampionsMode.random) { int randIndex = state.random[0].nextInt(clusterPoints.size()); elite[i] = 
clusterPoints.get(randIndex).getIndividual(); } else if (novelChampionsMode == NovelChampionsMode.last) { IndividualClusterable oldest = null; for (IndividualClusterable ic : clusterPoints) { if (oldest == null || ic.age > oldest.age) { oldest = ic; } } elite[i] = oldest.getIndividual(); } else if (novelChampionsMode == NovelChampionsMode.centroid) { DistanceMeasure dm = clusterer.getDistanceMeasure(); double[] centroid = cluster.getCenter().getPoint(); IndividualClusterable closest = null; double closestDist = Double.MAX_VALUE; for (IndividualClusterable ic : clusterPoints) { double dist = dm.compute(centroid, ic.getPoint()); if (dist < closestDist) { closestDist = dist; closest = ic; } } elite[i] = closest.getIndividual(); } else if (novelChampionsMode == NovelChampionsMode.best) { IndividualClusterable best = null; float highestFit = Float.NEGATIVE_INFINITY; for (IndividualClusterable ic : clusterPoints) { if (ic.getFitness() > highestFit) { best = ic; highestFit = ic.getFitness(); } } elite[i] = best.getIndividual(); } } return elite; }
From source file:edu.nyu.vida.data_polygamy.ctdata.TopologicalIndex.java
/**
 * Derives a weight threshold by splitting the feature weights into two k-means++ clusters
 * and returning the smallest weight of the cluster that contains the largest weight.
 *
 * @param f features whose {@code wt} values are clustered; must contain at least one element
 * @return 0.4 times the only weight when a single feature is given; otherwise the minimum
 *         weight within the cluster holding the maximum weight
 * @throws IllegalArgumentException if {@code f} is empty
 */
public double getThreshold(Feature[] f) {
    if (f.length == 0) {
        // The original read f[0] here and failed with an opaque ArrayIndexOutOfBoundsException.
        throw new IllegalArgumentException("getThreshold requires at least one feature");
    }
    if (f.length < 2) {
        return f[0].wt * 0.4;
    }

    ArrayList<DoublePoint> pts = new ArrayList<DoublePoint>(f.length);
    for (Feature feature : f) {
        pts.add(new DoublePoint(new double[] { feature.wt }));
    }

    // Built only when clustering is actually needed.
    KMeansPlusPlusClusterer<DoublePoint> kmeans = new KMeansPlusPlusClusterer<DoublePoint>(2, 1000);
    List<CentroidCluster<DoublePoint>> clusters = kmeans.cluster(pts);

    // NOTE(review): starting the maxima at 0 presumes non-negative weights - confirm.
    double maxp = 0;
    double minp = 0;
    int ct = 0;
    for (CentroidCluster<DoublePoint> c : clusters) {
        double mp = 0;
        double mnp = Double.MAX_VALUE;
        for (DoublePoint dpt : c.getPoints()) {
            double w = dpt.getPoint()[0];
            mp = Math.max(mp, w);
            mnp = Math.min(mnp, w);
        }
        // Keep the minimum of whichever cluster reaches the highest weight.
        if (mp > maxp) {
            maxp = mp;
            minp = mnp;
        }
        ct++;
    }
    if (ct > 2) {
        Utilities.er("Can there be > 2 clusters?");
    }
    return minp;
}
From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java
/** * Uses an existing index, where each and every document should have a set of local features. A number of * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words * (the cluster means). For all images a histogram on the visual words is created and added to the documents. * Pre-existing histograms are deleted, so this method can be used for re-indexing. * * @throws java.io.IOException//from ww w.j av a2 s.c o m */ public void index() throws IOException { df.setMaximumFractionDigits(3); // find the documents for building the vocabulary: HashSet<Integer> docIDs = selectVocabularyDocs(); System.out.println("Using " + docIDs.size() + " documents to build the vocabulary."); KMeansPlusPlusClusterer kpp = new KMeansPlusPlusClusterer(numClusters, 15); // fill the KMeans object: LinkedList<DoublePoint> features = new LinkedList<DoublePoint>(); // Needed for check whether the document is deleted. Bits liveDocs = MultiFields.getLiveDocs(reader); for (Iterator<Integer> iterator = docIDs.iterator(); iterator.hasNext();) { int nextDoc = iterator.next(); if (reader.hasDeletions() && !liveDocs.get(nextDoc)) continue; // if it is deleted, just ignore it. Document d = reader.document(nextDoc); // features.clear(); IndexableField[] fields = d.getFields(localFeatureFieldName); String file = d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]; for (int j = 0; j < fields.length; j++) { LireFeature f = getFeatureInstance(); f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); // copy the data over to new array ... double[] feat = new double[f.getDoubleHistogram().length]; System.arraycopy(f.getDoubleHistogram(), 0, feat, 0, feat.length); features.add(new DoublePoint(f.getDoubleHistogram())); } } if (features.size() < numClusters) { // this cannot work. You need more data points than clusters. 
throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in " + numClusters + ". Try to use less clusters or more images."); } // do the clustering: System.out.println("Number of local features: " + df.format(features.size())); System.out.println("Starting clustering ..."); List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features); // TODO: Serializing clusters to a file on the disk ... System.out.println("Clustering finished, " + clusterList.size() + " clusters found"); clusters = new LinkedList<double[]>(); for (Iterator<CentroidCluster<DoublePoint>> iterator = clusterList.iterator(); iterator.hasNext();) { CentroidCluster<DoublePoint> centroidCluster = iterator.next(); clusters.add(centroidCluster.getCenter().getPoint()); } System.out.println("Creating histograms ..."); int[] tmpHist = new int[numClusters]; IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d); // careful: copy reader to RAM for faster access when reading ... // reader = IndexReader.open(new RAMDirectory(reader.directory()), true); LireFeature f = getFeatureInstance(); for (int i = 0; i < reader.maxDoc(); i++) { try { if (reader.hasDeletions() && !liveDocs.get(i)) continue; for (int j = 0; j < tmpHist.length; j++) { tmpHist[j] = 0; } Document d = reader.document(i); IndexableField[] fields = d.getFields(localFeatureFieldName); // remove the fields if they are already there ... 
d.removeField(visualWordsFieldName); d.removeField(localFeatureHistFieldName); // find the appropriate cluster for each feature: for (int j = 0; j < fields.length; j++) { f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset, fields[j].binaryValue().length); tmpHist[clusterForFeature(f, clusters)]++; } // System.out.println(Arrays.toString(tmpHist)); d.add(new StoredField(localFeatureHistFieldName, SerializationUtils.toByteArray(normalize(tmpHist)))); quantize(tmpHist); d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES)); // remove local features to save some space if requested: if (DELETE_LOCAL_FEATURES) { d.removeFields(localFeatureFieldName); } // now write the new one. we use the identifier to update ;) iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER, d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d); } catch (IOException e) { e.printStackTrace(); } } iw.commit(); // this one does the "old" commit(), it removes the deleted local features. iw.forceMerge(1); iw.close(); System.out.println("Finished."); }