Example usage for org.apache.commons.math3.ml.clustering KMeansPlusPlusClusterer cluster

Introduction

On this page you can find example usages of the cluster method of org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer.

Prototype

@Override
public List<CentroidCluster<T>> cluster(final Collection<T> points)
        throws MathIllegalArgumentException, ConvergenceException

Source Link

Document

Runs the K-means++ clustering algorithm.

Usage

From source file:clustering.KMeans.java

/**
 * Clusters the artists returned by {@code DBHelper.findArtistsWithFBandTW()} with K-means++
 * and writes the resulting cluster centers and per-artist assignments back through
 * {@code DBHelper}.
 *
 * <p>Exactly one command-line argument is expected: the number of clusters. With any other
 * argument count a usage line is printed and the JVM exits with status -1.
 *
 * @param args command-line arguments; {@code args[0]} is parsed as the number of clusters
 * @throws UnknownHostException declared by the original signature; presumably raised by the
 *         database helper when the host cannot be resolved (not visible here)
 */
public static void main(String[] args) throws UnknownHostException {
    if (args.length != 1) {
        System.out.println("Usage : KMeans <nrClusters>");
        System.exit(-1);
    }

    int kClusters = Integer.parseInt(args[0]);

    List<Artist> artists = new ArrayList<Artist>();
    DBHelper dbHelper = DBHelper.getInstance();
    DBCursor result = dbHelper.findArtistsWithFBandTW();
    try {
        while (result.hasNext()) {
            DBObject currentArtist = result.next();
            artists.add(Artist.fromDBObject(currentArtist));
        }
    } finally {
        // Release the cursor even if converting one of the documents fails.
        result.close();
    }

    KMeansPlusPlusClusterer<Artist> clusterer = new KMeansPlusPlusClusterer<Artist>(kClusters);
    List<CentroidCluster<Artist>> clusters = clusterer.cluster(artists);

    // Previously stored centers are discarded before the new ones are inserted.
    dbHelper.emptyClusterCenters();

    for (CentroidCluster<Artist> cluster : clusters) {
        // Only the first three coordinates of the center are persisted.
        double[] center = cluster.getCenter().getPoint();
        ObjectId centerId = dbHelper.insertClusterCenter(center[0], center[1], center[2]);

        for (Artist artist : cluster.getPoints()) {
            dbHelper.updateMatrixRowCluster(artist.getDBObject(), centerId);
        }
    }
}

From source file:edu.byu.nlp.data.app.AnnotationStream2Annotators.java

/**
 * Clusters the given annotators with K-means++ and stores, on every annotator, the index of
 * the cluster it ended up in. Nothing is returned: {@code Annotator.clusterAssignment} is
 * changed IN PLACE.
 *
 * @param annotators    the annotators to cluster; must be non-null and non-empty
 * @param k             number of clusters handed to the clusterer
 * @param maxIterations iteration limit handed to the clusterer
 * @param rnd           random generator handed to the clusterer
 */
private static void assignKMeansClusters(List<Annotator> annotators, int k, int maxIterations,
        RandomGenerator rnd) {
    Preconditions.checkNotNull(annotators);
    Preconditions.checkArgument(annotators.size() > 0);

    KMeansPlusPlusClusterer<Annotator> kmeans = new KMeansPlusPlusClusterer<>(k, maxIterations,
            new EuclideanDistance(), rnd);
    List<CentroidCluster<Annotator>> groups = kmeans.cluster(annotators);

    // The centroid itself is not stored anywhere: only the position of each cluster in the
    // result list is recorded on its members.
    int clusterIndex = 0;
    for (CentroidCluster<Annotator> group : groups) {
        for (Annotator member : group.getPoints()) {
            member.clusterAssignment = clusterIndex;
        }
        clusterIndex++;
    }
}

From source file:bigdataproject.KMeansKFinder.java

/**
 * Searches for a cluster count by running K-means++ for k = 2, 3, ... and stopping as soon as
 * the mean of the per-cluster average point-to-centroid distances changes by less than
 * {@code epsilon} from one k to the next.
 *
 * @param epsilon minimum change between consecutive k values that counts as an improvement
 * @return {@code k - 1} for the first k whose mean distance differs from the previous one by
 *         less than {@code epsilon}; {@code 0} if no such k is found below {@code numSamples}
 */
public int find(double epsilon) {
    // One distance instance is shared by the clusterer and by all distance computations
    // instead of allocating a new one for every point.
    final EuclideanDistance distance = new EuclideanDistance();
    double oldAvDist = 0.0;
    for (int k = 2; k < numSamples; k++) {
        KMeansPlusPlusClusterer<DoublePoint> kmeans =
                new KMeansPlusPlusClusterer<DoublePoint>(k, 1000, distance);
        List<? extends Cluster<DoublePoint>> clusterList = kmeans.cluster(list);

        // Sum of the per-cluster average distances, accumulated in cluster order.
        double avDistSum = 0.0;
        for (Cluster<DoublePoint> c : clusterList) {
            List<DoublePoint> cluster = c.getPoints();
            double[] centroid = getCentroid(cluster);
            double distanceSum = 0.0;
            for (DoublePoint point : cluster) {
                distanceSum += distance.compute(centroid, point.getPoint());
            }
            avDistSum += distanceSum / cluster.size();
        }

        // The mean is taken over k, the requested number of clusters.
        double newAvDist = avDistSum / k;
        double difference = Math.abs(newAvDist - oldAvDist);
        if (difference < epsilon) {
            return k - 1;
        }
        oldAvDist = newAvDist;
    }
    return 0;
}

From source file:indexer.DocClusterer.java

/**
 * Clusters the word vectors of the given map with K-means++.
 *
 * <p>The number of clusters is capped at the number of word vectors.
 *
 * @param wvecMap     word vectors to cluster, keyed by string; only the values are used
 * @param numClusters requested number of clusters
 * @return the clusters, or {@code null} when {@code wvecMap} contains no entries
 * @throws Exception declared by the signature
 */
public List<CentroidCluster<WordVec>> clusterWords(HashMap<String, WordVec> wvecMap, int numClusters)
        throws Exception {
    System.out.println("Clustering document: " + dvec.getDocId());

    List<WordVec> words = new ArrayList<>(wvecMap.values());
    if (words.isEmpty()) {
        return null;
    }

    int k = Math.min(numClusters, words.size());
    KMeansPlusPlusClusterer<WordVec> kmeans = new KMeansPlusPlusClusterer<>(k);
    return kmeans.cluster(words);
}

From source file:msi.gaml.operators.Stats.java

/**
 * Implementation of the {@code kmeans} operator: converts every row of {@code data} into a
 * point, clusters the points with K-means++ and returns, per cluster, the list of row indices
 * that were assigned to it.
 *
 * @param scope the current scope; supplies the random seed and is used for value casting
 * @param data  list of rows, each row itself a list of numeric values
 * @param k     number of clusters
 * @param maxIt maximum number of iterations passed to the clusterer
 * @return a list with one inner list of row indices per cluster
 */
@operator(value = "kmeans", can_be_const = false, type = IType.LIST, category = {
        IOperatorCategory.STATISTICAL }, concept = { IConcept.STATISTIC, IConcept.CLUSTERING })
@doc(value = "returns the list of clusters (list of instance indices) computed with the kmeans++ algorithm from the first operand data according to the number of clusters to split the data into (k) and the maximum number of iterations to run the algorithm for (If negative, no maximum will be used) (maxIt). Usage: kmeans(data,k,maxit)", special_cases = "if the lengths of two vectors in the right-hand aren't equal, returns 0", examples = {
        @example(value = "kmeans ([[2,4,5], [3,8,2], [1,1,3], [4,3,4]],2,10)", isExecutable = false) })
public static GamaList<GamaList> KMeansPlusplusApache(final IScope scope, final GamaList data, final Integer k,
        final Integer maxIt) throws GamaRuntimeException {
    // Seed the generator from the scope's random seed.
    final MersenneTwister rng = new MersenneTwister(scope.getRandom().getSeed().longValue());

    // Each row becomes an Instance that remembers its index in the input list.
    final List<DoublePoint> instances = new ArrayList<>();
    for (int row = 0; row < data.size(); row++) {
        final GamaList values = (GamaList) data.get(row);
        final int dims = values.size();
        final double[] coords = new double[dims];
        for (int col = 0; col < dims; col++) {
            coords[col] = Cast.asFloat(scope, values.get(col));
        }
        instances.add(new Instance(row, coords));
    }

    final KMeansPlusPlusClusterer<DoublePoint> clusterer = new KMeansPlusPlusClusterer<>(k, maxIt,
            new EuclideanDistance(), rng);
    final List<CentroidCluster<DoublePoint>> found = clusterer.cluster(instances);

    // Translate every cluster back into the ids of its members.
    final GamaList results = (GamaList) GamaListFactory.create();
    for (final Cluster<DoublePoint> group : found) {
        final GamaList ids = (GamaList) GamaListFactory.create();
        for (final DoublePoint member : group.getPoints()) {
            ids.addValue(scope, ((Instance) member).getId());
        }
        results.addValue(scope, ids);
    }
    return results;
}

From source file:gedi.atac.Atac.java

/**
 * For every peak listed in {@code peakFile}, collects per-position count vectors (one entry per
 * merged condition) from the read ends found in {@code storage}, clusters the non-empty
 * positions with K-means++ for increasing k and uses a likelihood-ratio test between
 * consecutive k values to decide how many components the peak has.
 *
 * <p>Outputs written:
 * <ul>
 * <li>{@code out}: every peak line extended by the number of components and a p-value;</li>
 * <li>{@code compOut}: for peaks with more than one component, the normalized condition
 * profile of each component;</li>
 * <li>{@code bicOut}: the BIC value for every tested k of every peak;</li>
 * <li>{@code rmq}: the component index of every clustered position.</li>
 * </ul>
 *
 * @param storage            source of the aligned reads
 * @param contrasts          path of a file with one contrast label per line
 * @param peakFile           path of the tab-separated peak file; the first line is treated as a
 *                           header and the first field of every other line is parsed as a region
 * @param rmq                path handed to the {@code DiskGenomicNumericBuilder}
 * @param compOut            path of the component profile output
 * @param bicOut             path of the BIC output
 * @param out                path of the per-peak summary output
 * @param randomizeContrasts if true, the contrast lines are shuffled before being mapped
 * @throws IOException if reading or writing one of the files fails
 */
public static void testInPeaks(GenomicRegionStorage<? extends AlignedReadsData> storage, String contrasts,
        String peakFile, String rmq, String compOut, String bicOut, String out, boolean randomizeContrasts)
        throws IOException {

    DiskGenomicNumericBuilder clusterRmq = new DiskGenomicNumericBuilder(rmq);
    LineIterator it = new LineOrientedFile(peakFile).lineIterator();
    LineOrientedFile o = new LineOrientedFile(out);
    o.startWriting();
    // The header of the peak file is reused and extended by the two new columns.
    o.writef("%s\tComponents\tp.value\n", it.next());
    int offset = 4;

    ContrastMapping contr = new ContrastMapping();
    ExtendedIterator<String> coit = new LineOrientedFile(contrasts).lineIterator();
    if (randomizeContrasts) {
        String[] ca = coit.toArray(new String[0]);
        ArrayUtils.shuffleSlice(ca, 0, ca.length);
        coit = FunctorUtils.arrayIterator(ca);
    }
    coit.forEachRemaining(
            l -> contr.addMapping(contr.getNumOriginalConditions(), contr.getMappedIndexOrNext(l), l));

    LineOrientedFile co = new LineOrientedFile(compOut);
    co.startWriting();
    co.writef("Peak\tComponent");
    for (int i = 0; i < contr.getNumMergedConditions(); i++)
        co.writef("\t%s", contr.getMappedName(i));
    co.writeLine();

    LineOrientedFile bico = new LineOrientedFile(bicOut);
    bico.startWriting();
    bico.writef("Peak\tk\tBIC\n");

    Progress pr = new ConsoleProgress();
    pr.init();
    // Number of peaks = number of lines minus the header; also used for the Bonferroni correction.
    int peakCount = (int) new LineOrientedFile(peakFile).lineIterator().count() - 1;
    pr.setCount(peakCount);

    while (it.hasNext()) {
        String line = it.next();
        ImmutableReferenceGenomicRegion<Object> peak = ImmutableReferenceGenomicRegion
                .parse(StringUtils.splitField(line, '\t', 0));

        pr.setDescription(peak.toString());
        pr.incrementProgress();

        // One count vector per position of the peak, plus a lookup from vector to position.
        HashMap<FixedDoublePoint, Integer> pToPos = new HashMap<FixedDoublePoint, Integer>();
        FixedDoublePoint[] m = new FixedDoublePoint[peak.getRegion().getTotalLength()];
        for (int i = 0; i < m.length; i++) {
            m[i] = new FixedDoublePoint(new double[contr.getNumMergedConditions()]);
            pToPos.put(m[i], peak.getRegion().map(i));
        }

        Consumer<MutableReferenceGenomicRegion<? extends AlignedReadsData>> adder = new Consumer<MutableReferenceGenomicRegion<? extends AlignedReadsData>>() {

            @Override
            public void accept(MutableReferenceGenomicRegion<? extends AlignedReadsData> mrgr) {
                try {
                    // Both read ends, shifted by the offset, contribute if they fall into the peak.
                    int start = GenomicRegionPosition.Start.position(mrgr.getReference(), mrgr.getRegion(),
                            offset);
                    if (peak.getRegion().contains(start))
                        addDownsampled(contr, m[peak.getRegion().induce(start)].getPoint(),
                                mrgr.getData().getTotalCountsForConditions(ReadCountMode.All));

                    int stop = GenomicRegionPosition.Stop.position(mrgr.getReference(), mrgr.getRegion(),
                            -offset);
                    if (peak.getRegion().contains(stop))
                        addDownsampled(contr, m[peak.getRegion().induce(stop)].getPoint(),
                                mrgr.getData().getTotalCountsForConditions(ReadCountMode.All));
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }

            // Scales c by its maximum (if positive) and adds it to re, per mapped condition.
            private void addDownsampled(ContrastMapping contr, double[] re, double[] c) {
                double max = ArrayUtils.max(c);
                if (max > 0)
                    ArrayUtils.mult(c, 1 / max);
                for (int i = 0; i < c.length; i++)
                    if (contr.getMappedIndex(i) > -1)
                        re[contr.getMappedIndex(i)] += c[i];
            }
        };
        storage.iterateIntersectingMutableReferenceGenomicRegions(peak.getReference().toPlusStrand(),
                peak.getRegion()).forEachRemaining(adder);
        storage.iterateIntersectingMutableReferenceGenomicRegions(peak.getReference().toMinusStrand(),
                peak.getRegion()).forEachRemaining(adder);

        // ll and bic hold one entry per accepted k, with a leading 0 as the baseline.
        DoubleArrayList ll = new DoubleArrayList();
        ll.add(0);
        DoubleArrayList bic = new DoubleArrayList();
        bic.add(0);

        // Only positions with at least some signal are clustered.
        ArrayList<FixedDoublePoint> list = new ArrayList<FixedDoublePoint>();
        for (FixedDoublePoint p : m)
            if (ArrayUtils.sum(p.getPoint()) > 0)
                list.add(p);

        // Clustering and p-value of the last accepted k; ocl stays null until a k is accepted.
        List<CentroidCluster<FixedDoublePoint>> ocl = null;
        double op = 0;

        for (int k = 1; k < Math.min(list.size(), 50); k++) {

            KMeansPlusPlusClusterer<FixedDoublePoint> kmeans = new KMeansPlusPlusClusterer<FixedDoublePoint>(k);
            List<CentroidCluster<FixedDoublePoint>> cl = kmeans.cluster(list);

            double cll = 0;
            for (CentroidCluster<FixedDoublePoint> c : cl) {
                // Totals start at 1 per condition before the cluster's points are added.
                double[] total = new double[contr.getNumMergedConditions()];
                Arrays.fill(total, 1);
                for (FixedDoublePoint p : c.getPoints())
                    for (int j = 0; j < contr.getNumMergedConditions(); j++)
                        total[j] += p.getPoint()[j];
                ArrayUtils.normalize(total);

                for (FixedDoublePoint p : c.getPoints())
                    cll += ddirichlet1(p.getPoint(), total);
            }

            // LLR test
            double LLR = 2 * cll - 2 * ll.getLastDouble();
            double p = 1
                    - new ChiSquaredDistribution(contr.getNumMergedConditions() - 1).cumulativeProbability(LLR);

            bic.add(-2 * cll + 2 * (contr.getNumMergedConditions() - 1) * k);
            bico.writef("%s\t%d\t%.1f\n", peak.toLocationString(), k, bic.getLastDouble());

            // bonferroni correction
            p = p * peakCount;

            if (p > 0.01) {
                // This k is rejected; report the previously accepted clustering, if any.
                // ocl is still null when the very first k is rejected, so guard against that.
                if (ocl != null && ocl.size() > 1) {
                    for (int i = 0; i < ocl.size(); i++) {
                        co.writef("%s\t%d", peak.toLocationString(), i);
                        double[] total = new double[contr.getNumMergedConditions()];
                        Arrays.fill(total, 1);
                        for (FixedDoublePoint pp : ocl.get(i).getPoints()) {
                            clusterRmq.addValue(peak.getReference(), pToPos.get(pp).intValue(), (byte) i);
                            for (int j = 0; j < contr.getNumMergedConditions(); j++)
                                total[j] += pp.getPoint()[j];
                        }
                        ArrayUtils.normalize(total);
                        for (int c = 0; c < contr.getNumMergedConditions(); c++)
                            co.writef("\t%.4f", total[c]);
                        co.writeLine();

                    }
                }
                break;
            }

            ll.add(cll);
            ocl = cl;
            op = p;
        }

        o.writef("%s\t%d\t%.4g\n", line, ll.size() - 1, ll.size() == 2 ? Double.NaN : op);
    }

    pr.finish();
    o.finishWriting();
    co.finishWriting();
    // The BIC output was opened above as well and must be finished like the other writers.
    bico.finishWriting();

    clusterRmq.build();
}

From source file:bigdataproject.MainJFrame.java

private void jButton1ActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_jButton1ActionPerformed
    // Button handler: loads and filters the data set, projects it with PCA, clusters the
    // projected points with either K-means++ or DBSCAN (depending on the kMeans flag) and
    // shows the result in a scatter plot window.
    this.jLabel8.setText("");

    ReadDataSet dataSet = new ReadDataSet();
    dataSet.readFromFile();
    dataSet.filter();
    matrix = dataSet.getMatrix();

    double[][] matrix2DPCA = new PCA(matrix).reduceDimensions();
    BlockRealMatrix transposed = new BlockRealMatrix(matrix2DPCA).transpose();
    List<DoublePoint> list = dataSet.getCollection(dataSet.getHashMap(transposed.getData()));

    List<Cluster<DoublePoint>> clusterList;
    if (!kMeans) {
        int minPts;
        double eps;
        if (!this.jCheckBox2.isSelected()) {
            // Manual DBSCAN parameters taken from the form.
            minPts = (int) this.jSpinner2.getValue();
            try {
                eps = Double.parseDouble(this.jTextField1.getText());
            } catch (NumberFormatException e) {
                this.jLabel8.setText("Wrong eps Value");
                return;
            }
        } else {
            // Fixed parameters used when the second check box is selected.
            minPts = 6;
            eps = 1.0;
        }
        DBSCANClusterer dbscan = new DBSCANClusterer(eps, minPts);
        clusterList = dbscan.cluster(list);
    } else {
        // k is either searched for by KMeansKFinder or read from the spinner.
        int k = this.jCheckBox1.isSelected()
                ? new KMeansKFinder(list).find(0.15)
                : (int) this.jSpinner1.getValue();
        KMeansPlusPlusClusterer kmeans = new KMeansPlusPlusClusterer(k, 1000, new EuclideanDistance());
        clusterList = kmeans.cluster(list);
    }

    final ScatterPlot plot = new ScatterPlot("Big Data Clustering Project", matrix2DPCA, clusterList);
    plot.pack();
    RefineryUtilities.centerFrameOnScreen(plot);
    plot.setVisible(true);
}

From source file:ec.coevolve.MultiPopCoevolutionaryEvaluatorExtra.java

/**
 * Picks elite individuals for a subpopulation by clustering candidate individuals with
 * K-means++ and taking one representative from each cluster.
 *
 * <p>Candidates come from the hall of fame or from the archive, depending on
 * {@code novelChampionsOrigin}. If {@code novelChampionsCap} is positive, candidates whose
 * fitness lies below that percentile are dropped first. If no more than
 * {@code novelChampions} candidates remain, all of them are returned without clustering.
 *
 * @param state  the evolution state; its first random generator is used in random mode
 * @param subpop index of the subpopulation
 * @return the selected individuals; entries stay {@code null} for clusters where
 *         {@code novelChampionsMode} matches none of the handled modes
 */
protected Individual[] behaviourElite(EvolutionState state, int subpop) {
    // Collect the candidate individuals.
    ArrayList<IndividualClusterable> candidates = new ArrayList<IndividualClusterable>();
    if (novelChampionsOrigin == NovelChampionsOrigin.halloffame) {
        for (int idx = 0; idx < hallOfFame[subpop].size(); idx++) {
            candidates.add(new IndividualClusterable(hallOfFame[subpop].get(idx), idx));
        }
    } else if (novelChampionsOrigin == NovelChampionsOrigin.archive) {
        for (ArchiveEntry entry : archives[subpop]) {
            candidates.add(new IndividualClusterable(entry.getIndividual(), entry.getGeneration()));
        }
    }

    // Optional cap: keep only candidates at or above the configured fitness percentile.
    if (novelChampionsCap > 0) {
        DescriptiveStatistics fitnessStats = new DescriptiveStatistics();
        for (IndividualClusterable candidate : candidates) {
            fitnessStats.addValue(candidate.getFitness());
        }
        double cutoff = fitnessStats.getPercentile(novelChampionsCap);

        for (Iterator<IndividualClusterable> it = candidates.iterator(); it.hasNext();) {
            if (it.next().getFitness() < cutoff) {
                it.remove();
            }
        }
    }

    // Too few candidates to cluster: return them all.
    if (candidates.size() <= novelChampions) {
        Individual[] all = new Individual[candidates.size()];
        int pos = 0;
        for (IndividualClusterable candidate : candidates) {
            all[pos++] = candidate.getIndividual();
        }
        return all;
    }

    // Cluster the candidates.
    KMeansPlusPlusClusterer<IndividualClusterable> clusterer = new KMeansPlusPlusClusterer<IndividualClusterable>(
            novelChampions, 100);
    List<CentroidCluster<IndividualClusterable>> clusters = clusterer.cluster(candidates);

    // Choose one representative per cluster according to the configured mode.
    Individual[] elite = new Individual[novelChampions];
    int slot = 0;
    for (CentroidCluster<IndividualClusterable> cluster : clusters) {
        List<IndividualClusterable> members = cluster.getPoints();
        if (novelChampionsMode == NovelChampionsMode.random) {
            // A uniformly chosen member.
            int pick = state.random[0].nextInt(members.size());
            elite[slot] = members.get(pick).getIndividual();
        } else if (novelChampionsMode == NovelChampionsMode.last) {
            // The member with the largest age value.
            IndividualClusterable oldest = null;
            for (IndividualClusterable member : members) {
                if (oldest == null || member.age > oldest.age) {
                    oldest = member;
                }
            }
            elite[slot] = oldest.getIndividual();
        } else if (novelChampionsMode == NovelChampionsMode.centroid) {
            // The member closest to the cluster center.
            DistanceMeasure measure = clusterer.getDistanceMeasure();
            double[] center = cluster.getCenter().getPoint();
            IndividualClusterable nearest = null;
            double nearestDist = Double.MAX_VALUE;
            for (IndividualClusterable member : members) {
                double d = measure.compute(center, member.getPoint());
                if (d < nearestDist) {
                    nearestDist = d;
                    nearest = member;
                }
            }
            elite[slot] = nearest.getIndividual();
        } else if (novelChampionsMode == NovelChampionsMode.best) {
            // The member with the highest fitness.
            IndividualClusterable fittest = null;
            float topFitness = Float.NEGATIVE_INFINITY;
            for (IndividualClusterable member : members) {
                if (member.getFitness() > topFitness) {
                    fittest = member;
                    topFitness = member.getFitness();
                }
            }
            elite[slot] = fittest.getIndividual();
        }
        slot++;
    }
    return elite;
}

From source file:edu.nyu.vida.data_polygamy.ctdata.TopologicalIndex.java

/**
 * Derives a threshold from the weights of the given features by splitting them into two
 * K-means++ clusters and returning the smallest weight of the cluster that contains the
 * largest weight.
 *
 * <p>With fewer than two features, 0.4 times the weight of the first feature is returned;
 * an empty array therefore fails with an {@code ArrayIndexOutOfBoundsException}.
 *
 * @param f the features whose {@code wt} values are clustered
 * @return the threshold as described above
 */
public double getThreshold(Feature[] f) {
    if (f.length < 2) {
        return f[0].wt * 0.4;
    }

    // One-dimensional points, one per feature weight.
    ArrayList<DoublePoint> weights = new ArrayList<DoublePoint>();
    for (Feature feature : f) {
        weights.add(new DoublePoint(new double[] { feature.wt }));
    }

    KMeansPlusPlusClusterer<DoublePoint> kmeans = new KMeansPlusPlusClusterer<DoublePoint>(2, 1000);
    List<CentroidCluster<DoublePoint>> clusters = kmeans.cluster(weights);

    // Track the cluster with the largest maximum and remember that cluster's minimum.
    double bestMax = 0;
    double bestMin = 0;
    for (CentroidCluster<DoublePoint> cluster : clusters) {
        double clusterMax = 0;
        double clusterMin = Double.MAX_VALUE;
        for (DoublePoint member : cluster.getPoints()) {
            double value = member.getPoint()[0];
            clusterMax = Math.max(clusterMax, value);
            clusterMin = Math.min(clusterMin, value);
        }
        if (clusterMax > bestMax) {
            bestMax = clusterMax;
            bestMin = clusterMin;
        }
    }

    if (clusters.size() > 2) {
        Utilities.er("Can there be > 2 clusters?");
    }
    return bestMin;
}

From source file:net.semanticmetadata.lire.imageanalysis.bovw.LocalFeatureHistogramBuilderKmeansPlusPlus.java

/**
 * Uses an existing index, where each and every document should have a set of local features. A number of
 * random images (numDocsForVocabulary) is selected and clustered to get a vocabulary of visual words
 * (the cluster means). For all images a histogram on the visual words is created and added to the documents.
 * Pre-existing histograms are deleted, so this method can be used for re-indexing.
 *
 * @throws java.io.IOException if reading the index or committing/closing the index writer fails;
 *         an {@code IOException} while processing a single document is printed and that
 *         document is skipped
 * @throws UnsupportedOperationException if fewer local features than {@code numClusters} were found
 */
public void index() throws IOException {
    df.setMaximumFractionDigits(3);
    // find the documents for building the vocabulary:
    HashSet<Integer> docIDs = selectVocabularyDocs();
    System.out.println("Using " + docIDs.size() + " documents to build the vocabulary.");
    KMeansPlusPlusClusterer<DoublePoint> kpp = new KMeansPlusPlusClusterer<DoublePoint>(numClusters, 15);
    // fill the KMeans object:
    LinkedList<DoublePoint> features = new LinkedList<DoublePoint>();
    // Needed for check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int nextDoc : docIDs) {
        if (reader.hasDeletions() && !liveDocs.get(nextDoc))
            continue; // if it is deleted, just ignore it.
        Document d = reader.document(nextDoc);
        IndexableField[] fields = d.getFields(localFeatureFieldName);
        for (int j = 0; j < fields.length; j++) {
            LireFeature f = getFeatureInstance();
            f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                    fields[j].binaryValue().length);
            // Store a private copy of the histogram so the clustered point cannot share
            // its array with the feature object.
            double[] feat = f.getDoubleHistogram().clone();
            features.add(new DoublePoint(feat));
        }
    }
    if (features.size() < numClusters) {
        // this cannot work. You need more data points than clusters.
        throw new UnsupportedOperationException("Only " + features.size() + " features found to cluster in "
                + numClusters + ". Try to use less clusters or more images.");
    }
    // do the clustering:
    System.out.println("Number of local features: " + df.format(features.size()));
    System.out.println("Starting clustering ...");
    List<CentroidCluster<DoublePoint>> clusterList = kpp.cluster(features);
    // TODO: Serializing clusters to a file on the disk ...
    System.out.println("Clustering finished, " + clusterList.size() + " clusters found");
    // The vocabulary consists of the cluster centers.
    clusters = new LinkedList<double[]>();
    for (CentroidCluster<DoublePoint> centroidCluster : clusterList) {
        clusters.add(centroidCluster.getCenter().getPoint());
    }
    System.out.println("Creating histograms ...");
    int[] tmpHist = new int[numClusters];
    IndexWriter iw = LuceneUtils.createIndexWriter(((DirectoryReader) reader).directory(), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer, 256d);

    // A single feature instance is reused for all documents in this pass.
    LireFeature f = getFeatureInstance();
    for (int i = 0; i < reader.maxDoc(); i++) {
        try {
            if (reader.hasDeletions() && !liveDocs.get(i))
                continue;
            // reset the histogram for this document:
            for (int j = 0; j < tmpHist.length; j++) {
                tmpHist[j] = 0;
            }
            Document d = reader.document(i);
            IndexableField[] fields = d.getFields(localFeatureFieldName);
            // remove the fields if they are already there ...
            d.removeField(visualWordsFieldName);
            d.removeField(localFeatureHistFieldName);

            // find the appropriate cluster for each feature:
            for (int j = 0; j < fields.length; j++) {
                f.setByteArrayRepresentation(fields[j].binaryValue().bytes, fields[j].binaryValue().offset,
                        fields[j].binaryValue().length);
                tmpHist[clusterForFeature(f, clusters)]++;
            }
            // The normalized histogram is stored first; tmpHist is then quantized
            // before the visual word string is built from it.
            d.add(new StoredField(localFeatureHistFieldName,
                    SerializationUtils.toByteArray(normalize(tmpHist))));
            quantize(tmpHist);
            d.add(new TextField(visualWordsFieldName, arrayToVisualWordString(tmpHist), Field.Store.YES));

            // remove local features to save some space if requested:
            if (DELETE_LOCAL_FEATURES) {
                d.removeFields(localFeatureFieldName);
            }
            // now write the new one. we use the identifier to update ;)
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), d);
        } catch (IOException e) {
            // Best effort: a failing document is reported and skipped.
            e.printStackTrace();
        }
    }

    iw.commit();
    // this one does the "old" commit(), it removes the deleted local features.
    iw.forceMerge(1);
    iw.close();
    System.out.println("Finished.");
}