Example usage for org.apache.mahout.clustering ClusteringUtils daviesBouldinIndex

List of usage examples for org.apache.mahout.clustering ClusteringUtils daviesBouldinIndex

Introduction

On this page you can find example usage of the org.apache.mahout.clustering ClusteringUtils daviesBouldinIndex method.

Prototype

public static double daviesBouldinIndex(List<? extends Vector> centroids, DistanceMeasure distanceMeasure,
        List<OnlineSummarizer> clusterDistanceSummaries) 

Source Link

Document

Computes the Davies-Bouldin Index for a given clustering.

Usage

From source file: org.conan.mymahout.clustering.streaming.tools.ClusterQualitySummarizer.java

License: Apache License

/**
 * Summarizes the quality of one clustering, or of two clusterings side by side.
 *
 * <p>After parsing the arguments, the method opens {@code outputFile}, writes a CSV header to it,
 * loads the centroids (and the comparison centroids when {@code centroidCompareFile} is set),
 * and hands the per-cluster distance summaries of the training set (and of the test set when
 * {@code testFile} is set) to {@code printSummaries}. It then prints the Dunn index and the
 * Davies-Bouldin index of each clustering to standard output.
 *
 * @param args command-line arguments, forwarded to {@code parseArgs}
 * @return 0 on success; -1 if the arguments could not be parsed or an I/O error occurred
 * @throws IOException if closing the output writer fails
 */
public int run(String[] args) throws IOException {
    if (!parseArgs(args)) {
        return -1;
    }

    Configuration conf = new Configuration();
    try {
        fileOut = new PrintWriter(new FileOutputStream(outputFile));
        fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
                + "distance.q4,count,is.train\n");

        // Reading in the centroids (both pairs, if they exist).
        List<Centroid> centroids;
        List<Centroid> centroidsCompare = null;
        if (mahoutKMeansFormat) {
            SequenceFileDirValueIterable<ClusterWritable> clusterIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
        } else {
            SequenceFileDirValueIterable<CentroidWritable> centroidIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
        }

        if (centroidCompareFile != null) {
            if (mahoutKMeansFormatCompare) {
                SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists
                        .newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
            } else {
                SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists.newArrayList(
                        IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
            }
        }

        // Reading in the "training" set.
        SequenceFileDirValueIterable<VectorWritable> trainIterable = new SequenceFileDirValueIterable<VectorWritable>(
                new Path(trainFile), PathType.GLOB, conf);
        Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
        // The quality metrics below are computed over every datapoint read: train only, or
        // train + test when a test file is supplied.
        Iterable<Vector> datapoints = trainDatapoints;

        printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
                new SquaredEuclideanDistanceMeasure()), "train");

        // Also adding in the "test" set.
        if (testFile != null) {
            SequenceFileDirValueIterable<VectorWritable> testIterable = new SequenceFileDirValueIterable<VectorWritable>(
                    new Path(testFile), PathType.GLOB, conf);
            Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);

            printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
                    new SquaredEuclideanDistanceMeasure()), "test");

            datapoints = Iterables.concat(trainDatapoints, testDatapoints);
        }

        // At this point, all train/test CSVs have been written. We now compute quality metrics.
        List<OnlineSummarizer> summaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroids,
                distanceMeasure);
        List<OnlineSummarizer> compareSummaries = null;
        if (centroidsCompare != null) {
            compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare,
                    distanceMeasure);
        }

        // Each index is printed on one line: "First" always, "Second" only when a comparison
        // clustering was loaded.
        System.out.printf("[Dunn Index] First: %f",
                ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
        System.out.printf("[Davies-Bouldin Index] First: %f",
                ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
        // Signal the failure to the caller; falling through would report success (0)
        // even though the summary could not be produced.
        return -1;
    } finally {
        // Runs on both the success and the failure path, so the writer is always released.
        Closeables.close(fileOut, false);
    }
    return 0;
}