Example usage for org.apache.mahout.clustering ClusteringUtils summarizeClusterDistances

List of usage examples for org.apache.mahout.clustering ClusteringUtils summarizeClusterDistances

Introduction

In this page you can find the example usage for org.apache.mahout.clustering ClusteringUtils summarizeClusterDistances.

Prototype

public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
        Iterable<? extends Vector> centroids, DistanceMeasure distanceMeasure) 

Source Link

Document

Computes the summaries for the distances in each cluster.

Usage

From source file:org.conan.mymahout.clustering.streaming.tools.ClusterQualitySummarizer.java

License:Apache License

public int run(String[] args) throws IOException {
    if (!parseArgs(args)) {
        return -1;
    }// www  .  j  a  v  a2  s  .c o m

    Configuration conf = new Configuration();
    try {
        //      Configuration.dumpConfiguration(conf, new OutputStreamWriter(System.out));

        fileOut = new PrintWriter(new FileOutputStream(outputFile));
        fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
                + "distance.q4,count,is.train\n");

        // Reading in the centroids (both pairs, if they exist).
        List<Centroid> centroids;
        List<Centroid> centroidsCompare = null;
        if (mahoutKMeansFormat) {
            SequenceFileDirValueIterable<ClusterWritable> clusterIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
        } else {
            SequenceFileDirValueIterable<CentroidWritable> centroidIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                    new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
        }

        if (centroidCompareFile != null) {
            if (mahoutKMeansFormatCompare) {
                SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable = new SequenceFileDirValueIterable<ClusterWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists
                        .newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
            } else {
                SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable = new SequenceFileDirValueIterable<CentroidWritable>(
                        new Path(centroidCompareFile), PathType.GLOB, conf);
                centroidsCompare = Lists.newArrayList(
                        IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
            }
        }

        // Reading in the "training" set.
        SequenceFileDirValueIterable<VectorWritable> trainIterable = new SequenceFileDirValueIterable<VectorWritable>(
                new Path(trainFile), PathType.GLOB, conf);
        Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
        Iterable<Vector> datapoints = trainDatapoints;

        printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
                new SquaredEuclideanDistanceMeasure()), "train");

        // Also adding in the "test" set.
        if (testFile != null) {
            SequenceFileDirValueIterable<VectorWritable> testIterable = new SequenceFileDirValueIterable<VectorWritable>(
                    new Path(testFile), PathType.GLOB, conf);
            Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);

            printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
                    new SquaredEuclideanDistanceMeasure()), "test");

            datapoints = Iterables.concat(trainDatapoints, testDatapoints);
        }

        // At this point, all train/test CSVs have been written. We now compute quality metrics.
        List<OnlineSummarizer> summaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroids,
                distanceMeasure);
        List<OnlineSummarizer> compareSummaries = null;
        if (centroidsCompare != null) {
            compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare,
                    distanceMeasure);
        }
        System.out.printf("[Dunn Index] First: %f",
                ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
        System.out.printf("[Davies-Bouldin Index] First: %f",
                ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        Closeables.close(fileOut, false);
    }
    return 0;
}