Example usage for org.apache.mahout.utils.clustering ClusterDumper printClusters

Introduction

This page collects example usages of the printClusters method of org.apache.mahout.utils.clustering.ClusterDumper.

Prototype

public void printClusters(String[] dictionary) throws Exception

Source Link

Usage

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given the
 * number of clusters k and iteration parameters. All output data will be
 * written to the output directory, which will be initially deleted if it
 * exists. The clustered points will reside in the path
 * <output>/clustered-points. By default, the job expects a file containing
 * equal length space delimited data that resides in a directory named
 * "testdata", and writes output to a directory named "output".
 * /*from  w  w w. ja va  2  s .c  o  m*/
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters in Kmeans
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    log.info("Running KMeans");
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Runs k-means clustering over an input dataset, using canopy clustering
 * (thresholds {@code t1} and {@code t2}) to produce the initial clusters,
 * and then prints the resulting clusters.
 * <p>
 * The input is converted into
 * {@code org.apache.mahout.math.RandomAccessSparseVector} form under a
 * sub-directory of {@code output}; canopy output is written to
 * {@code output} and k-means is seeded from
 * {@code Cluster.INITIAL_CLUSTERS_DIR} beneath it. The dumped points are read
 * from {@code <output>/clusteredPoints}.
 * <p>
 * NOTE(review): this method does not clear {@code output} itself; the caller
 * presumably deletes any previous output first - confirm.
 *
 * @param conf
 *          the Configuration to use
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the double convergence criterion for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 * @throws Exception
 *           if any of the underlying jobs or the cluster dump fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    final Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    CanopyDriver.run(conf, convertedInput, output, measure, t1, t2, false, false);

    log.info("Running KMeans");
    final Path initialClusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    KMeansDriver.run(conf, convertedInput, initialClusters, output, measure, convergenceDelta,
            maxIterations, true, false);

    // Dump the final clusters; a null dictionary means no term dictionary is supplied.
    final Path finalClusters = finalClusterPath(conf, output, maxIterations);
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}

From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java

License:Apache License

/**
 * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 parameters. All
 * output data will be written to the output directory, which will be initially deleted if it exists. The clustered
 * points will reside in the path <output>/clustered-points. By default, the job expects the a file containing
 * synthetic_control.data as obtained from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named
 * "testdata", and writes output to a directory named "output".
 * /*  w w  w  . ja va 2 s .c  o m*/
 * @param input
 *            the String denoting the input directory path
 * @param output
 *            the String denoting the output directory path
 * @param measure
 *            the DistanceMeasure to use
 * @param t1
 *            the canopy T1 threshold
 * @param t2
 *            the canopy T2 threshold
 */
private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2)
        throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, 0.0,
            false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0-final"),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Runs k-means clustering over an input dataset, seeding it with {@code k}
 * randomly selected initial clusters, prints the resulting clusters and then
 * prints the cluster assignment of every point.
 * <p>
 * The input is converted into
 * {@code org.apache.mahout.math.RandomAccessSparseVector} form under a
 * sub-directory of {@code output}; seed clusters are written under
 * {@code <output>/random-seeds}. After the dump, the point assignments are
 * read back from {@code part-m-00000} inside
 * {@code Cluster.CLUSTERED_POINTS_DIR} under {@code output} and written to
 * standard output. Only that single part file is read.
 * <p>
 * NOTE(review): this method does not clear {@code output} itself; the caller
 * presumably deletes any previous output first - confirm.
 *
 * @param conf the Configuration to use
 * @param input the input directory path
 * @param output the output directory path
 * @param measure the DistanceMeasure to use
 * @param k the number of clusters in k-means
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations the int maximum number of iterations
 * @throws Exception if any of the underlying jobs, the cluster dump or the
 *         read-back of the clustered points fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    // buildRandom returns the path of the seed clusters it produced; use that from here on.
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    System.out.println("****************************************************************************");

    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, 0.0, false);
    // run ClusterDumper; a null dictionary means no term dictionary is supplied
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);

    // Resolve the assignments file against the output directory this run actually used,
    // rather than a literal "output/" relative path.
    Path assignments = new Path(output, Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000");
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, assignments, conf);
    try {
        IntWritable key = new IntWritable();
        WeightedVectorWritable value = new WeightedVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster " + key.toString());
        }
    } finally {
        // Release the reader even if iteration fails part-way through.
        reader.close();
    }
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Runs k-means clustering over an input dataset, using canopy clustering
 * (thresholds {@code t1} and {@code t2}) to produce the initial clusters,
 * and then prints the resulting clusters.
 * <p>
 * The input is converted into
 * {@code org.apache.mahout.math.RandomAccessSparseVector} form under a
 * sub-directory of {@code output}; canopies are written to
 * {@code <output>/canopies} and k-means is seeded from the
 * {@code Cluster.INITIAL_CLUSTERS_DIR + "-final"} directory beneath it. The
 * dump reads {@code <output>/clusters-*-final} and
 * {@code <output>/clusteredPoints}.
 * <p>
 * NOTE(review): this method does not clear {@code output} itself; the caller
 * presumably deletes any previous output first - confirm.
 *
 * @param conf the Configuration to use for the k-means step
 * @param input the input directory path
 * @param output the output directory path
 * @param measure the DistanceMeasure to use
 * @param t1 the canopy T1 threshold
 * @param t2 the canopy T2 threshold
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations the int maximum number of iterations
 * @throws Exception if any of the underlying jobs or the cluster dump fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    final Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    final Path canopyOutput = new Path(output, "canopies");
    // NOTE(review): the canopy step gets a fresh Configuration rather than the caller's
    // conf - confirm this is intentional.
    CanopyDriver.run(new Configuration(), convertedInput, canopyOutput, measure, t1, t2, false, 0.0, false);

    log.info("Running KMeans");
    final Path canopyClusters = new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final");
    KMeansDriver.run(conf, convertedInput, canopyClusters, output, measure, convergenceDelta,
            maxIterations, true, 0.0, false);

    // Dump the final clusters; a null dictionary means no term dictionary is supplied.
    final Path finalClustersGlob = new Path(output, "clusters-*-final");
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClustersGlob, clusteredPoints).printClusters(null);
}

From source file:com.nm.documentClustering.example.ClusterDumpjob.java

License:Apache License

/**
 * Dumps the clusters found under the fixed directory {@code target/output}:
 * the cluster definitions matching {@code clusters-*-final} together with the
 * points in {@code clusteredPoints}.
 *
 * @param args command-line arguments; not used
 * @throws Exception if the cluster dump fails
 */
public static void main(String[] args) throws Exception {
    final String outputDir = "target/output";
    final Path finalClustersGlob = new Path(outputDir, "clusters-*-final");
    final Path clusteredPoints = new Path(outputDir, "clusteredPoints");

    // A null dictionary means no term dictionary is supplied.
    new ClusterDumper(finalClustersGlob, clusteredPoints).printClusters(null);
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
 * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
 * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
 * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
 * directory named "output"./*  w  w  w.  j  a v  a  2s. c  o m*/
 * 
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters in Kmeans
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta, maxIterations,
            true, 0.0, false);
    // run ClusterDumper
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Runs k-means clustering over an input dataset, using canopy clustering (thresholds {@code t1} and {@code t2})
 * to produce the initial clusters, and then prints the resulting clusters.
 * <p>
 * The input is converted into {@code org.apache.mahout.math.RandomAccessSparseVector} form under a sub-directory
 * of {@code output}; canopies are written to {@code <output>/canopies} and k-means is seeded from the
 * {@code Cluster.INITIAL_CLUSTERS_DIR + "-final"} directory beneath it. The dump reads
 * {@code <output>/clusters-*-final} and {@code <output>/clusteredPoints}.
 * <p>
 * NOTE(review): this method does not clear {@code output} itself; the caller presumably deletes any previous
 * output first - confirm.
 *
 * @param conf
 *          the Configuration to use for the k-means step
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure passed to the canopy step
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the double convergence criterion for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 * @throws Exception
 *           if any of the underlying jobs or the cluster dump fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    final Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    final Path canopyOutput = new Path(output, "canopies");
    // NOTE(review): the canopy step gets a fresh Configuration rather than the caller's conf - confirm this is
    // intentional.
    CanopyDriver.run(new Configuration(), convertedInput, canopyOutput, measure, t1, t2, false, 0.0, false);

    log.info("Running KMeans");
    final Path canopyClusters = new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final");
    KMeansDriver.run(conf, convertedInput, canopyClusters, output, convergenceDelta, maxIterations,
            true, 0.0, false);

    // Dump the final clusters; a null dictionary means no term dictionary is supplied.
    final Path finalClustersGlob = new Path(output, "clusters-*-final");
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClustersGlob, clusteredPoints).printClusters(null);
}

From source file:hbaseworkshop.KMeanSample.java

License:Apache License

/**
 * Runs k-means clustering over an input dataset, seeding it with {@code k}
 * randomly selected initial clusters, and then prints the resulting clusters.
 * <p>
 * The input is converted into
 * {@code org.apache.mahout.math.RandomAccessSparseVector} form under a
 * sub-directory of {@code output}; seed clusters are written under
 * {@code Cluster.INITIAL_CLUSTERS_DIR}. The dumped points are read from
 * {@code <output>/clusteredPoints}.
 * <p>
 * NOTE(review): this method does not clear {@code output} itself; the caller
 * presumably deletes any previous output first - confirm.
 *
 * @param conf             the Configuration to use
 * @param input            the input directory path
 * @param output           the output directory path
 * @param measure          the DistanceMeasure to use
 * @param k                the number of clusters in k-means
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations    the int maximum number of iterations
 * @throws Exception       if any of the underlying jobs or the cluster dump fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    final Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, vectorClassName);

    log.info("Running random seed to get initial clusters");
    final Path seedTarget = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    // buildRandom returns the path of the seed clusters it produced.
    final Path seedClusters = RandomSeedGenerator.buildRandom(conf, convertedInput, seedTarget, k, measure);

    log.info("Running KMeans");
    // NOTE(review): 0.5 is presumably the cluster classification threshold - confirm
    // against the Mahout version in use.
    KMeansDriver.run(conf, convertedInput, seedClusters, output, measure, convergenceDelta,
            maxIterations, true, 0.5, false);

    // Dump the final clusters; a null dictionary means no term dictionary is supplied.
    final Path finalClusters = finalClusterPath(conf, output, maxIterations);
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}

From source file:hbaseworkshop.KMeanSample.java

License:Apache License

/**
 * Runs k-means clustering over an input dataset, using canopy clustering
 * (thresholds {@code t1} and {@code t2}) to produce the initial clusters,
 * and then prints the resulting clusters.
 * <p>
 * The input is converted into
 * {@code org.apache.mahout.math.RandomAccessSparseVector} form under a
 * sub-directory of {@code output}; canopy output is written to
 * {@code output} and k-means is seeded from
 * {@code Cluster.INITIAL_CLUSTERS_DIR} beneath it. The dumped points are read
 * from {@code <output>/clusteredPoints}.
 * <p>
 * NOTE(review): this method does not clear {@code output} itself; the caller
 * presumably deletes any previous output first - confirm.
 *
 * @param conf             the Configuration to use
 * @param input            the input directory path
 * @param output           the output directory path
 * @param measure          the DistanceMeasure to use
 * @param t1               the canopy T1 threshold
 * @param t2               the canopy T2 threshold
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations    the int maximum number of iterations
 * @throws Exception       if any of the underlying jobs or the cluster dump fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    final Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    // NOTE(review): 0.5 is presumably the cluster classification threshold - confirm
    // against the Mahout version in use.
    CanopyDriver.run(conf, convertedInput, output, measure, t1, t2, false, 0.5, false);

    log.info("Running KMeans");
    final Path initialClusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    KMeansDriver.run(conf, convertedInput, initialClusters, output, measure, convergenceDelta,
            maxIterations, true, 0.5, false);

    // Dump the final clusters; a null dictionary means no term dictionary is supplied.
    final Path finalClusters = finalClusterPath(conf, output, maxIterations);
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}