Example usage for org.apache.mahout.utils.clustering ClusterDumper printClusters

List of usage examples for org.apache.mahout.utils.clustering ClusterDumper printClusters

Introduction

This page collects example usages of org.apache.mahout.utils.clustering.ClusterDumper.printClusters.

Prototype

public void printClusters(String[] dictionary) throws Exception 

Source Link

Usage

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given the
 * number of clusters k and iteration parameters. All output data will be
 * written to the output directory, which will be initially deleted if it
 * exists. The clustered points will reside in the path
 * <output>/clustered-points. By default, the job expects a file containing
 * equal length space delimited data that resides in a directory named
 * "testdata", and writes output to a directory named "output".
 * /*from  w  w w. ja va  2  s .c  o  m*/
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters in Kmeans
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    log.info("Running KMeans");
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Runs canopy clustering to obtain initial clusters, feeds them to k-means, and prints
 * the resulting clusters.
 *
 * <p>The converted input vectors are written beneath {@code output}; both the canopy
 * job and the k-means job write to {@code output}; the cluster dump reads
 * {@code <output>/clusteredPoints}.
 *
 * <p>NOTE(review): the original documentation describes the expected data as
 * synthetic_control.data from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series with
 * default directories "testdata" and "output", and says the output directory is deleted
 * first. None of that is done in this method; presumably the caller handles it.
 *
 * @param conf
 *          the Configuration to use
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the double convergence criterion for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final Path vectors = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    InputDriver.runJob(input, vectors, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    CanopyDriver.run(conf, vectors, output, measure, t1, t2, false, false);

    log.info("Running KMeans");
    final Path initialClusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    KMeansDriver.run(conf, vectors, initialClusters, output, measure, convergenceDelta, maxIterations, true,
            false);

    // Dump the final clusters; no dictionary is supplied.
    final Path finalClusters = finalClusterPath(conf, output, maxIterations);
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}

From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java

License:Apache License

/**
 * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 parameters. All
 * output data will be written to the output directory, which will be initially deleted if it exists. The clustered
 * points will reside in the path <output>/clustered-points. By default, the job expects the a file containing
 * synthetic_control.data as obtained from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named
 * "testdata", and writes output to a directory named "output".
 * /*  w w  w  . ja va 2 s .c  o m*/
 * @param input
 *            the String denoting the input directory path
 * @param output
 *            the String denoting the output directory path
 * @param measure
 *            the DistanceMeasure to use
 * @param t1
 *            the canopy T1 threshold
 * @param t2
 *            the canopy T2 threshold
 */
private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2)
        throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, 0.0,
            false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0-final"),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Runs the k-means clustering job on an input dataset using {@code k} randomly seeded
 * initial clusters, prints the resulting clusters, and then prints the cluster
 * assignment of every point found in the first clustered-points part file.
 *
 * <p>Steps, in order: the input is converted by {@code InputDriver.runJob} into a
 * directory under {@code output}; {@code RandomSeedGenerator.buildRandom} seeds
 * {@code <output>/random-seeds}; {@code KMeansDriver.run} performs the clustering; a
 * {@code ClusterDumper} prints the clusters matched by {@code <output>/clusters-*-final};
 * finally {@code <output>/<Cluster.CLUSTERED_POINTS_DIR>/part-m-00000} is read and each
 * record is written to standard output.
 *
 * <p>NOTE(review): earlier documentation stated that the output directory is deleted
 * first and that the defaults are a "testdata" input directory and an "output" output
 * directory. Nothing in this method does that; presumably the caller is responsible.
 *
 * @param conf the Configuration to use
 * @param input the input directory path
 * @param output the output directory path
 * @param measure the DistanceMeasure to use
 * @param k the number of clusters in k-means
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations the int maximum number of iterations
 * @throws Exception if any of the underlying jobs, the cluster dump or the read-back fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    System.out.println("****************************************************************************");

    log.info("Running KMeans with k = {}", k);
    // Trailing arguments are presumably runClustering=true, classification threshold 0.0,
    // runSequential=false -- TODO confirm against the Mahout version in use.
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, 0.0, false);
    // run ClusterDumper; null means no term dictionary is supplied
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);

    // Read the points back from the directory the job actually wrote to. The previous
    // hard-coded "output/" prefix only worked when the caller passed that exact path.
    FileSystem fs = FileSystem.get(conf);
    Path pointsFile = new Path(new Path(output, Cluster.CLUSTERED_POINTS_DIR), "part-m-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, pointsFile, conf);
    try {
        IntWritable key = new IntWritable();
        WeightedVectorWritable value = new WeightedVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster " + key.toString());
        }
    } finally {
        // Always release the reader, even if reading a record throws.
        reader.close();
    }
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Runs canopy clustering to obtain initial clusters, feeds them to k-means, and prints
 * the resulting clusters.
 *
 * <p>Canopy output goes to {@code <output>/canopies}; k-means reads its initial clusters
 * from the {@code Cluster.INITIAL_CLUSTERS_DIR + "-final"} directory beneath it and
 * writes to {@code output}. The cluster dump reads {@code <output>/clusters-*-final}
 * and {@code <output>/clusteredPoints}.
 *
 * <p>NOTE(review): the original documentation describes the expected data as
 * synthetic_control.data from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series with
 * default directories "testdata" and "output", and says the output directory is deleted
 * first. None of that is done in this method; presumably the caller handles it.
 *
 * @param conf the Configuration to use
 * @param input the input directory path
 * @param output the output directory path
 * @param measure the DistanceMeasure to use
 * @param t1 the canopy T1 threshold
 * @param t2 the canopy T2 threshold
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final Path vectors = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    InputDriver.runJob(input, vectors, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    final Path canopyDir = new Path(output, "canopies");
    // NOTE(review): the canopy step uses a fresh Configuration rather than conf --
    // confirm this is intentional.
    CanopyDriver.run(new Configuration(), vectors, canopyDir, measure, t1, t2, false, 0.0, false);

    log.info("Running KMeans");
    final Path canopyClusters = new Path(canopyDir, Cluster.INITIAL_CLUSTERS_DIR + "-final");
    KMeansDriver.run(conf, vectors, canopyClusters, output, measure, convergenceDelta, maxIterations, true,
            0.0, false);

    // Dump the final clusters; no dictionary is supplied.
    final Path finalClustersGlob = new Path(output, "clusters-*-final");
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClustersGlob, clusteredPoints).printClusters(null);
}

From source file:com.nm.documentClustering.example.ClusterDumpjob.java

License:Apache License

/**
 * Prints the clusters stored under {@code target/output}.
 *
 * <p>A {@code ClusterDumper} is created for {@code target/output/clusters-*-final} and
 * {@code target/output/clusteredPoints} and asked to print the clusters without a
 * dictionary.
 *
 * @param args command-line arguments; not read by this method
 * @throws Exception if the cluster dump fails
 */
public static void main(String[] args) throws Exception {
    final String outputDir = "target/output";
    final Path finalClustersGlob = new Path(outputDir, "clusters-*-final");
    final Path clusteredPoints = new Path(outputDir, "clusteredPoints");

    new ClusterDumper(finalClustersGlob, clusteredPoints).printClusters(null);
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
 * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
 * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
 * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
 * directory named "output"./*  w  w  w.  j  a v  a  2s. c  o m*/
 * 
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters in Kmeans
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta, maxIterations,
            true, 0.0, false);
    // run ClusterDumper
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Runs canopy clustering to obtain initial clusters, feeds them to k-means, and prints
 * the resulting clusters.
 *
 * <p>Canopy output goes to {@code <output>/canopies}; k-means reads its initial clusters
 * from the {@code Cluster.INITIAL_CLUSTERS_DIR + "-final"} directory beneath it and
 * writes to {@code output}. The cluster dump reads {@code <output>/clusters-*-final}
 * and {@code <output>/clusteredPoints}.
 *
 * <p>NOTE(review): the original documentation describes the expected data as
 * synthetic_control.data from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series with
 * default directories "testdata" and "output", and says the output directory is deleted
 * first. None of that is done in this method; presumably the caller handles it.
 *
 * @param conf
 *          the Configuration to use
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use; passed to the canopy step only
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the double convergence criterion for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final Path vectors = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    InputDriver.runJob(input, vectors, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    final Path canopyDir = new Path(output, "canopies");
    // NOTE(review): the canopy step uses a fresh Configuration rather than conf --
    // confirm this is intentional.
    CanopyDriver.run(new Configuration(), vectors, canopyDir, measure, t1, t2, false, 0.0, false);

    log.info("Running KMeans");
    final Path canopyClusters = new Path(canopyDir, Cluster.INITIAL_CLUSTERS_DIR + "-final");
    KMeansDriver.run(conf, vectors, canopyClusters, output, convergenceDelta, maxIterations, true, 0.0,
            false);

    // Dump the final clusters; no dictionary is supplied.
    final Path finalClustersGlob = new Path(output, "clusters-*-final");
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClustersGlob, clusteredPoints).printClusters(null);
}

From source file:hbaseworkshop.KMeanSample.java

License:Apache License

/**
 * Runs the k-means clustering job on an input dataset using {@code k} randomly seeded
 * initial clusters, then prints the resulting clusters.
 *
 * <p>The converted input vectors are written beneath {@code output}, the random seeds
 * are built at {@code Cluster.INITIAL_CLUSTERS_DIR} under {@code output}, and the
 * cluster dump reads {@code <output>/clusteredPoints}.
 *
 * <p>NOTE(review): the original documentation states that the output directory is
 * deleted first and that the defaults are a "testdata" input directory and an "output"
 * output directory. None of that is done in this method; presumably the caller handles it.
 *
 * @param conf             the Configuration to use
 * @param input            the input directory path
 * @param output           the output directory path
 * @param measure          the DistanceMeasure to use
 * @param k                the number of clusters in k-means
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations    the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    final Path vectors = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    InputDriver.runJob(input, vectors, vectorClassName);

    log.info("Running random seed to get initial clusters");
    final Path seedDir = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    final Path seeds = RandomSeedGenerator.buildRandom(conf, vectors, seedDir, k, measure);

    log.info("Running KMeans");
    // A variant of this call without the 0.5 argument used to sit here commented out.
    // NOTE(review): 0.5 is presumably a cluster classification threshold -- confirm.
    KMeansDriver.run(conf, vectors, seeds, output, measure, convergenceDelta, maxIterations, true, 0.5,
            false);

    // Dump the final clusters; no dictionary is supplied.
    final Path finalClusters = finalClusterPath(conf, output, maxIterations);
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}

From source file:hbaseworkshop.KMeanSample.java

License:Apache License

/**
 * Runs canopy clustering to obtain initial clusters, feeds them to k-means, and prints
 * the resulting clusters.
 *
 * <p>The converted input vectors are written beneath {@code output}; both the canopy
 * job and the k-means job write to {@code output}; the cluster dump reads
 * {@code <output>/clusteredPoints}.
 *
 * <p>NOTE(review): the original documentation describes the expected data as
 * synthetic_control.data from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series with
 * default directories "testdata" and "output", and says the output directory is deleted
 * first. None of that is done in this method; presumably the caller handles it.
 *
 * @param conf             the Configuration to use
 * @param input            the input directory path
 * @param output           the output directory path
 * @param measure          the DistanceMeasure to use
 * @param t1               the canopy T1 threshold
 * @param t2               the canopy T2 threshold
 * @param convergenceDelta the double convergence criterion for iterations
 * @param maxIterations    the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    final Path vectors = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    final String vectorClassName = "org.apache.mahout.math.RandomAccessSparseVector";
    InputDriver.runJob(input, vectors, vectorClassName);

    log.info("Running Canopy to get initial clusters");
    CanopyDriver.run(conf, vectors, output, measure, t1, t2, false, 0.5, false);

    log.info("Running KMeans");
    // NOTE(review): k-means reads its initial clusters from Cluster.INITIAL_CLUSTERS_DIR
    // under output -- confirm that is where CanopyDriver writes for the Mahout version in use.
    final Path initialClusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    KMeansDriver.run(conf, vectors, initialClusters, output, measure, convergenceDelta, maxIterations, true,
            0.5, false);

    // Dump the final clusters; no dictionary is supplied.
    final Path finalClusters = finalClusterPath(conf, output, maxIterations);
    final Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}