Example usage for org.apache.mahout.clustering Cluster INITIAL_CLUSTERS_DIR

List of usage examples for org.apache.mahout.clustering Cluster INITIAL_CLUSTERS_DIR

Introduction

On this page you can find example usages of org.apache.mahout.clustering Cluster INITIAL_CLUSTERS_DIR.

Prototype

String INITIAL_CLUSTERS_DIR

To view the source code for org.apache.mahout.clustering Cluster INITIAL_CLUSTERS_DIR, click Source Link.

Usage

From source file:DisplayKMeans.java

License:Apache License

/**
 * Seeds k-means with the first {@code numClusters} entries of {@code SAMPLE_DATA}, writes those
 * seed clusters beneath {@code output}, and then runs the sequential cluster iterator.
 *
 * @param conf             configuration passed to {@code ClusterIterator.iterateSeq}
 * @param samples          path of the input samples handed to the iterator
 * @param output           directory that receives the prior under
 *                         {@code Cluster.INITIAL_CLUSTERS_DIR} and is also passed to the
 *                         iterator as its output path
 * @param measure          distance measure given to every seed {@code Kluster}
 * @param numClusters      number of seed clusters; one is created per leading sample
 * @param maxIterations    iteration limit passed to the iterator
 * @param convergenceDelta value used to construct the {@code KMeansClusteringPolicy}
 * @throws IOException declared by the signature; presumably raised by the sequence-file I/O
 */
private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
        throws IOException {
    // Sample i becomes the center of the seed cluster with id i.
    List<Cluster> seeds = Lists.newArrayList();
    for (int clusterId = 0; clusterId < numClusters; clusterId++) {
        Vector center = SAMPLE_DATA.get(clusterId).get();
        seeds.add(new org.apache.mahout.clustering.kmeans.Kluster(center, clusterId, measure));
    }

    // Persist the seed classifier so the iterator can read it back as its prior.
    KMeansClusteringPolicy policy = new KMeansClusteringPolicy(convergenceDelta);
    ClusterClassifier prior = new ClusterClassifier(seeds, policy);
    Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    prior.writeToSeqFiles(priorPath);

    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
    loadClustersWritable(output);
}

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Runs k-means on {@code input}, starting from {@code k} randomly seeded initial clusters.
 *
 * <p>The steps, in order:
 * <ol>
 *   <li>{@code InputDriver.runJob} converts {@code input} into
 *       {@code <output>/DIRECTORY_CONTAINING_CONVERTED_INPUT}, requesting
 *       {@code org.apache.mahout.math.RandomAccessSparseVector} as the vector class.</li>
 *   <li>{@code RandomSeedGenerator.buildRandom} is asked for {@code k} seeds under
 *       {@code <output>/Cluster.INITIAL_CLUSTERS_DIR}; the path it returns is what k-means
 *       reads.</li>
 *   <li>{@code KMeansDriver.run} clusters the converted input into {@code output}.</li>
 *   <li>A {@code ClusterDumper} built from {@code finalClusterPath(conf, output, maxIterations)}
 *       and {@code <output>/clusteredPoints} prints the clusters.</li>
 * </ol>
 *
 * <p>This method does not delete {@code output} itself; callers presumably clear it beforehand.
 * The original documentation states that, by default, the job expects a file of equal-length,
 * space-delimited data in a directory named "testdata" and writes to a directory named
 * "output"; those defaults are not set in this method.
 *
 * @param conf
 *          the Configuration to use
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters in k-means
 * @param convergenceDelta
 *          the convergence criterion for iterations
 * @param maxIterations
 *          the maximum number of iterations
 * @throws Exception
 *           propagated from any of the Mahout drivers invoked here
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path vectors = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, vectors, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running random seed to get initial clusters");
    Path seedDir = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    // buildRandom returns the path k-means should actually read, so use its result, not seedDir.
    Path initialClusters = RandomSeedGenerator.buildRandom(conf, vectors, seedDir, k, measure);

    log.info("Running KMeans");
    // Trailing flags are presumably runClustering=true, runSequential=false -- confirm against
    // the KMeansDriver API of the Mahout version in use.
    KMeansDriver.run(conf, vectors, initialClusters, output, measure, convergenceDelta, maxIterations,
            true, false);

    // run ClusterDumper
    Path finalClusters = finalClusterPath(conf, output, maxIterations);
    Path clusteredPoints = new Path(output, "clusteredPoints");
    ClusterDumper dumper = new ClusterDumper(finalClusters, clusteredPoints);
    dumper.printClusters(null);
}

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given distance
 * measure, t1, t2 and iteration parameters. All output data will be written
 * to the output directory, which will be initially deleted if it exists. The
 * clustered points will reside in the path <output>/clustered-points. By
 * default, the job expects the a file containing synthetic_control.data as
 * obtained from/*from  www .ja  v a  2  s . c  om*/
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
 * resides in a directory named "testdata", and writes output to a directory
 * named "output".
 * 
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running Canopy to get initial clusters");
    CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, false);
    log.info("Running KMeans");
    KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
            output, measure, convergenceDelta, maxIterations, true, false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Runs canopy clustering into {@code <output>/canopies} and then runs k-means seeded from the
 * canopy result.
 *
 * <p>The steps, in order:
 * <ol>
 *   <li>{@code InputDriver.runJob} converts {@code input} into
 *       {@code <output>/DIRECTORY_CONTAINING_CONVERTED_INPUT}, requesting
 *       {@code org.apache.mahout.math.RandomAccessSparseVector} as the vector class.</li>
 *   <li>{@code CanopyDriver.run} is invoked with thresholds {@code t1} and {@code t2} and
 *       writes to {@code <output>/canopies}.</li>
 *   <li>{@code KMeansDriver.run} reads its initial clusters from the canopy directory named
 *       {@code Cluster.INITIAL_CLUSTERS_DIR + "-final"} and writes to {@code output}.</li>
 *   <li>A {@code ClusterDumper} built from the path pattern {@code <output>/clusters-*-final}
 *       and {@code <output>/clusteredPoints} prints the clusters.</li>
 * </ol>
 *
 * <p>This method does not delete {@code output} itself; callers presumably clear it beforehand.
 * The original documentation states that, by default, the job expects a file containing
 * synthetic_control.data, as obtained from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series, in a directory
 * named "testdata", and writes to a directory named "output"; those defaults are not set in
 * this method.
 *
 * @param conf the Configuration used for the k-means step
 * @param input the input directory path
 * @param output the output directory path
 * @param measure the DistanceMeasure to use
 * @param t1 the canopy T1 threshold
 * @param t2 the canopy T2 threshold
 * @param convergenceDelta the convergence criterion for iterations
 * @param maxIterations the maximum number of iterations
 * @throws Exception propagated from any of the Mahout drivers invoked here
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    Path vectors = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, vectors, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running Canopy to get initial clusters");
    Path canopyOutput = new Path(output, "canopies");
    // NOTE(review): the canopy step runs with a fresh Configuration instead of the caller's
    // conf; whether that isolation is intentional cannot be determined from here -- confirm.
    Configuration canopyConf = new Configuration();
    CanopyDriver.run(canopyConf, vectors, canopyOutput, measure, t1, t2, false, 0.0, false);

    log.info("Running KMeans");
    Path canopyClusters = new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final");
    // Trailing arguments are presumably runClustering=true, classification threshold 0.0,
    // runSequential=false -- confirm against the KMeansDriver API of the Mahout version in use.
    KMeansDriver.run(conf, vectors, canopyClusters, output, measure, convergenceDelta, maxIterations,
            true, 0.0, false);

    // run ClusterDumper
    Path finalClusters = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    ClusterDumper dumper = new ClusterDumper(finalClusters, clusteredPoints);
    dumper.printClusters(null);
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration
 * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
 * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file
 * containing synthetic_control.data as obtained from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata",
 * and writes output to a directory named "output".
 * /*from   ww w  . java  2s .  c  o  m*/
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running Canopy to get initial clusters");
    Path canopyOutput = new Path(output, "canopies");
    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2,
            false, 0.0, false);
    log.info("Running KMeans");
    KMeansDriver.run(conf, directoryContainingConvertedInput,
            new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, convergenceDelta,
            maxIterations, true, 0.0, false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:curation.mahout_test.DisplayKMeans.java

License:Apache License

/**
 * Builds seed clusters from the leading {@code numClusters} entries of {@code SAMPLE_DATA},
 * stores them as the prior under {@code output}, and runs the sequential cluster iterator.
 *
 * @param conf             configuration passed to {@code ClusterIterator.iterateSeq}
 * @param samples          path of the input samples handed to the iterator
 * @param output           directory that receives the prior under
 *                         {@code Cluster.INITIAL_CLUSTERS_DIR} and is also passed to the
 *                         iterator as its output path
 * @param measure          distance measure given to every seed {@code Kluster}
 * @param numClusters      number of leading samples to turn into seed clusters
 * @param maxIterations    iteration limit passed to the iterator
 * @param convergenceDelta value used to construct the {@code KMeansClusteringPolicy}
 * @throws IOException declared by the signature; presumably raised by the sequence-file I/O
 */
private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
        throws IOException {
    // Phase 1: collect the vectors of the first numClusters samples.
    Collection<Vector> centers = Lists.newArrayList();
    int taken = 0;
    while (taken < numClusters) {
        centers.add(SAMPLE_DATA.get(taken).get());
        taken++;
    }

    // Phase 2: wrap each vector in a Kluster, numbering them from 0 in collection order.
    List<Cluster> seedClusters = Lists.newArrayList();
    int nextId = 0;
    for (Vector center : centers) {
        seedClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(center, nextId, measure));
        nextId++;
    }

    // Phase 3: write the prior, iterate, and load the resulting clusters.
    Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    new ClusterClassifier(seedClusters, new KMeansClusteringPolicy(convergenceDelta))
            .writeToSeqFiles(priorPath);
    ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations);
    loadClustersWritable(output);
}

From source file:hbaseworkshop.KMeanSample.java

License:Apache License

/**
 * Runs k-means on {@code input}, starting from {@code k} randomly seeded initial clusters.
 *
 * <p>The steps, in order:
 * <ol>
 *   <li>{@code InputDriver.runJob} converts {@code input} into
 *       {@code <output>/DIRECTORY_CONTAINING_CONVERTED_INPUT}, requesting
 *       {@code org.apache.mahout.math.RandomAccessSparseVector} as the vector class.</li>
 *   <li>{@code RandomSeedGenerator.buildRandom} is asked for {@code k} seeds under
 *       {@code <output>/Cluster.INITIAL_CLUSTERS_DIR}; the path it returns is what k-means
 *       reads.</li>
 *   <li>{@code KMeansDriver.run} clusters the converted input into {@code output}.</li>
 *   <li>A {@code ClusterDumper} built from {@code finalClusterPath(conf, output, maxIterations)}
 *       and {@code <output>/clusteredPoints} prints the clusters.</li>
 * </ol>
 *
 * <p>This method does not delete {@code output} itself; callers presumably clear it beforehand.
 * The original documentation states that, by default, the job expects a file of equal-length,
 * space-delimited data in a directory named "testdata" and writes to a directory named
 * "output"; those defaults are not set in this method.
 *
 * @param conf             the Configuration to use
 * @param input            the input directory path
 * @param output           the output directory path
 * @param measure          the DistanceMeasure to use
 * @param k                the number of clusters in k-means
 * @param convergenceDelta the convergence criterion for iterations
 * @param maxIterations    the maximum number of iterations
 * @throws Exception propagated from any of the Mahout drivers invoked here
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running random seed to get initial clusters");
    Path seedTarget = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    // buildRandom returns the path k-means should actually read, so use its result.
    Path seededClusters = RandomSeedGenerator.buildRandom(conf, convertedInput, seedTarget, k, measure);

    log.info("Running KMeans");
    // Trailing arguments are presumably runClustering=true, classification threshold 0.5,
    // runSequential=false -- confirm against the KMeansDriver API of the Mahout version in use.
    KMeansDriver.run(conf, convertedInput, seededClusters, output, measure, convergenceDelta,
            maxIterations, true, 0.5, false);

    // run ClusterDumper
    Path finalClusters = finalClusterPath(conf, output, maxIterations);
    Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}

From source file:hbaseworkshop.KMeanSample.java

License:Apache License

/**
 * Runs canopy clustering to obtain initial clusters and then runs k-means on {@code input}.
 *
 * <p>The steps, in order:
 * <ol>
 *   <li>{@code InputDriver.runJob} converts {@code input} into
 *       {@code <output>/DIRECTORY_CONTAINING_CONVERTED_INPUT}, requesting
 *       {@code org.apache.mahout.math.RandomAccessSparseVector} as the vector class.</li>
 *   <li>{@code CanopyDriver.run} is invoked with thresholds {@code t1} and {@code t2} and
 *       {@code output} as its output directory.</li>
 *   <li>{@code KMeansDriver.run} reads its initial clusters from
 *       {@code <output>/Cluster.INITIAL_CLUSTERS_DIR} and writes to {@code output}.</li>
 *   <li>A {@code ClusterDumper} built from {@code finalClusterPath(conf, output, maxIterations)}
 *       and {@code <output>/clusteredPoints} prints the clusters.</li>
 * </ol>
 *
 * <p>This method does not delete {@code output} itself; callers presumably clear it beforehand.
 * The original documentation states that, by default, the job expects a file containing
 * synthetic_control.data, as obtained from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series, in a directory
 * named "testdata", and writes to a directory named "output"; those defaults are not set in
 * this method.
 *
 * @param conf             the Configuration to use
 * @param input            the input directory path
 * @param output           the output directory path
 * @param measure          the DistanceMeasure to use
 * @param t1               the canopy T1 threshold
 * @param t2               the canopy T2 threshold
 * @param convergenceDelta the convergence criterion for iterations
 * @param maxIterations    the maximum number of iterations
 * @throws Exception propagated from any of the Mahout drivers invoked here
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running Canopy to get initial clusters");
    // Trailing arguments are presumably runClustering=false, classification threshold 0.5,
    // runSequential=false -- confirm against the CanopyDriver API of the Mahout version in use.
    CanopyDriver.run(conf, convertedInput, output, measure, t1, t2, false, 0.5, false);

    log.info("Running KMeans");
    // NOTE(review): this assumes the canopy step leaves its clusters in
    // <output>/Cluster.INITIAL_CLUSTERS_DIR -- verify for the Mahout version in use.
    Path canopyClusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    KMeansDriver.run(conf, convertedInput, canopyClusters, output, measure, convergenceDelta,
            maxIterations, true, 0.5, false);

    // run ClusterDumper
    Path finalClusters = finalClusterPath(conf, output, maxIterations);
    Path clusteredPoints = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, clusteredPoints).printClusters(null);
}