List of usage examples for org.apache.mahout.clustering Cluster INITIAL_CLUSTERS_DIR
String INITIAL_CLUSTERS_DIR
To view the source code for org.apache.mahout.clustering Cluster INITIAL_CLUSTERS_DIR, click Source Link.
From source file:DisplayKMeans.java
License:Apache License
private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output, DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException { Collection<Vector> points = Lists.newArrayList(); for (int i = 0; i < numClusters; i++) { points.add(SAMPLE_DATA.get(i).get()); // System.out.println(SAMPLE_DATA.get(i).toString()); }//from w w w. j a v a 2 s. c o m List<Cluster> initialClusters = Lists.newArrayList(); int id = 0; for (Vector point : points) { initialClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(point, id++, measure)); } ClusterClassifier prior = new ClusterClassifier(initialClusters, new KMeansClusteringPolicy(convergenceDelta)); Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); prior.writeToSeqFiles(priorPath); ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations); loadClustersWritable(output); }
From source file:chapter5.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the * number of clusters k and iteration parameters. All output data will be * written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects a file containing * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". * //from ww w. j a v a 2s. c o m * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param k * the number of clusters in Kmeans * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, maxIterations, true, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:chapter5.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance * measure, t1, t2 and iteration parameters. All output data will be written * to the output directory, which will be initially deleted if it exists. The * clustered points will reside in the path <output>/clustered-points. By * default, the job expects the a file containing synthetic_control.data as * obtained from/*from www .ja v a 2 s . c om*/ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output". * * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta, maxIterations, true, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given * distance measure, t1, t2 and iteration parameters. All output data will * be written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects the a file * containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output".// w w w . ja va2 s .c om * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, measure, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, 
"clusteredPoints")); clusterDumper.printClusters(null); }
From source file:com.nm.documentClustering.example.KMeansJob.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file * containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata", * and writes output to a directory named "output". * /*from ww w . java 2s . c o m*/ * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new 
Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:curation.mahout_test.DisplayKMeans.java
License:Apache License
private static void runSequentialKMeansClassifier(Configuration conf, Path samples, Path output, DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException { Collection<Vector> points = Lists.newArrayList(); for (int i = 0; i < numClusters; i++) { points.add(SAMPLE_DATA.get(i).get()); }//from w ww . ja v a 2 s . com List<Cluster> initialClusters = Lists.newArrayList(); int id = 0; for (Vector point : points) { initialClusters.add(new org.apache.mahout.clustering.kmeans.Kluster(point, id++, measure)); } ClusterClassifier prior = new ClusterClassifier(initialClusters, new KMeansClusteringPolicy(convergenceDelta)); Path priorPath = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); prior.writeToSeqFiles(priorPath); ClusterIterator.iterateSeq(conf, samples, priorPath, output, maxIterations); loadClustersWritable(output); }
From source file:hbaseworkshop.KMeanSample.java
License:Apache License
/**
 * Runs k-means on {@code input} using {@code k} randomly chosen seed clusters,
 * then prints the resulting clusters.
 *
 * <p>Steps, in order: the input is converted via {@link InputDriver} into
 * {@code <output>/DIRECTORY_CONTAINING_CONVERTED_INPUT}; {@link RandomSeedGenerator}
 * builds the seed clusters starting from {@code <output>/Cluster.INITIAL_CLUSTERS_DIR};
 * {@link KMeansDriver} is run with {@code output} as its output path; and a
 * {@link ClusterDumper} is created over {@code finalClusterPath(conf, output, maxIterations)}
 * and {@code <output>/clusteredPoints}.
 *
 * @param conf             the Configuration to use
 * @param input            the Path of the input directory
 * @param output           the Path of the output directory
 * @param measure          the DistanceMeasure to use
 * @param k                the number of seed clusters to generate
 * @param convergenceDelta the convergence value passed to the k-means driver
 * @param maxIterations    the maximum number of iterations
 * @throws Exception propagated from any of the invoked jobs
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running random seed to get initial clusters");
    Path seedDir = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    // buildRandom returns the path that is subsequently used as the k-means seed input.
    Path seedClusters = RandomSeedGenerator.buildRandom(conf, convertedInput, seedDir, k, measure);

    log.info("Running KMeans");
    KMeansDriver.run(conf, convertedInput, seedClusters, output, measure, convergenceDelta,
            maxIterations, true, 0.5, false);

    // Print the final clusters together with the points found under "clusteredPoints".
    ClusterDumper dumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    dumper.printClusters(null);
}
From source file:hbaseworkshop.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance * measure, t1, t2 and iteration parameters. All output data will be written * to the output directory, which will be initially deleted if it exists. The * clustered points will reside in the path <output>/clustered-points. By * default, the job expects the a file containing synthetic_control.data as * obtained from//from w w w .j av a 2 s . co m * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output". * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, 0.5, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta, maxIterations, true, 0.5, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }