List of usage examples for org.apache.mahout.utils.clustering ClusterDumper printClusters
public void printClusters(String[] dictionary) throws Exception
From source file:chapter5.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the * number of clusters k and iteration parameters. All output data will be * written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects a file containing * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". * /*from w w w. ja va 2 s .c o m*/ * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param k * the number of clusters in Kmeans * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, maxIterations, true, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:chapter5.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance * measure, t1, t2 and iteration parameters. All output data will be written * to the output directory, which will be initially deleted if it exists. The * clustered points will reside in the path <output>/clustered-points. By * default, the job expects the a file containing synthetic_control.data as * obtained from// www. jav a 2 s. c o m * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output". * * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta, maxIterations, true, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java
License:Apache License
/** * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 parameters. All * output data will be written to the output directory, which will be initially deleted if it exists. The clustered * points will reside in the path <output>/clustered-points. By default, the job expects the a file containing * synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named * "testdata", and writes output to a directory named "output". * /* w w w . ja va 2 s .c o m*/ * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold */ private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the * number of clusters k and iteration parameters. All output data will be * written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects a file containing * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". * * @param conf the Configuration to use//w ww . j a v a2s . c om * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param k the number of clusters in Kmeans * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, "random-seeds"); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); System.out.println("****************************************************************************"); log.info("Running KMeans with k = {}", k); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper Path outGlob = new Path(output, "clusters-*-final"); Path clusteredPoints = new Path(output, "clusteredPoints"); log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints); ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints); clusterDumper.printClusters(null); FileSystem fs = FileSystem.get(conf); SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf); IntWritable key = new IntWritable(); WeightedVectorWritable value = new WeightedVectorWritable(); while (reader.next(key, value)) { System.out.println(value.toString() + " belongs to cluster " + key.toString()); } reader.close(); }
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given * distance measure, t1, t2 and iteration parameters. All output data will * be written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects the a file * containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output".//from www.ja v a2 s . c om * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, measure, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:com.nm.documentClustering.example.ClusterDumpjob.java
License:Apache License
public static void main(String[] args) throws Exception { String output = "target/output"; ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); // if(true){throw new RuntimeException("this");} }
From source file:com.nm.documentClustering.example.KMeansJob.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a * directory named "output"./* w w w. j a v a 2s. c o m*/ * * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param k * the number of clusters in Kmeans * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, "random-seeds"); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans with k = {}", k); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper Path outGlob = new Path(output, "clusters-*-final"); Path clusteredPoints = new Path(output, "clusteredPoints"); log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints); ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints); clusterDumper.printClusters(null); }
From source file:com.nm.documentClustering.example.KMeansJob.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file * containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata", * and writes output to a directory named "output". * //from w w w. ja v a2s . c om * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:hbaseworkshop.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the * number of clusters k and iteration parameters. All output data will be * written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects a file containing * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param k the number of clusters in Kmeans * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */// ww w. jav a 2 s. c o m public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans"); // KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, // measure, convergenceDelta, maxIterations, true, false); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, maxIterations, true, 0.5, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:hbaseworkshop.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance * measure, t1, t2 and iteration parameters. All output data will be written * to the output directory, which will be initially deleted if it exists. The * clustered points will reside in the path <output>/clustered-points. By * default, the job expects the a file containing synthetic_control.data as * obtained from//from w w w. j ava 2 s.c om * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output". * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, 0.5, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta, maxIterations, true, 0.5, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }