List of usage examples for org.apache.mahout.clustering.canopy.CanopyDriver#run
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, boolean runClustering, double clusterClassificationThreshold, boolean runSequential) throws IOException, InterruptedException, ClassNotFoundException
From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java
License:Apache License
/** * Run the canopy clustering job on an input dataset using the given distance measure, t1 and t2 parameters. All * output data will be written to the output directory, which will be initially deleted if it exists. The clustered * points will reside in the path <output>/clustered-points. By default, the job expects the a file containing * synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named * "testdata", and writes output to a directory named "output". * /* ww w . j av a 2s . c o m*/ * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold */ private static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-0-final"), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given * distance measure, t1, t2 and iteration parameters. All output data will * be written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects the a file * containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output".// w ww . j av a 2 s . co m * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, measure, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, 
"clusteredPoints")); clusterDumper.printClusters(null); }
From source file:com.nm.documentClustering.example.KMeansJob.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file * containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata", * and writes output to a directory named "output". * //from w ww. jav a 2s . c o m * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new 
Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:hbaseworkshop.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance * measure, t1, t2 and iteration parameters. All output data will be written * to the output directory, which will be initially deleted if it exists. The * clustered points will reside in the path <output>/clustered-points. By * default, the job expects the a file containing synthetic_control.data as * obtained from/*from w w w .ja va2 s. c om*/ * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series * resides in a directory named "testdata", and writes output to a directory * named "output". * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param t1 the canopy T1 threshold * @param t2 the canopy T2 threshold * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, 0.5, false); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR), output, measure, convergenceDelta, maxIterations, true, 0.5, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:org.conan.mymahout.clustering.syntheticcontrol.fuzzykmeans.Job.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given distance measure, t1, t2 and iteration * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. * The clustered points will reside in the path <output>/clustered-points. By default, the job expects the a file * containing synthetic_control.data as obtained from * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named "testdata", * and writes output to a directory named "output". * //from w w w . j a va2 s. c o m * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param t1 * the canopy T1 threshold * @param t2 * the canopy T2 threshold * @param maxIterations * the int maximum number of iterations * @param fuzziness * the float "m" fuzziness coefficient * @param convergenceDelta * the double convergence criteria for iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2, int maxIterations, float fuzziness, double convergenceDelta) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running Canopy to get initial clusters"); Path canopyOutput = new Path(output, "canopies"); CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, canopyOutput, measure, t1, t2, false, 0.0, false); log.info("Running FuzzyKMeans"); FuzzyKMeansDriver.run(directoryContainingConvertedInput, new Path(canopyOutput, "clusters-0-final"), output, convergenceDelta, maxIterations, fuzziness, true, true, 0.0, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"), new Path(output, 
"clusteredPoints")); clusterDumper.printClusters(null); }