List of usage examples for org.apache.mahout.clustering.kmeans.RandomSeedGenerator.buildRandom
public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure) throws IOException
From source file:DisplayKMeans.java
License:Apache License
private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output, DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta) throws IOException, InterruptedException, ClassNotFoundException { Path clustersIn = new Path(output, "random-seeds"); RandomSeedGenerator.buildRandom(conf, samples, clustersIn, numClusters, measure); KMeansDriver.run(samples, clustersIn, output, convergenceDelta, maxIterations, true, 0.0, true); loadClustersWritable(output);//from ww w.j a v a2 s .c om }
From source file:DisplayFuzzyKMeans.java
License:Apache License
private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output, DistanceMeasure measure, int maxIterations, float m, double threshold) throws IOException, ClassNotFoundException, InterruptedException { Path clustersIn = new Path(output, "random-seeds"); RandomSeedGenerator.buildRandom(conf, samples, clustersIn, 3, measure); FuzzyKMeansDriver.run(samples, clustersIn, output, threshold, maxIterations, m, true, true, threshold, true);// ww w . j a v a 2s. c o m loadClustersWritable(output); }
From source file:chapter5.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the * number of clusters k and iteration parameters. All output data will be * written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects a file containing * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". * /*from w w w .j a v a 2 s . c om*/ * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param k * the number of clusters in Kmeans * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans"); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, maxIterations, true, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); clusterDumper.printClusters(null); }
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the * number of clusters k and iteration parameters. All output data will be * written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects a file containing * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". * * @param conf the Configuration to use//from w ww . j a v a 2s. com * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param k the number of clusters in Kmeans * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, "random-seeds"); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); System.out.println("****************************************************************************"); log.info("Running KMeans with k = {}", k); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper Path outGlob = new Path(output, "clusters-*-final"); Path clusteredPoints = new Path(output, "clusteredPoints"); log.info("Dumping out clusters from clusters: {} and 
clusteredPoints: {}", outGlob, clusteredPoints); ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints); clusterDumper.printClusters(null); FileSystem fs = FileSystem.get(conf); SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"), conf); IntWritable key = new IntWritable(); WeightedVectorWritable value = new WeightedVectorWritable(); while (reader.next(key, value)) { System.out.println(value.toString() + " belongs to cluster " + key.toString()); } reader.close(); }
From source file:com.nm.documentClustering.example.KMeansJob.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration * parameters. All output data will be written to the output directory, which will be initially deleted if it exists. * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a * directory named "output"./*from w w w . ja v a 2 s .co m*/ * * @param conf * the Configuration to use * @param input * the String denoting the input directory path * @param output * the String denoting the output directory path * @param measure * the DistanceMeasure to use * @param k * the number of clusters in Kmeans * @param convergenceDelta * the double convergence criteria for iterations * @param maxIterations * the int maximum number of iterations */ public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, "random-seeds"); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans with k = {}", k); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta, maxIterations, true, 0.0, false); // run ClusterDumper Path outGlob = new Path(output, "clusters-*-final"); Path clusteredPoints = new Path(output, "clusteredPoints"); log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints); ClusterDumper clusterDumper = new ClusterDumper(outGlob, 
clusteredPoints); clusterDumper.printClusters(null); }
From source file:com.queirozf.clustering.MahoutKMeans.java
License:Apache License
/**
 * Runs k-means with Euclidean distance over the given input.
 *
 * <p>Expected arguments: {@code args[0]} input path, {@code args[1]} output path,
 * {@code args[2]} number of clusters {@code k}. Random seeds are written under
 * {@code <output>/data/clusters}; the convergence value is fixed at 0.001 and the iteration
 * limit at 10000.
 *
 * @param args command-line arguments as described above; extra arguments are ignored
 * @return always 0 when the run completes without throwing
 * @throws IllegalArgumentException if fewer than three arguments are supplied or {@code k} is not
 *     an integer
 * @throws Exception if seeding or clustering fails
 */
@Override
public int run(String[] args) throws Exception {
  // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
  if (args == null || args.length < 3) {
    throw new IllegalArgumentException("Expected arguments: <input path> <output path> <k>, got "
        + (args == null ? 0 : args.length));
  }

  Path in = new Path(args[0]);
  Path out = new Path(args[1]);
  int k = Integer.parseInt(args[2]);

  double epsilon = 0.001;
  int maxIterations = 10000;

  Configuration conf = this.getConf();
  DistanceMeasure measure = new EuclideanDistanceMeasure();

  Path centroids = RandomSeedGenerator.buildRandom(conf, in, new Path(out, "data/clusters"), k,
      measure);
  KMeansDriver.run(conf, in, centroids, out, epsilon, maxIterations, true, 0.0, false);

  return 0;
}
From source file:hbaseworkshop.KMeanSample.java
License:Apache License
/** * Run the kmeans clustering job on an input dataset using the given the * number of clusters k and iteration parameters. All output data will be * written to the output directory, which will be initially deleted if it * exists. The clustered points will reside in the path * <output>/clustered-points. By default, the job expects a file containing * equal length space delimited data that resides in a directory named * "testdata", and writes output to a directory named "output". * * @param conf the Configuration to use * @param input the String denoting the input directory path * @param output the String denoting the output directory path * @param measure the DistanceMeasure to use * @param k the number of clusters in Kmeans * @param convergenceDelta the double convergence criteria for iterations * @param maxIterations the int maximum number of iterations *///from www. ja va 2 s. c o m public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k, double convergenceDelta, int maxIterations) throws Exception { Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT); log.info("Preparing Input"); InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector"); log.info("Running random seed to get initial clusters"); Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR); clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure); log.info("Running KMeans"); // KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, // measure, convergenceDelta, maxIterations, true, false); KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta, maxIterations, true, 0.5, false); // run ClusterDumper ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output, "clusteredPoints")); 
clusterDumper.printClusters(null); }
From source file:hk.newsRecommender.MatrixAndCluster.java
License:Open Source License
@SuppressWarnings("deprecation") public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String hdfsUrl = conf.get("fs.defaultFS"); // part1--------------------------------------------------------------- // Job job0 = Job.getInstance(conf, "siftKeywordsDimension"); // Path output1Path=new Path(hdfsUrl + "/data/recommend/matrix1"); // HadoopUtil.delete(conf, output1Path); // job0.setJarByClass(TFIDF.class); // job0.setMapperClass(Mapper_Part1.class); // job0.setReducerClass(Reduce_Part1.class); // job0.setMapOutputKeyClass(Text.class); // job0.setMapOutputValueClass(Text.class); // job0.setOutputKeyClass(Text.class); // job0.setOutputValueClass(Text.class); // job0.setPartitionerClass(CustomPartitioner.class); // FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/tfidf3")); // FileOutputFormat.setOutputPath(job0, output1Path); // job0.waitForCompletion(true); // part2--------------------------------------------------------------- // FileSystem fsopen = FileSystem.get(conf); // FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000")); // Scanner scan = new Scanner(in); // List<String> keywordList=new ArrayList<String>(); // while (scan.hasNext()) { // keywordList.add(scan.next()); // }//from ww w . jav a2s . 
c o m //// must before job // conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()])); // Job job1 = Job.getInstance(conf, "generateMatrix"); // Path output2Path=new Path(hdfsUrl + "/data/recommend/matrix2"); // HadoopUtil.delete(conf, output2Path); // job1.setJarByClass(TFIDF.class); // job1.setMapperClass(Mapper_Part2.class); // job1.setReducerClass(Reduce_Part2.class); // job1.setMapOutputKeyClass(Text.class); // job1.setMapOutputValueClass(Text.class); // job1.setOutputKeyClass(Text.class); // job1.setOutputValueClass(NullWritable.class); //// job1.addCacheFile(new Path("/data/recommend/matrix1/part-r-00000").toUri()); // FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf3")); // FileOutputFormat.setOutputPath(job1, output2Path); // job1.waitForCompletion(true); // part3-------------------??-------------------------------------------- Path output3Path = new Path(hdfsUrl + "/data/recommend/cluster2"); HadoopUtil.delete(conf, output3Path); EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure(); Path clusterInput = new Path(hdfsUrl + "/data/recommend/matrix2"); Path clusterSeqInput = new Path(hdfsUrl + "/data/recommend/cluster1"); Path clusterOutput = new Path(hdfsUrl + "/data/recommend/cluster2"); int k = 10; int maxIter = 3; // ?mahout??? // InputDriver.runJob(clusterInput, clusterSeqInput, "org.apache.mahout.math.RandomAccessSparseVector"); // ?k Path clusters = RandomSeedGenerator.buildRandom(conf, clusterSeqInput, new Path(clusterOutput, "clusters-0"), k, measure); KMeansDriver.run(conf, clusterSeqInput, clusters, clusterOutput, 0.01, maxIter, true, 0.0, false); // ClusterDumper printClusters ??? 
ClusterDumper clusterDumper = new ClusterDumper(new Path(clusterOutput, "clusters-" + (maxIter - 1)), new Path(clusterOutput, "clusteredPoints")); clusterDumper.printClusters(null); clusterOutput(conf, new Path(hdfsUrl + "/data/recommend/cluster2/clusteredPoints/part-m-00000")); // clusterOutput2(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster2/clusteredPoints/part-m-00000")); // matrix2Vector(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster1/part-m-00000"));// }
From source file:io.github.thushear.display.DisplayFuzzyKMeans.java
License:Apache License
private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output, DistanceMeasure measure, int maxIterations) throws IOException, ClassNotFoundException, InterruptedException { Path clusters = RandomSeedGenerator.buildRandom(conf, samples, new Path(output, "clusters-0"), 3, measure); double threshold = 0.001; int m = 3;//w ww . java2 s. c om FuzzyKMeansDriver.run(samples, clusters, output, measure, threshold, maxIterations, m, true, true, threshold, true); loadClusters(output); }
From source file:io.github.thushear.display.DisplayKMeans.java
License:Apache License
/**
 * Seeds and runs k-means over the sample points, then loads the resulting clusters.
 *
 * <p>Three initial clusters are generated under {@code <output>/clusters-0}; the path returned
 * by the seed generator is handed to the k-means driver together with a fixed value of 0.001.
 *
 * @param conf the Hadoop configuration passed to the seed generator
 * @param samples path of the input sample vectors
 * @param output directory that receives the seeds and the clustering output
 * @param measure distance measure handed to the seed generator and the driver
 * @param maxIterations maximum number of iterations passed to the driver
 * @throws IOException declared by the seed generator / driver calls
 * @throws InterruptedException declared by the driver call
 * @throws ClassNotFoundException declared by the driver call
 */
private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
    DistanceMeasure measure, int maxIterations)
    throws IOException, InterruptedException, ClassNotFoundException {
  final double delta = 0.001;
  final Path seedDir = new Path(output, "clusters-0");
  final Path initialClusters = RandomSeedGenerator.buildRandom(conf, samples, seedDir, 3, measure);
  KMeansDriver.run(samples, initialClusters, output, measure, delta, maxIterations, true, true);
  loadClusters(output);
}