Example usage of org.apache.mahout.clustering.kmeans.RandomSeedGenerator.buildRandom

List of usage examples for org.apache.mahout.clustering.kmeans.RandomSeedGenerator.buildRandom

Introduction

This page collects usage examples of the method org.apache.mahout.clustering.kmeans.RandomSeedGenerator.buildRandom.

Prototype

public static Path buildRandom(Configuration conf, Path input, Path output, int k, DistanceMeasure measure)
            throws IOException 

Source Link

Usage

From source file:DisplayKMeans.java

License:Apache License

/**
 * Seeds and runs k-means over {@code samples}, then loads the produced clusters.
 *
 * <p>Initial centers are generated by {@code RandomSeedGenerator.buildRandom} into
 * {@code <output>/random-seeds}; {@code KMeansDriver.run} is then started from those
 * seeds and, finally, {@code loadClustersWritable(output)} is invoked.
 *
 * @param conf             configuration handed to the seed generator
 * @param samples          path of the input samples
 * @param output           directory that receives the seeds and the k-means output
 * @param measure          distance measure handed to the seed generator
 * @param numClusters      number of random seeds to draw
 * @param maxIterations    iteration limit passed to {@code KMeansDriver.run}
 * @param convergenceDelta convergence value passed to {@code KMeansDriver.run}
 */
private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Path seedDir = new Path(output, "random-seeds");
    RandomSeedGenerator.buildRandom(conf, samples, seedDir, numClusters, measure);

    // NOTE(review): the trailing (true, 0.0, true) are presumably runClustering,
    // classification threshold and runSequential - confirm against the Mahout version in use.
    KMeansDriver.run(samples, seedDir, output, convergenceDelta, maxIterations, true, 0.0, true);

    loadClustersWritable(output);
}

From source file:DisplayFuzzyKMeans.java

License:Apache License

/**
 * Seeds and runs fuzzy k-means over {@code samples}, then loads the produced clusters.
 *
 * <p>Exactly three random seeds are written to {@code <output>/random-seeds} before
 * {@code FuzzyKMeansDriver.run} is started. The same {@code threshold} value is passed
 * to the driver in two argument positions.
 *
 * @param conf          configuration handed to the seed generator
 * @param samples       path of the input samples
 * @param output        directory that receives the seeds and the clustering output
 * @param measure       distance measure handed to the seed generator
 * @param maxIterations iteration limit passed to the driver
 * @param m             value passed to the driver (presumably the fuzziness factor - confirm)
 * @param threshold     value passed to the driver twice (presumably convergence delta and
 *                      emit threshold - confirm)
 */
private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int maxIterations, float m, double threshold)
        throws IOException, ClassNotFoundException, InterruptedException {
    final Path seedDir = new Path(output, "random-seeds");
    RandomSeedGenerator.buildRandom(conf, samples, seedDir, 3, measure);

    FuzzyKMeansDriver.run(samples, seedDir, output,
            threshold, maxIterations, m,
            true, true, threshold, true);

    loadClustersWritable(output);
}

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given the
 * number of clusters k and iteration parameters. All output data will be
 * written to the output directory, which will be initially deleted if it
 * exists. The clustered points will reside in the path
 * <output>/clustered-points. By default, the job expects a file containing
 * equal length space delimited data that resides in a directory named
 * "testdata", and writes output to a directory named "output".
 * /*from   w  w  w .j  a  v a 2  s .  c om*/
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters in Kmeans
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    log.info("Running KMeans");
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java

License:Apache License

/**
 * Runs the k-means clustering job on an input dataset using the given number of
 * clusters {@code k} and iteration parameters, then prints the cluster assignment of
 * every clustered point.
 *
 * <p>The input is first converted by {@code InputDriver.runJob} into
 * {@code RandomAccessSparseVector}s under
 * {@code <output>/DIRECTORY_CONTAINING_CONVERTED_INPUT}. Random seeds are written
 * under {@code <output>/random-seeds}, k-means is run, the clusters matching
 * {@code <output>/clusters-*-final} are dumped with {@code ClusterDumper}, and the file
 * {@code <output>/Cluster.CLUSTERED_POINTS_DIR/part-m-00000} is read back and printed
 * to standard output.
 *
 * @param conf the Configuration to use
 * @param input the input directory path
 * @param output the output directory path
 * @param measure the DistanceMeasure to use
 * @param k the number of clusters in k-means
 * @param convergenceDelta the double convergence criteria for iterations
 * @param maxIterations the int maximum number of iterations
 * @throws Exception if any of the conversion, seeding, clustering, dumping or
 *         read-back steps fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    // buildRandom returns the path it actually wrote the seeds to; use that from here on.
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    System.out.println("****************************************************************************");

    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, measure, convergenceDelta,
            maxIterations, true, 0.0, false);
    // run ClusterDumper
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);

    // Resolve the points file against the caller-supplied output directory rather than a
    // hard-coded "output/" prefix, so the read-back works for any output location.
    // Only the first part file is read.
    Path pointsFile = new Path(output, Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000");
    // Ask the path for its own file system so a non-default output location still resolves.
    FileSystem fs = pointsFile.getFileSystem(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, pointsFile, conf);
    try {
        IntWritable key = new IntWritable();
        WeightedVectorWritable value = new WeightedVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster " + key.toString());
        }
    } finally {
        // Release the underlying stream even when reading fails part-way through.
        reader.close();
    }
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given the number of clusters k and iteration
 * parameters. All output data will be written to the output directory, which will be initially deleted if it exists.
 * The clustered points will reside in the path <output>/clustered-points. By default, the job expects a file
 * containing equal length space delimited data that resides in a directory named "testdata", and writes output to a
 * directory named "output"./*from w  w  w  .  ja  v  a 2  s  .co m*/
 * 
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters in Kmeans
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running random seed to get initial clusters");
    Path clusters = new Path(output, "random-seeds");
    clusters = RandomSeedGenerator.buildRandom(conf, directoryContainingConvertedInput, clusters, k, measure);
    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, directoryContainingConvertedInput, clusters, output, convergenceDelta, maxIterations,
            true, 0.0, false);
    // run ClusterDumper
    Path outGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", outGlob, clusteredPoints);
    ClusterDumper clusterDumper = new ClusterDumper(outGlob, clusteredPoints);
    clusterDumper.printClusters(null);
}

From source file:com.queirozf.clustering.MahoutKMeans.java

License:Apache License

/**
 * Runs k-means on the vectors found at the given input path.
 *
 * <p>Random initial centroids are written under {@code <out>/data/clusters} using a
 * Euclidean distance measure, then {@code KMeansDriver.run} is started with a fixed
 * convergence value of {@code 0.001} and at most {@code 10000} iterations.
 *
 * @param args {@code args[0]} input path, {@code args[1]} output path,
 *             {@code args[2]} number of clusters {@code k}
 * @return {@code 0} on success, {@code -1} when fewer than three arguments are supplied
 * @throws NumberFormatException if {@code args[2]} is not a valid integer
 * @throws Exception if seeding or clustering fails
 */
@Override
public int run(String[] args) throws Exception {

    // Fail with a readable usage message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 3) {
        System.err.println("Usage: <input path> <output path> <k>");
        return -1;
    }

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    int k = Integer.parseInt(args[2]);

    double epsilon = 0.001;
    int maxIterations = 10000;

    Configuration conf = this.getConf();

    DistanceMeasure measure = new EuclideanDistanceMeasure();

    Path centroids = RandomSeedGenerator.buildRandom(conf, in, new Path(out, "data/clusters"), k, measure);

    KMeansDriver.run(conf, in, centroids, out, epsilon, maxIterations, true, 0.0, false);

    return 0;
}

From source file:hbaseworkshop.KMeanSample.java

License:Apache License

/**
 * Converts the input, seeds random initial clusters, runs k-means and prints the
 * final clusters.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>{@code InputDriver.runJob} converts {@code input} into
 *       {@code RandomAccessSparseVector}s under
 *       {@code <output>/DIRECTORY_CONTAINING_CONVERTED_INPUT};</li>
 *   <li>{@code RandomSeedGenerator.buildRandom} writes {@code k} seeds under
 *       {@code <output>/Cluster.INITIAL_CLUSTERS_DIR};</li>
 *   <li>{@code KMeansDriver.run} clusters the converted input;</li>
 *   <li>{@code ClusterDumper} prints the clusters found at
 *       {@code finalClusterPath(conf, output, maxIterations)}, using
 *       {@code <output>/clusteredPoints} as the points directory.</li>
 * </ol>
 *
 * @param conf             the Configuration to use
 * @param input            the input directory path
 * @param output           the output directory path
 * @param measure          the DistanceMeasure to use
 * @param k                the number of clusters in k-means
 * @param convergenceDelta the double convergence criteria for iterations
 * @param maxIterations    the int maximum number of iterations
 * @throws Exception if any step fails
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    final Path vectorDir = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, vectorDir, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running random seed to get initial clusters");
    final Path seedTarget = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    final Path seeds = RandomSeedGenerator.buildRandom(conf, vectorDir, seedTarget, k, measure);

    log.info("Running KMeans");
    // An earlier revision called the overload without the 0.5 argument:
    // KMeansDriver.run(conf, vectorDir, seeds, output, measure, convergenceDelta, maxIterations, true, false).
    // NOTE(review): 0.5 is presumably the cluster classification threshold - confirm.
    KMeansDriver.run(conf, vectorDir, seeds, output, measure,
            convergenceDelta, maxIterations, true, 0.5, false);

    // Print the final clusters.
    final Path finalClusters = finalClusterPath(conf, output, maxIterations);
    final Path pointsDir = new Path(output, "clusteredPoints");
    new ClusterDumper(finalClusters, pointsDir).printClusters(null);
}

From source file:hk.newsRecommender.MatrixAndCluster.java

License:Open Source License

/**
 * Clusters the vectors stored under {@code /data/recommend/cluster1} with k-means
 * (k = 10, at most 3 iterations) and prints the result.
 *
 * <p>All paths are built by prefixing the value of {@code fs.defaultFS} from a fresh
 * {@code Configuration}. The output directory {@code /data/recommend/cluster2} is
 * deleted before the run. Two earlier pipeline stages (keyword sifting and matrix
 * generation) are kept below as disabled code.
 *
 * @param args unused
 * @throws Exception if any Hadoop or Mahout step fails
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    final String recommendRoot = conf.get("fs.defaultFS") + "/data/recommend";

    // Part 1 (disabled): keyword-dimension sifting job, tfidf3 -> matrix1 ----------------
    //      Job job0 = Job.getInstance(conf, "siftKeywordsDimension");
    //      Path output1Path=new Path(hdfsUrl + "/data/recommend/matrix1");
    //      HadoopUtil.delete(conf, output1Path);
    //      job0.setJarByClass(TFIDF.class);
    //      job0.setMapperClass(Mapper_Part1.class);
    //      job0.setReducerClass(Reduce_Part1.class);
    //      job0.setMapOutputKeyClass(Text.class);
    //      job0.setMapOutputValueClass(Text.class);
    //      job0.setOutputKeyClass(Text.class);
    //      job0.setOutputValueClass(Text.class);
    //      job0.setPartitionerClass(CustomPartitioner.class);
    //      FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/tfidf3"));
    //      FileOutputFormat.setOutputPath(job0, output1Path);
    //      job0.waitForCompletion(true);

    // Part 2 (disabled): matrix generation job, tfidf3 -> matrix2 ------------------------
    //      FileSystem fsopen = FileSystem.get(conf);
    //      FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000"));
    //      Scanner scan = new Scanner(in);
    //      List<String> keywordList=new ArrayList<String>();
    //      while (scan.hasNext()) {
    //         keywordList.add(scan.next());
    //      }
    //      // the keywords must be put into conf before the Job is created
    //      conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()]));
    //      Job job1 = Job.getInstance(conf, "generateMatrix");
    //      Path output2Path=new Path(hdfsUrl + "/data/recommend/matrix2");
    //      HadoopUtil.delete(conf, output2Path);
    //      job1.setJarByClass(TFIDF.class);
    //      job1.setMapperClass(Mapper_Part2.class);
    //      job1.setReducerClass(Reduce_Part2.class);
    //      job1.setMapOutputKeyClass(Text.class);
    //      job1.setMapOutputValueClass(Text.class);
    //      job1.setOutputKeyClass(Text.class);
    //      job1.setOutputValueClass(NullWritable.class);
    //      // job1.addCacheFile(new Path("/data/recommend/matrix1/part-r-00000").toUri());
    //      FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf3"));
    //      FileOutputFormat.setOutputPath(job1, output2Path);
    //      job1.waitForCompletion(true);

    // Part 3: clustering (the original heading text was lost to a character-encoding error) --
    final Path resultDir = new Path(recommendRoot + "/cluster2");
    HadoopUtil.delete(conf, resultDir);

    final EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
    // Only referenced by the disabled InputDriver conversion below.
    final Path matrixDir = new Path(recommendRoot + "/matrix2");
    final Path vectorDir = new Path(recommendRoot + "/cluster1");
    final int k = 10;
    final int maxIter = 3;

    // Disabled; the original comment was garbled - presumably "convert to Mahout vector format":
    //      InputDriver.runJob(matrixDir, vectorDir, "org.apache.mahout.math.RandomAccessSparseVector");

    // Original comment garbled - presumably "pick the k initial centers".
    final Path seedTarget = new Path(resultDir, "clusters-0");
    final Path seeds = RandomSeedGenerator.buildRandom(conf, vectorDir, seedTarget, k, measure);
    KMeansDriver.run(conf, vectorDir, seeds, resultDir, 0.01, maxIter, true, 0.0, false);

    // Print the clusters of iteration (maxIter - 1) with ClusterDumper.printClusters.
    final Path lastIterationClusters = new Path(resultDir, "clusters-" + (maxIter - 1));
    final Path pointsDir = new Path(resultDir, "clusteredPoints");
    new ClusterDumper(lastIterationClusters, pointsDir).printClusters(null);

    clusterOutput(conf, new Path(recommendRoot + "/cluster2/clusteredPoints/part-m-00000"));
    //      clusterOutput2(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster2/clusteredPoints/part-m-00000"));
    //      matrix2Vector(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster1/part-m-00000"));

}

From source file:io.github.thushear.display.DisplayFuzzyKMeans.java

License:Apache License

/**
 * Seeds and runs fuzzy k-means over {@code samples}, then loads the produced clusters.
 *
 * <p>Three random seeds are written to {@code <output>/clusters-0}. The driver is
 * called with a fixed threshold of {@code 0.001} (passed in two argument positions)
 * and {@code m = 3}; afterwards {@code loadClusters(output)} is invoked.
 *
 * @param conf          configuration handed to the seed generator
 * @param samples       path of the input samples
 * @param output        directory that receives the seeds and the clustering output
 * @param measure       distance measure used for seeding and clustering
 * @param maxIterations iteration limit passed to the driver
 */
private static void runSequentialFuzzyKClusterer(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int maxIterations)
        throws IOException, ClassNotFoundException, InterruptedException {
    final Path seedTarget = new Path(output, "clusters-0");
    final Path seeds = RandomSeedGenerator.buildRandom(conf, samples, seedTarget, 3, measure);

    final double threshold = 0.001;
    final int m = 3;
    FuzzyKMeansDriver.run(samples, seeds, output, measure,
            threshold, maxIterations, m,
            true, true, threshold, true);

    loadClusters(output);
}

From source file:io.github.thushear.display.DisplayKMeans.java

License:Apache License

/**
 * Seeds and runs k-means over {@code samples}, then loads the produced clusters.
 *
 * <p>Three random seeds are written to {@code <output>/clusters-0}. The driver is
 * called with a fixed distance threshold of {@code 0.001}; afterwards
 * {@code loadClusters(output)} is invoked.
 *
 * @param conf          configuration handed to the seed generator
 * @param samples       path of the input samples
 * @param output        directory that receives the seeds and the clustering output
 * @param measure       distance measure used for seeding and clustering
 * @param maxIterations iteration limit passed to the driver
 */
private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int maxIterations)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Path seedTarget = new Path(output, "clusters-0");
    final Path seeds = RandomSeedGenerator.buildRandom(conf, samples, seedTarget, 3, measure);

    final double distanceThreshold = 0.001;
    KMeansDriver.run(samples, seeds, output, measure,
            distanceThreshold, maxIterations, true, true);

    loadClusters(output);
}