Example usage for org.apache.mahout.clustering.kmeans.KMeansDriver.run()

List of usage examples for org.apache.mahout.clustering.kmeans.KMeansDriver.run()

Introduction

On this page you can find usage examples for the run method of org.apache.mahout.clustering.kmeans.KMeansDriver.

Prototype

public static void run(Path input, Path clustersIn, Path output, double convergenceDelta, int maxIterations,
        boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
        throws IOException, InterruptedException, ClassNotFoundException 

Source Link

Document

Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to cluster the input vectors.

Usage

From source file:DisplayKMeans.java

License:Apache License

/**
 * Builds seed clusters under {@code output/random-seeds}, runs {@code KMeansDriver.run} on the
 * samples with those seeds, and then hands {@code output} to {@code loadClustersWritable}.
 *
 * @param conf             Hadoop configuration passed to the seed generator
 * @param samples          path of the input vectors
 * @param output           path under which the seeds and the k-means results are placed
 * @param measure          distance measure passed to the seed generator
 * @param numClusters      number of seed clusters requested from the seed generator
 * @param maxIterations    iteration cap forwarded to {@code KMeansDriver.run}
 * @param convergenceDelta convergence delta forwarded to {@code KMeansDriver.run}
 * @throws IOException            propagated from the seed generator, the driver, or the loader
 * @throws InterruptedException   propagated from the driver
 * @throws ClassNotFoundException propagated from the driver
 */
private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int numClusters, int maxIterations, double convergenceDelta)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Path seedDir = new Path(output, "random-seeds");
    RandomSeedGenerator.buildRandom(conf, samples, seedDir, numClusters, measure);

    // Name the trailing literals after the driver's parameters so the call site reads clearly.
    final boolean runClustering = true;
    final double clusterClassificationThreshold = 0.0;
    final boolean runSequential = true;
    KMeansDriver.run(samples, seedDir, output, convergenceDelta, maxIterations, runClustering,
            clusterClassificationThreshold, runSequential);

    loadClustersWritable(output);
}

From source file:io.github.thushear.display.DisplayKMeans.java

License:Apache License

/**
 * Requests three seed clusters under {@code output/clusters-0}, runs {@code KMeansDriver.run}
 * starting from the path the seed generator returns, and then hands {@code output} to
 * {@code loadClusters}.
 *
 * @param conf          Hadoop configuration passed to the seed generator
 * @param samples       path of the input vectors
 * @param output        path under which the seeds and the k-means results are placed
 * @param measure       distance measure passed to both the seed generator and the driver
 * @param maxIterations iteration cap forwarded to {@code KMeansDriver.run}
 * @throws IOException            propagated from the seed generator, the driver, or the loader
 * @throws InterruptedException   propagated from the driver
 * @throws ClassNotFoundException propagated from the driver
 */
private static void runSequentialKMeansClusterer(Configuration conf, Path samples, Path output,
        DistanceMeasure measure, int maxIterations)
        throws IOException, InterruptedException, ClassNotFoundException {
    final int seedCount = 3;
    final double distanceCutoff = 0.001;

    Path seedTarget = new Path(output, "clusters-0");
    Path initialClusters = RandomSeedGenerator.buildRandom(conf, samples, seedTarget, seedCount, measure);

    // NOTE(review): this overload takes the DistanceMeasure as its fourth argument and two trailing
    // booleans; presumably it belongs to an older Mahout release - confirm against the version in use.
    KMeansDriver.run(samples, initialClusters, output, measure, distanceCutoff, maxIterations, true, true);

    loadClusters(output);
}

From source file:net.aprendizajengrande.ontocluster.Clusterer.java

License:Open Source License

/**
 * Command-line entry point: seeds and runs k-means over vectors stored under an input folder,
 * then writes a text summary of each final cluster's center to a local file.
 *
 * <p>Expected arguments:
 * <ol>
 *   <li>{@code args[0]} - base input folder; the code reads {@code /input} and {@code /rels}
 *       beneath it and places seed clusters in {@code /clusters}</li>
 *   <li>{@code args[1]} - output folder handed to {@code KMeansDriver.run}</li>
 *   <li>{@code args[2]} - local file that receives the text report (opened with
 *       {@code FileWriter}, so it must name a file, not a directory)</li>
 * </ol>
 *
 * <p>The {@code rels} file is parsed as tab-separated lines of {@code <integer id>\t<name>}; the
 * ids are used to label the components of each cluster center. Only components with a value of
 * at least {@code 0.01} are printed.
 *
 * <p>The process exits with status 1 when the argument count is wrong or when
 * {@code ClusterExtractor.findFinalClusters} returns {@code null}.
 *
 * @param args the three arguments described above
 * @throws ClassNotFoundException propagated from {@code KMeansDriver.run}
 * @throws IOException            on any read/write failure, including a failed report write
 * @throws InterruptedException   propagated from {@code KMeansDriver.run}
 */
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        // The third argument is opened as a file below, so the usage text says "file".
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local file for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see
    // http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // location of the input vectors in HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");

    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);

    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    Map<Integer, String> relIdToName = new HashMap<>();
    // try-with-resources closes the reader (and the underlying stream) even if parsing throws.
    // NOTE(review): the reader uses the platform default charset, as before - confirm the
    // encoding of the rels file.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(inputRels)))) {
        String line;
        while ((line = br.readLine()) != null) {
            String[] parts = line.split("\\t");
            relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        }
    }

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    // remove the _SUCCESS marker, if present, before iterating over the directory contents
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST,
            conf);

    // try-with-resources guarantees the report file is flushed and closed on every path.
    try (PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])))) {
        int clusterNum = 0;
        for (Pair<Text, Writable> p : it) {
            Object obj = p.getSecond();
            if (!(obj instanceof ClusterWritable)) {
                continue;
            }
            pw.println(clusterNum + ") " + p.getFirst());
            Vector center = ((ClusterWritable) obj).getValue().getCenter();
            for (int i = 0; i < center.size(); i++) {
                double weight = center.get(i);
                if (weight >= 0.01) {
                    String name = relIdToName.get(i);
                    pw.println("\t" + (name == null ? "?" : name) + ": " + weight);
                }
            }
            pw.println();
            clusterNum++;
        }
        // PrintWriter never throws on write failure; surface it instead of leaving a truncated report.
        if (pw.checkError()) {
            throw new IOException("Failed writing cluster report to '" + args[2] + "'");
        }
    }
}