Example usage for org.apache.mahout.clustering.kmeans KMeansDriver run

List of usage examples for org.apache.mahout.clustering.kmeans KMeansDriver run

Introduction

In this page you can find the example usage for org.apache.mahout.clustering.kmeans KMeansDriver run.

Prototype

public static void run(Configuration conf, Path input, Path clustersIn, Path output, double convergenceDelta,
        int maxIterations, boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
        throws IOException, InterruptedException, ClassNotFoundException 

Source Link

Document

Iterate over the input vectors to produce clusters and, if requested, use the results of the final iteration to cluster the input vectors.

Usage

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Runs the k-means clustering job on an input dataset, seeded with k
 * randomly chosen initial centroids. The raw input is first converted to
 * Mahout vector sequence files under the output directory; the clustered
 * points end up in &lt;output&gt;/clusteredPoints.
 *
 * @param conf
 *          the Configuration to use
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters for k-means
 * @param convergenceDelta
 *          the convergence criterion for the iterations
 * @param maxIterations
 *          the maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running random seed to get initial clusters");
    Path seedClusters = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
    seedClusters = RandomSeedGenerator.buildRandom(conf, convertedInput, seedClusters, k, measure);

    log.info("Running KMeans");
    KMeansDriver.run(conf, convertedInput, seedClusters, output, measure, convergenceDelta,
            maxIterations, true, false);

    // Dump the clusters produced by the final iteration to stdout.
    ClusterDumper dumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    dumper.printClusters(null);
}

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Runs the k-means clustering job on an input dataset, seeded by a canopy
 * clustering pass using the given distance measure and t1/t2 thresholds.
 * The raw input is first converted to Mahout vector sequence files under
 * the output directory; the clustered points end up in
 * &lt;output&gt;/clusteredPoints.
 *
 * @param conf
 *          the Configuration to use
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the convergence criterion for the iterations
 * @param maxIterations
 *          the maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running Canopy to get initial clusters");
    CanopyDriver.run(conf, convertedInput, output, measure, t1, t2, false, false);

    log.info("Running KMeans");
    KMeansDriver.run(conf, convertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
            output, measure, convergenceDelta, maxIterations, true, false);

    // Dump the clusters produced by the final iteration to stdout.
    ClusterDumper dumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    dumper.printClusters(null);
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Runs the k-means clustering job on an input dataset, seeding with k random
 * centroids written to &lt;output&gt;/random-seeds. The raw input is first
 * converted to Mahout vector sequence files under the output directory; final
 * clusters are dumped from the "clusters-*-final" glob.
 *
 * @param conf
 *          the Configuration to use
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param k
 *          the number of clusters for k-means
 * @param convergenceDelta
 *          the convergence criterion for the iterations
 * @param maxIterations
 *          the maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, int k,
        double convergenceDelta, int maxIterations) throws Exception {
    Path convertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);

    log.info("Preparing Input");
    InputDriver.runJob(input, convertedInput, "org.apache.mahout.math.RandomAccessSparseVector");

    log.info("Running random seed to get initial clusters");
    Path seeds = new Path(output, "random-seeds");
    seeds = RandomSeedGenerator.buildRandom(conf, convertedInput, seeds, k, measure);

    log.info("Running KMeans with k = {}", k);
    KMeansDriver.run(conf, convertedInput, seeds, output, convergenceDelta, maxIterations,
            true, 0.0, false);

    // Dump the final clusters and their member points to stdout.
    Path finalClustersGlob = new Path(output, "clusters-*-final");
    Path clusteredPoints = new Path(output, "clusteredPoints");
    log.info("Dumping out clusters from clusters: {} and clusteredPoints: {}", finalClustersGlob,
            clusteredPoints);
    ClusterDumper dumper = new ClusterDumper(finalClustersGlob, clusteredPoints);
    dumper.printClusters(null);
}

From source file:com.nm.documentClustering.example.KMeansJob.java

License:Apache License

/**
 * Runs the k-means clustering job on an input dataset, seeded by a canopy
 * clustering pass with the given distance measure and t1/t2 thresholds. The
 * raw input is first converted to Mahout vector sequence files under the
 * output directory; final clusters are dumped from the "clusters-*-final"
 * glob.
 *
 * <p>Fix: the canopy phase previously ran with a fresh
 * {@code new Configuration()}, silently discarding any settings the caller
 * supplied in {@code conf}; it now uses the caller's configuration for every
 * phase, matching the sibling overload of this method.
 *
 * @param conf
 *          the Configuration to use for all phases
 * @param input
 *          the input directory path
 * @param output
 *          the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the convergence criterion for the iterations
 * @param maxIterations
 *          the maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running Canopy to get initial clusters");
    Path canopyOutput = new Path(output, "canopies");
    // Use the caller-supplied conf (was: new Configuration(), which dropped caller settings).
    CanopyDriver.run(conf, directoryContainingConvertedInput, canopyOutput, measure, t1, t2,
            false, 0.0, false);
    log.info("Running KMeans");
    KMeansDriver.run(conf, directoryContainingConvertedInput,
            new Path(canopyOutput, Cluster.INITIAL_CLUSTERS_DIR + "-final"), output, convergenceDelta,
            maxIterations, true, 0.0, false);
    // Dump the final clusters and their member points to stdout.
    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-*-final"),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}

From source file:com.queirozf.clustering.MahoutKMeans.java

License:Apache License

/**
 * Tool entry point: runs k-means over the input path with k random seed
 * centroids, a convergence delta of 0.001, and at most 10000 iterations.
 *
 * <p>Fix: the original indexed {@code args} and called
 * {@code Integer.parseInt} without validation, so a missing or malformed
 * argument surfaced as a raw {@code ArrayIndexOutOfBoundsException} /
 * {@code NumberFormatException}. Misuse now fails fast with a usage message
 * and a non-zero exit code.
 *
 * @param args [0] input path, [1] output path, [2] number of clusters k
 * @return 0 on success, 1 on invalid arguments
 */
@Override
public int run(String[] args) throws Exception {

    // Validate usage before touching HDFS so misuse fails fast and clearly.
    if (args.length < 3) {
        System.err.println("Usage: <input> <output> <k>");
        return 1;
    }

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    int k;
    try {
        k = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        System.err.println("k must be an integer, got: " + args[2]);
        return 1;
    }

    double epsilon = 0.001;     // convergence delta for KMeansDriver
    int maxIterations = 10000;

    Configuration conf = this.getConf();

    DistanceMeasure measure = new EuclideanDistanceMeasure();

    // Pick k random input vectors as the initial centroids.
    Path centroids = RandomSeedGenerator.buildRandom(conf, in, new Path(out, "data/clusters"), k, measure);

    KMeansDriver.run(conf, in, centroids, out, epsilon, maxIterations, true, 0.0, false);

    return 0;
}

From source file:hk.newsRecommender.MatrixAndCluster.java

License:Open Source License

/**
 * Pipeline driver. Parts 1 and 2 (keyword-dimension sifting and TF-IDF
 * matrix generation) are currently disabled (commented out); only part 3
 * runs: k-means clustering over pre-built vectors in
 * /data/recommend/cluster1, writing results to /data/recommend/cluster2.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String hdfsUrl = conf.get("fs.defaultFS");

    //      part1---------------------------------------------------------------
    //      Job job0 = Job.getInstance(conf, "siftKeywordsDimension");
    //      Path output1Path=new Path(hdfsUrl + "/data/recommend/matrix1");
    //      HadoopUtil.delete(conf, output1Path);
    //      job0.setJarByClass(TFIDF.class);
    //      job0.setMapperClass(Mapper_Part1.class);
    //      job0.setReducerClass(Reduce_Part1.class);
    //      job0.setMapOutputKeyClass(Text.class);
    //      job0.setMapOutputValueClass(Text.class);
    //      job0.setOutputKeyClass(Text.class);
    //      job0.setOutputValueClass(Text.class);
    //      job0.setPartitionerClass(CustomPartitioner.class);
    //      FileInputFormat.addInputPath(job0, new Path(hdfsUrl + "/data/recommend/tfidf3"));
    //      FileOutputFormat.setOutputPath(job0, output1Path);
    //      job0.waitForCompletion(true);

    //      part2---------------------------------------------------------------
    //      FileSystem fsopen = FileSystem.get(conf);
    //      FSDataInputStream in = fsopen.open(new Path(hdfsUrl + "/data/recommend/matrix1/part-r-00000"));
    //      Scanner scan = new Scanner(in);
    //      List<String> keywordList=new ArrayList<String>();
    //      while (scan.hasNext()) {
    //         keywordList.add(scan.next());
    //      }
    ////      must before job
    //      conf.setStrings("keyword", keywordList.toArray(new String[keywordList.size()]));
    //      Job job1 = Job.getInstance(conf, "generateMatrix");
    //      Path output2Path=new Path(hdfsUrl + "/data/recommend/matrix2");
    //      HadoopUtil.delete(conf, output2Path);
    //      job1.setJarByClass(TFIDF.class);
    //      job1.setMapperClass(Mapper_Part2.class);
    //      job1.setReducerClass(Reduce_Part2.class);
    //      job1.setMapOutputKeyClass(Text.class);
    //      job1.setMapOutputValueClass(Text.class);
    //      job1.setOutputKeyClass(Text.class);
    //      job1.setOutputValueClass(NullWritable.class);
    ////      job1.addCacheFile(new Path("/data/recommend/matrix1/part-r-00000").toUri());
    //      FileInputFormat.addInputPath(job1, new Path(hdfsUrl + "/data/recommend/tfidf3"));
    //      FileOutputFormat.setOutputPath(job1, output2Path);
    //      job1.waitForCompletion(true);

    //      part3: k-means clustering (original section marker was garbled/mis-encoded)
    // Clear any previous clustering output; note output3Path and clusterOutput
    // both name /data/recommend/cluster2.
    Path output3Path = new Path(hdfsUrl + "/data/recommend/cluster2");
    HadoopUtil.delete(conf, output3Path);
    EuclideanDistanceMeasure measure = new EuclideanDistanceMeasure();
    Path clusterInput = new Path(hdfsUrl + "/data/recommend/matrix2");
    Path clusterSeqInput = new Path(hdfsUrl + "/data/recommend/cluster1");
    Path clusterOutput = new Path(hdfsUrl + "/data/recommend/cluster2");
    int k = 10;
    int maxIter = 3;
    //      NOTE(review): original comment was mis-encoded; it presumably described
    //      the (disabled) Mahout text-to-vector conversion step below — confirm.
    //      InputDriver.runJob(clusterInput, clusterSeqInput, "org.apache.mahout.math.RandomAccessSparseVector");
    //      Randomly select k input vectors as the initial centroids (original comment garbled).
    Path clusters = RandomSeedGenerator.buildRandom(conf, clusterSeqInput,
            new Path(clusterOutput, "clusters-0"), k, measure);
    KMeansDriver.run(conf, clusterSeqInput, clusters, clusterOutput, 0.01, maxIter, true, 0.0, false);
    //  Dump clusters with ClusterDumper.printClusters (original comment garbled).
    //  NOTE(review): reads "clusters-" + (maxIter - 1); Mahout typically writes the
    //  final iteration as "clusters-N-final", so this may miss the final directory
    //  if iterations converge early — verify against the actual output layout.
    ClusterDumper clusterDumper = new ClusterDumper(new Path(clusterOutput, "clusters-" + (maxIter - 1)),
            new Path(clusterOutput, "clusteredPoints"));
    clusterDumper.printClusters(null);

    clusterOutput(conf, new Path(hdfsUrl + "/data/recommend/cluster2/clusteredPoints/part-m-00000"));
    //      clusterOutput2(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster2/clusteredPoints/part-m-00000"));
    //      matrix2Vector(conf0,new Path(hdfsUrl0 + "/data/recommend/cluster1/part-m-00000"));

}

From source file:org.sleuthkit.hadoop.ClusterDocumentsJob.java

License:Open Source License

/**
 * Runs the clustering algorithms on the tfidf vectors that have been placed in
 * sequence files in directory 'input', and puts raw cluster/json data in
 * 'output'. Also puts json reporting data in the reports/data folder.
 *
 * <p>Fix: the {@code InterruptedException} handler previously swallowed the
 * interrupt; it now restores the thread's interrupt status
 * ({@code Thread.currentThread().interrupt()}) before returning the error
 * code, so callers higher up can still observe the interruption.
 *
 * @param input The sequence files to cluster on.
 * @param output The output directory for raw canopy/kmeans cluster data.
 * @param dictionary The dictionary vector which maps the indices of the vectors
 * to words.
 * @param t1 The t1 value for canopy clustering. The distance measure for
 * canopy is CosineDistanceMeasure, so this should be a value between 0 and 1.
 * @param t2 The t2 value for canopy clustering. Again, should be between
 * t1 and 1. A smaller distance between the two results in more clusters;
 * a greater distance results in fewer.
 * @param imageID The hash of the image.
 * @param friendlyName The friendly, user given name of the image.
 * @param baseDir The base directory where output data for this image
 * is stored. Used to place the reporting data in the correct location.
 * @return A status code; will be non-zero if the task failed.
 */
public static int runPipeline(String input, String output, String dictionary, double t1, double t2,
        String imageID, String friendlyName, String baseDir) {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    Path canopyInputPath = new Path(input);
    Path canopyOutputPath = new Path(output + "/canopy");

    Path kmeansInputPath = new Path(input);
    Path kmeansOutputPath = new Path(output + "/kmeans");
    // Canopy (I'm quite certain) only does one pass, so the relevant
    // clusters should be found in this file. For KMeans, this may not
    // be the case. Note, though, that the final clusters with document
    // vectors will be in a different file.
    Path kmeansClusters = new Path(output + "/canopy/clusters-0");

    try {
        CanopyDriver.run(conf, canopyInputPath, canopyOutputPath, new CosineDistanceMeasure(), t1, t2, true,
                false);
    } catch (Exception e) {
        LOG.error("Failure running mahout canopy.", e);
        return 1;
    }

    // The convergencedelta and maxiterations affect how long kmeans will
    // take to run and how many times we run the algorithm before we give
    // up. The numbers we are using here seem to give reasonably good
    // results.
    try {
        KMeansDriver.run(conf, kmeansInputPath, kmeansClusters, kmeansOutputPath, new CosineDistanceMeasure(),
                .5, 20, true, false);
    } catch (Exception e) {
        LOG.error("Failure running mahout kmeans.", e);
        return 2;
    }

    try {
        ////////////////////////////////
        // Output top cluster matches //
        ////////////////////////////////
        Job job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_MATCH);
        job.setJarByClass(TopFeatureMapper.class);

        // Get the final kmeans iteration. This is sort of a pain but for
        // whatever reason hadoop has no mechanism to do this for us.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        int i = 2;
        Path goodPath = new Path(output + "/kmeans/clusters-1");

        while (true) {
            Path testPath = new Path(output + "/kmeans/clusters-" + i);
            if (!fs.exists(testPath)) {
                break;
            }
            i++;
            goodPath = testPath;
        }

        FileInputFormat.setInputPaths(job, goodPath);
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/topClusters/"));

        job.setMapperClass(TopFeatureMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // We need to reduce serially.
        job.setNumReduceTasks(1);

        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.getConfiguration().set("org.sleuthkit.hadoop.dictionary", dictionary);

        // NOTE(review): the boolean result of waitForCompletion is ignored here
        // and below; a failed job would still fall through to report building.
        // Left unchanged to preserve existing behavior — consider checking it.
        job.waitForCompletion(true);

        ////////////////////////////////
        // Output Cluster->DocID JSON //
        ////////////////////////////////

        job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_JSON);
        job.setJarByClass(JSONClusterNameMapper.class);

        FileInputFormat.setInputPaths(job, new Path(output + "/kmeans/clusteredPoints/"));
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/jsonClusteredPoints/"));

        job.setMapperClass(JSONClusterNameMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // again, we need to reduce serially. We are crafting a single json object and so we must
        // have exactly one output file.
        job.setNumReduceTasks(1);
        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);

        // Note that, since we limit the number of reduce tasks to 1, there should only be
        // one reduce 'part'.

        ClusterJSONBuilder.buildReport(new Path(output + "/kmeans/topClusters/part-r-00000"),
                new Path(output + "/kmeans/jsonClusteredPoints/part-r-00000"),
                new Path(baseDir + "/reports/data/documents.js"));
        return 0;
    } catch (IOException ex) {
        LOG.error("Failure while performing HDFS file IO.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("Error running job; class not found.", ex);
    } catch (InterruptedException ex) {
        LOG.error("Hadoop job interrupted.", ex);
        // Restore the interrupt status so callers can observe the interruption.
        Thread.currentThread().interrupt();
    }
    // we have failed; return non-zero error code.
    return 3;

}

From source file:sigis.kmeansmultiplek.AnotherKmeans.java

/**
 * Drives one k-means run: writes the input points and initial centroids as
 * Hadoop sequence files, runs KMeansDriver (delta 0.001, max 10 iterations),
 * and prints the resulting assignments.
 */
private void start() throws Exception {

    final Configuration configuration = new Configuration();

    this.numberOfCluster = this.kmin;

    // Make sure the directory for the point sequence files exists.
    final File pointsDir = new File(POINTS_PATH);
    if (!pointsDir.exists()) {
        pointsDir.mkdir();
    }

    // Read the raw point values and turn them into vectors, written out as
    // Hadoop sequence files.
    List<DenseVector> vectors = toDenseVector(configuration);

    // Write the initial cluster centers.
    writeClusterInitialCenters(configuration, vectors);

    // Run the k-means algorithm, clearing any stale output first.
    final Path inputPath = new Path(POINTS_PATH);
    final Path clustersPath = new Path(CLUSTERS_PATH);
    final Path outputPath = new Path(OUTPUT_PATH);
    HadoopUtil.delete(configuration, outputPath);

    KMeansDriver.run(configuration, inputPath, clustersPath, outputPath, 0.001, 10, true, 0, false);

    // Read back and print the clustering output.
    readAndPrintOutputValues(configuration);
}

From source file:sigis.pighout.KmeansClustering.java

/**
 * Runs Mahout k-means over the given point files with fixed parameters:
 * convergence delta 0.001, at most 10 iterations, clustering the input
 * points after the final iteration (map-reduce, not sequential).
 */
private void performKmeans(final Configuration conf, String POINTS_PATH, String CLUSTER_PATH,
        String OUTPUT_PATH) throws IOException, InterruptedException, ClassNotFoundException {

    Path points = new Path(POINTS_PATH);
    Path initialClusters = new Path(CLUSTER_PATH + CLUSTER_EXT);
    Path output = new Path(OUTPUT_PATH);

    KMeansDriver.run(conf, points, initialClusters, output, 0.001, 10, true, 0, false);
}