Example usage for org.apache.mahout.clustering.canopy CanopyDriver run

Introduction

This page shows example usages of org.apache.mahout.clustering.canopy.CanopyDriver.run.

Prototype

public static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2,
        boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
        throws IOException, InterruptedException, ClassNotFoundException 

Document

Convenience method that creates a new Configuration(). Builds a directory of Canopy clusters from the input arguments and, if requested, clusters the input vectors using these clusters.
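
Before the full examples below, here is a minimal, self-contained sketch of calling this convenience method directly. The paths, thresholds, and distance measure are illustrative assumptions only, not taken from the snippets that follow.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;

public class CanopyRunSketch {
    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Build canopy clusters from the vectors under "vectors" into "canopy-output".
        // runClustering = true also classifies the input vectors against the canopies;
        // runSequential = false runs the job as MapReduce. t1 (3.0) must be larger than
        // t2 (1.5); all values and paths here are placeholders.
        CanopyDriver.run(new Path("vectors"), new Path("canopy-output"),
                new EuclideanDistanceMeasure(), 3.0, 1.5, true, 0.5, false);
    }
}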

Usage

From source file:chapter5.KMeanSample.java

License:Apache License

/**
 * Run the kmeans clustering job on an input dataset using the given distance
 * measure, t1, t2 and iteration parameters. All output data will be written
 * to the output directory, which will be initially deleted if it exists. The
 * clustered points will reside in the path <output>/clustered-points. By
 * default, the job expects that a file containing synthetic_control.data as
 * obtained from
 * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
 * resides in a directory named "testdata", and writes output to a directory
 * named "output".
 * 
 * @param conf
 *          the Configuration to use
 * @param input
 *          the String denoting the input directory path
 * @param output
 *          the String denoting the output directory path
 * @param measure
 *          the DistanceMeasure to use
 * @param t1
 *          the canopy T1 threshold
 * @param t2
 *          the canopy T2 threshold
 * @param convergenceDelta
 *          the double convergence criteria for iterations
 * @param maxIterations
 *          the int maximum number of iterations
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1,
        double t2, double convergenceDelta, int maxIterations) throws Exception {
    Path directoryContainingConvertedInput = new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT);
    log.info("Preparing Input");
    InputDriver.runJob(input, directoryContainingConvertedInput,
            "org.apache.mahout.math.RandomAccessSparseVector");
    log.info("Running Canopy to get initial clusters");
    CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, false);
    log.info("Running KMeans");
    KMeansDriver.run(conf, directoryContainingConvertedInput, new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
            output, measure, convergenceDelta, maxIterations, true, false);
    // run ClusterDumper
    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations),
            new Path(output, "clusteredPoints"));
    clusterDumper.printClusters(null);
}
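
The method above calls a finalClusterPath helper that is not included in the snippet. A possible implementation, assuming the Mahout convention that the last k-means iteration directory carries a "-final" suffix (this sketch is not taken from chapter5.KMeanSample.java), could look like this:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: walk backwards from maxIterations and return the first
// "clusters-<i>-final" directory that exists under the output path.
static Path finalClusterPath(Configuration conf, Path output, int maxIterations) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (int i = maxIterations; i >= 0; i--) {
        Path clusters = new Path(output, "clusters-" + i + "-final");
        if (fs.exists(clusters)) {
            return clusters;
        }
    }
    return null;
}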

From source file:org.sleuthkit.hadoop.ClusterDocumentsJob.java

License:Open Source License

/**
 * Runs the clustering algorithms on the tfidf vectors that have been placed in
 * sequence files in directory 'input', and puts raw cluster/json data in
 * 'output'. Also puts json reporting data in the reports/data folder.
 * @param input The sequence files to cluster on.
 * @param output The output directory for raw canopy/kmeans cluster data.
 * @param dictionary The dictionary vector which maps the indices of the vectors
 * to words.
 * @param t1 The t1 value for canopy clustering. The distance measure for
 * canopy is CosineDistanceMeasure, so this should be a value between 0 and 1.
 * @param t2 The t2 value for canopy clustering. Again, should be between
 * t1 and 1. A smaller distance between the two results in more clusters;
 * a greater distance results in fewer.
 * @param imageID The hash of the image.
 * @param friendlyName The friendly, user given name of the image.
 * @param baseDir The base directory where output data for this image
 * is stored. Used to place the reporting data in the correct location.
 * @return A status code; will be non-zero if the task failed.
 */
public static int runPipeline(String input, String output, String dictionary, double t1, double t2,
        String imageID, String friendlyName, String baseDir) {
    Configuration conf = new Configuration();
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    Path canopyInputPath = new Path(input);
    Path canopyOutputPath = new Path(output + "/canopy");

    Path kmeansInputPath = new Path(input);
    Path kmeansOutputPath = new Path(output + "/kmeans");
    // Canopy (I'm quite certain) only does one pass, so the relevant
    // clusters should be found in this file. For KMeans, this may not
    // be the case. Note, though, that the final clusters with document
    // vectors will be in a different file.
    Path kmeansClusters = new Path(output + "/canopy/clusters-0");

    try {
        CanopyDriver.run(conf, canopyInputPath, canopyOutputPath, new CosineDistanceMeasure(), t1, t2, true,
                false);
    } catch (Exception e) {
        LOG.error("Failure running mahout canopy.", e);
        return 1;
    }

    // The convergencedelta and maxiterations affect how long kmeans will
    // take to run and how many times we run the algorithm before we give
    // up. The numbers we are using here seem to give reasonably good
    // results.
    try {
        KMeansDriver.run(conf, kmeansInputPath, kmeansClusters, kmeansOutputPath, new CosineDistanceMeasure(),
                .5, 20, true, false);
    } catch (Exception e) {
        LOG.error("Failure running mahout kmeans.", e);
        return 2;
    }

    try {
        ////////////////////////////////
        // Output top cluster matches //
        ////////////////////////////////
        Job job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_MATCH);
        job.setJarByClass(TopFeatureMapper.class);

        // Get the final kmeans iteration. This is sort of a pain but for
        // whatever reason hadoop has no mechanism to do this for us.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        int i = 2;
        Path goodPath = new Path(output + "/kmeans/clusters-1");

        while (true) {
            Path testPath = new Path(output + "/kmeans/clusters-" + i);
            if (!fs.exists(testPath)) {
                break;
            }
            i++;
            goodPath = testPath;
        }

        FileInputFormat.setInputPaths(job, goodPath);
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/topClusters/"));

        job.setMapperClass(TopFeatureMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // We need to reduce serially.
        job.setNumReduceTasks(1);

        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.getConfiguration().set("org.sleuthkit.hadoop.dictionary", dictionary);

        job.waitForCompletion(true);

        ////////////////////////////////
        // Output Cluster->DocID JSON //
        ////////////////////////////////

        job = SKJobFactory.createJob(imageID, friendlyName, JobNames.OUTPUT_CLUSTER_JSON);
        job.setJarByClass(JSONClusterNameMapper.class);

        FileInputFormat.setInputPaths(job, new Path(output + "/kmeans/clusteredPoints/"));
        FileOutputFormat.setOutputPath(job, new Path(output + "/kmeans/jsonClusteredPoints/"));

        job.setMapperClass(JSONClusterNameMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // again, we need to reduce serially. We are crafting a single json object and so we must
        // have exactly one output file.
        job.setNumReduceTasks(1);
        job.setReducerClass(JSONArrayReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);

        // Note that, since we limit the number of reduce tasks to 1, there should only be
        // one reduce 'part'.

        ClusterJSONBuilder.buildReport(new Path(output + "/kmeans/topClusters/part-r-00000"),
                new Path(output + "/kmeans/jsonClusteredPoints/part-r-00000"),
                new Path(baseDir + "/reports/data/documents.js"));
        return 0;
    } catch (IOException ex) {
        LOG.error("Failure while performing HDFS file IO.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("Error running job; class not found.", ex);
    } catch (InterruptedException ex) {
        LOG.error("Hadoop job interrupted.", ex);
    }
    // we have failed; return non-zero error code.
    return 3;

}
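
A hedged usage sketch of runPipeline follows; everything in it is an illustrative placeholder, and only the class and method names come from the source above.

// All paths, thresholds, and identifiers below are made up for illustration.
// t1 and t2 are cosine distances, so they live in the [0, 1] range described in the javadoc.
int status = ClusterDocumentsJob.runPipeline(
        "/data/tfidf-vectors",      // input: sequence files of tf-idf vectors
        "/data/clusters",           // output: raw canopy/k-means cluster data
        "/data/dictionary-vector",  // dictionary mapping vector indices to words
        0.65, 0.65,                 // t1, t2 for CosineDistanceMeasure
        "0123456789abcdef",         // imageID: hash of the image
        "my-disk-image",            // friendlyName
        "/data/image-base");        // baseDir for the reports/data output
if (status != 0) {
    // a non-zero code means the canopy, k-means, or reporting stage failed
}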

From source file:sigis.pighout.CanopyCluster.java

private void writeInitialCentroidsCanopy(final Configuration conf, String POINTS_PATH, String CLUSTER_PATH)
        throws IOException, InterruptedException, ClassNotFoundException {

    // Run CanopyDriver as a MapReduce job (runSequential = false) to build canopy
    // centroids from POINTS_PATH into CLUSTER_PATH; runClustering = false, so the
    // input points themselves are not classified here.
    CanopyDriver.run(new Path(POINTS_PATH), new Path(CLUSTER_PATH), new EuclideanDistanceMeasure(), 3.1,
            2.1, false, 0.5, false);

    System.out.println(Cluster.FINAL_ITERATION_SUFFIX);
    System.out.println(Cluster.CLUSTERED_POINTS_DIR);
}
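
The call above passes runClustering = false, so only the canopy centroids are written to CLUSTER_PATH. As a hedged variation (not part of the original source), setting runClustering to true should also classify the input vectors, placing them under Cluster.CLUSTERED_POINTS_DIR beneath the output directory:

    // Illustrative variation of the call above: same paths and thresholds, but with
    // runClustering = true so the input points are classified against the canopies.
    CanopyDriver.run(new Path(POINTS_PATH), new Path(CLUSTER_PATH), new EuclideanDistanceMeasure(),
            3.1, 2.1, true, 0.5, false);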