weka.distributed.spark.CanopyClustererSparkJob.java Source code

Introduction

Here is the source code for weka.distributed.spark.CanopyClustererSparkJob.java, a Spark job for training a Canopy clusterer with distributed WEKA.

Source

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    CanopyClustererSparkJob
 *    Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.distributed.spark;

import java.io.File;
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.storage.StorageLevel;

import weka.clusterers.Canopy;
import weka.clusterers.Clusterer;
import weka.clusterers.FilteredClusterer;
import weka.clusterers.InstanceWithCanopyAssignments;
import weka.core.*;
import weka.distributed.CSVToARFFHeaderReduceTask;
import weka.distributed.CanopyMapTask;
import weka.distributed.CanopyReduceTask;
import weka.distributed.DistributedWekaException;
import weka.distributed.clusterers.CanopyAssigner;
import weka.distributed.clusterers.CanopyBuilder;
import weka.filters.Filter;
import distributed.core.DistributedJob;
import distributed.core.DistributedJobConfig;

/**
 * Spark job for training a Canopy clusterer
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision$
 */
public class CanopyClustererSparkJob extends SparkJob implements CommandlineRunnable {

    /**
     * The subdirectory of the output directory that this job saves its results to
     */
    protected static final String OUTPUT_SUBDIR = "canopy";
    /** For serialization */
    private static final long serialVersionUID = 7905426811312121221L;
    /** Default name for the model */
    protected String m_modelName = "outputModel.model";

    /** Options for the CSV-to-ARFF header map tasks */
    protected String m_wekaCsvToArffMapTaskOpts = "";

    /** Options for the canopy map tasks */
    protected String m_canopyMapTaskOpts = "";

    /** ARFF header job for computing the header (if one is not already available) */
    protected ArffHeaderSparkJob m_arffHeaderJob = new ArffHeaderSparkJob();

    /** True if the data is to be randomly shuffled and stratified first */
    protected boolean m_randomizeAndStratify;

    /** Options for the randomize/stratify job */
    protected String m_randomizeJobOpts = "";

    /** Randomize and stratify job */
    protected RandomizedDataSparkJob m_randomizeSparkJob = new RandomizedDataSparkJob();

    /** The T1 distance to use during aggregation in the reduce phase */
    protected String m_aggregationT1 = "" + Canopy.DEFAULT_T1;

    /** The T2 distance to use during aggregation in the reduce phase */
    protected String m_aggregationT2 = "-0.5";

    /** Whether to assign the canopies to the training data */
    protected boolean m_assignCanopiesToTrainingData;

    /** This can be either a Canopy clusterer or a FilteredClusterer */
    protected Clusterer m_finalClusterer;

    /** The full path to the final model (local or HDFS file system) */
    protected String m_pathToAggregatedCanopy = "";

    /** Maximum number of clusters to emerge from the reduce phase */
    protected String m_maxNumClustersReducePhase = "2";

    /** Holds the new RDD of instances with canopy assignments */
    protected JavaRDD<InstanceWithCanopyAssignments> m_canopiesAssigned;

    /**
     * Constructor
     */
    public CanopyClustererSparkJob() {
        super("Canopy clusterer builder job", "Build a canopy clusterer");
    }

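    /**
     * Main method for executing this job from the command line
     *
     * @param args command-line options for the job
     */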
    public static void main(String[] args) {
        CanopyClustererSparkJob ccsj = new CanopyClustererSparkJob();
        ccsj.run(ccsj, args);
    }

    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> result = new Vector<Option>();

        result.add(new Option("\tCreate data splits with the order of the input instances\n\t"
                + "shuffled randomly. Also stratifies the data if the class\n\t"
                + "is nominal. Works in conjunction with -min-slices; can\n\t"
                + "alternatively use -num-instances-per-slice.", "randomize", 0, "-randomize"));

        result.add(new Option(
                "\tName of output model file. Model will be\n\t"
                        + "written to output-path/canopy/model/<model name>",
                "model-file-name", 1, "-model-file-name <model-name>"));

        CanopyMapTask tempCanopy = new CanopyMapTask();
        Enumeration<Option> canopyOpts = tempCanopy.listOptions();
        while (canopyOpts.hasMoreElements()) {
            result.add(canopyOpts.nextElement());
        }

        result.add(new Option("\tMaximum number of canopies to result from\n\t" + "the reduce phase (default = 2)",
                "-max-reduce-canopies", 1, "-max-reduce-canopies <num>"));

        result.add(new Option("\tThe T2 distance to use in the reduce phase. A\n\t"
                + "value < 0 is taken as a positive multiplier for the standard\n\t"
                + "deviation-based T2 heuristic (default = -0.5)", "t2-reduce", 1, "-t2-reduce <num>"));

        result.add(new Option(
                "\tThe T1 distance to use in the reduce phase. A\n\t"
                        + "value < 0 is taken as a positive multiplier for T2 (default = -1.5)",
                "t1-reduce", 1, "-t1-reduce <num>"));

        result.add(new Option("\tAssign canopies to each training instance after clustering is complete",
                "assign-canopies", 0, "-assign-canopies"));

        RandomizedDataSparkJob tempRJob = new RandomizedDataSparkJob();
        Enumeration<Option> randOpts = tempRJob.listOptions();
        while (randOpts.hasMoreElements()) {
            result.add(randOpts.nextElement());
        }

        return result.elements();
    }

    @Override
    public String[] getOptions() {
        List<String> options = new ArrayList<String>();

        options.add("-model-file-name");
        options.add(getModelFileName());

        if (getRandomizeAndStratify()) {
            options.add("-randomize");
        }

        options.add("-max-reduce-canopies");
        options.add(getMaxNumCanopiesReducePhase());

        options.add("-t2-reduce");
        options.add(getT2ReducePhase());

        options.add("-t1-reduce");
        options.add(getT1ReducePhase());

        if (getAssignCanopiesToTrainingData()) {
            options.add("-assign-canopies");
        }

        if (!DistributedJobConfig.isEmpty(getCanopyMapTaskOptions())) {
            try {
                String[] canopyOpts = Utils.splitOptions(getCanopyMapTaskOptions());

                for (String s : canopyOpts) {
                    options.add(s);
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

        if (!DistributedJobConfig.isEmpty(getCSVMapTaskOptions())) {
            try {
                String[] csvOpts = Utils.splitOptions(getCSVMapTaskOptions());

                for (String s : csvOpts) {
                    options.add(s);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        if (!DistributedJobConfig.isEmpty(getRandomizedJobOptions())) {
            try {
                String[] csvOpts = Utils.splitOptions(getRandomizedJobOptions());

                for (String s : csvOpts) {
                    options.add(s);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        return options.toArray(new String[options.size()]);
    }

    @Override
    public void setOptions(String[] options) throws Exception {

        String modelFileName = Utils.getOption("model-file-name", options);
        if (!DistributedJobConfig.isEmpty(modelFileName)) {
            setModelFileName(modelFileName);
        }

        setRandomizeAndStratify(Utils.getFlag("randomize", options));

        String temp = Utils.getOption("max-reduce-canopies", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setMaxNumCanopiesReducePhase(temp);
        }

        temp = Utils.getOption("t2-reduce", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setT2ReducePhase(temp);
        }

        temp = Utils.getOption("t1-reduce", options);
        if (!DistributedJobConfig.isEmpty(temp)) {
            setT1ReducePhase(temp);
        }

        setAssignCanopiesToTrainingData(Utils.getFlag("assign-canopies", options));

        CanopyMapTask tempCanopy = new CanopyMapTask();
        tempCanopy.setOptions(options);
        String canopyOpts = Utils.joinOptions(tempCanopy.getOptions());
        if (!DistributedJobConfig.isEmpty(canopyOpts)) {
            setCanopyMapTaskOptions(canopyOpts);
        }

        // copy the options at this point so that we can set
        // general configuration for the ARFF header job
        String[] optionsCopy = options.clone();

        super.setOptions(options);

        // Set options for the stratify config (if necessary)
        m_randomizeSparkJob.setOptions(optionsCopy.clone());

        String optsToRandomize = Utils.joinOptions(m_randomizeSparkJob.getOptions());
        if (!DistributedJobConfig.isEmpty(optsToRandomize)) {
            setRandomizeJobOptions(optsToRandomize);
        }

        // options for the ARFF header job
        m_arffHeaderJob.setOptions(optionsCopy);
        String optsToCSVTask = Utils.joinOptions(m_arffHeaderJob.getOptions());
        if (!DistributedJobConfig.isEmpty(optsToCSVTask)) {
            setCSVMapTaskOptions(optsToCSVTask);
        }
    }

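    /**
     * Get the options for the canopy map task
     *
     * @return the options for the canopy map task
     */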
    public String getCanopyMapTaskOptions() {
        return m_canopyMapTaskOpts;
    }

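    /**
     * Set the options for the canopy map task
     *
     * @param opts the options for the canopy map task
     */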
    public void setCanopyMapTaskOptions(String opts) {
        m_canopyMapTaskOpts = opts;
    }

    /**
     * Get the options to the header job
     *
     * @return options to the header job
     */
    public String getCSVMapTaskOptions() {
        return m_wekaCsvToArffMapTaskOpts;
    }

    /**
     * Set the options to the header job
     *
     * @param opts options to the header job
     */
    public void setCSVMapTaskOptions(String opts) {
        m_wekaCsvToArffMapTaskOpts = opts;
    }

    /**
     * Get whether to randomize (and stratify) the input data or not
     * 
     * @return true if the input data is to be randomized and stratified
     */
    public boolean getRandomizeAndStratify() {
        return m_randomizeAndStratify;
    }

    /**
     * Set whether to randomize (and stratify) the input data or not
     *
     * @param r true if the input data is to be randomized and stratified
     */
    public void setRandomizeAndStratify(boolean r) {
        m_randomizeAndStratify = r;
    }

    /**
     * Tip text for this property
     *
     * @return tip text for this property
     */
    public String modelFileNameTipText() {
        return "The name only (not full path) that the model should be saved to";
    }

    /**
     * Get the name only for the model file
     * 
     * @return the name only (not full path) that the model should be saved to
     */
    public String getModelFileName() {
        return m_modelName;
    }

    /**
     * Set the name only for the model file
     *
     * @param m the name only (not full path) that the model should be saved to
     */
    public void setModelFileName(String m) {
        m_modelName = m;
    }

    /**
     * Set the options for the randomize/stratify task
     *
     * @param opts the options for the randomize task
     */
    public void setRandomizeJobOptions(String opts) {
        m_randomizeJobOpts = opts;
    }

    /**
     * Get the options for the randomize/stratify task
     *
     * @return the options for the randomize task
     */
    public String getRandomizedJobOptions() {
        return m_randomizeJobOpts;
    }

    /**
     * Tip text for this property
     *
     * @return tip text for this property
     */
    public String maxNumCanopiesReducePhaseTipText() {
        return "The maximum number of canopies to form in the reduce phase";
    }

    /**
     * Get the maximum number of canopies to form in the reduce phase
     *
     * @return the maximum number of canopies to form in the reduce phase
     */
    public String getMaxNumCanopiesReducePhase() {
        return m_maxNumClustersReducePhase;
    }

    /**
     * Set the maximum number of canopies to form in the reduce phase
     *
     * @param max the maximum number of canopies to form in the reduce phase
     */
    public void setMaxNumCanopiesReducePhase(String max) {
        m_maxNumClustersReducePhase = max;
    }

    /**
     * Tip text for this property
     *
     * @return the tip text for this property
     */
    public String t1ReducePhaseTipText() {
        return "The T1 distance to use in the reduce phase. Values < 0 are taken "
                + "as a positive multiplier for the T2 distance";
    }

    /**
     * Get the T1 distance to use in the reduce phase
     *
     * @return the T1 distance to use in the reduce phase
     */
    public String getT1ReducePhase() {
        return m_aggregationT1;
    }

    /**
     * Set the T1 distance to use in the reduce phase
     *
     * @param t1 the T1 distance to use in the reduce phase
     */
    public void setT1ReducePhase(String t1) {
        m_aggregationT1 = t1;
    }

    /**
     * Tip text for this property
     *
     * @return the tip text for this property
     */
    public String t2ReducePhaseTipText() {
        return "The T2 distance to use in the reduce phase. Values < 0 are taken "
                + "as a positive multiplier for the standard deviation-based heuristic T2 distance";
    }

    /**
     * Get the T2 distance to use in the reduce phase
     *
     * @return the T2 distance to use in the reduce phase
     */
    public String getT2ReducePhase() {
        return m_aggregationT2;
    }

    /**
     * Set the T2 distance to use in the reduce phase
     *
     * @param t2 the T2 distance to use in the reduce phase
     */
    public void setT2ReducePhase(String t2) {
        m_aggregationT2 = t2;
    }

    /**
     * Tip text for this property
     *
     * @return the tip text for this property
     */
    public String assignCanopiesToTrainingDataTipText() {
        return "Assign canopies to each training instance once clustering is complete";
    }

    /**
     * Get whether to assign canopies to the training data
     *
     * @return true if the canopies found are to be assigned to the training data
     *         (thus creating a new RDD)
     */
    public boolean getAssignCanopiesToTrainingData() {
        return m_assignCanopiesToTrainingData;
    }

    /**
     * Set whether to assign canopies to the training data
     *
     * @param assign true if the canopies found are to be assigned to the training
     *          data (thus creating a new RDD)
     */
    public void setAssignCanopiesToTrainingData(boolean assign) {
        m_assignCanopiesToTrainingData = assign;
    }

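    /**
     * Get a CanopyMapTask configured with this job's canopy map task options
     *
     * @return a configured CanopyMapTask
     * @throws DistributedWekaException if the options cannot be applied
     */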
    protected CanopyMapTask getConfiguredMapTask() throws DistributedWekaException {
        CanopyMapTask task = new CanopyMapTask();

        if (!DistributedJobConfig.isEmpty(getCanopyMapTaskOptions())) {
            try {
                System.err.println(getCanopyMapTaskOptions());
                task.setOptions(Utils.splitOptions(environmentSubstitute(getCanopyMapTaskOptions())));
            } catch (Exception e) {
                logMessage(e);
                throw new DistributedWekaException(e);
            }
        }

        return task;
    }

    /**
     * The reduce phase of distributed canopy clustering
     *
     * @param canopies the list of individual canopy clusterers learned in the map
     *          phase
     * @param headerWithSummary the header of the data, with summary attributes
     * @return a single canopy clusterer that encapsulates the final set of canopies
     * @throws DistributedWekaException if a problem occurs
     */
    protected Clusterer reduceCanopies(List<Clusterer> canopies, Instances headerWithSummary)
            throws DistributedWekaException {

        int numCanopies = Integer.parseInt(environmentSubstitute(getMaxNumCanopiesReducePhase()));
        double aggT1 = Double.parseDouble(environmentSubstitute(m_aggregationT1));
        double aggT2 = Double.parseDouble(environmentSubstitute(m_aggregationT2));

        CanopyReduceTask task = new CanopyReduceTask();
        task.setMaxFinalNumCanopies(numCanopies);
        task.setAggregationT1(aggT1);
        task.setAggregationT2(aggT2);

        Clusterer result = task.reduceCanopies(canopies, headerWithSummary);
        System.out.println(result);
        return result;
    }

    /**
     * Assigns canopy membership to the instances in a dataset. Creates a new
     * RDD[InstanceWithCanopyAssignments].
     *
     * @param dataset the dataset to process
     * @param headerNoSummary the header of the data (sans summary attributes)
     * @throws Exception if a problem occurs
     */
    protected void assignCanopiesToDataset(JavaRDD<Instance> dataset, final Instances headerNoSummary)
            throws Exception {

        Canopy canopy = (Canopy) ((m_finalClusterer instanceof Canopy) ? m_finalClusterer
                : (Canopy) ((FilteredClusterer) m_finalClusterer).getClusterer());

        Filter preprocess = (m_finalClusterer instanceof FilteredClusterer)
                ? ((FilteredClusterer) m_finalClusterer).getFilter()
                : null;

        final CanopyAssigner canopyAssigner = new CanopyAssigner(headerNoSummary, getCSVMapTaskOptions(), canopy,
                preprocess);

        JavaRDD<InstanceWithCanopyAssignments> canopiesAssignedMapResults = dataset
                .mapPartitions(new FlatMapFunction<Iterator<Instance>, InstanceWithCanopyAssignments>() {

                    /**
                     * For serialization
                     */
                    private static final long serialVersionUID = -1457310019397486011L;
                    protected List<InstanceWithCanopyAssignments> m_results = new ArrayList<InstanceWithCanopyAssignments>();

                    @Override
                    public Iterable<InstanceWithCanopyAssignments> call(Iterator<Instance> split)
                            throws IOException, DistributedWekaException {

                        while (split.hasNext()) {
                            Instance current = split.next();

                            InstanceWithCanopyAssignments holder = canopyAssigner.process(current);

                            m_results.add(holder);
                        }

                        return m_results;
                    }
                }, true);

        m_canopiesAssigned = canopiesAssignedMapResults.persist(getCachingStrategy().getStorageLevel());
    }

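    /**
     * Build the clusterer: the map phase learns a canopy clusterer on each
     * partition of the dataset, and the reduce phase aggregates the collected
     * clusterers into the final model. Optionally assigns the resulting
     * canopies to the training data.
     *
     * @param dataset the dataset to train on
     * @param headerWithSummary the header of the data, with summary attributes
     * @throws Exception if a problem occurs
     */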
    protected void buildClusterer(JavaRDD<Instance> dataset, final Instances headerWithSummary) throws Exception {

        final Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);

        final CanopyMapTask canopyMapTask = getConfiguredMapTask();
        final CanopyBuilder canopyBuilder = new CanopyBuilder(headerWithSummary, headerNoSummary, canopyMapTask,
                getCanopyMapTaskOptions());

        // map phase
        JavaRDD<Clusterer> canopyMap = dataset.mapPartitions(new FlatMapFunction<Iterator<Instance>, Clusterer>() {

            /** for serialization */
            private static final long serialVersionUID = -8219560148988983518L;

            // protected Instances m_header = new Instances(headerNoSummary, 0);
            protected List<Clusterer> m_results = new ArrayList<Clusterer>();

            // protected CSVToARFFHeaderMapTask m_rowHelper;

            @Override
            public Iterable<Clusterer> call(Iterator<Instance> split) throws IOException, DistributedWekaException {

                while (split.hasNext()) {
                    Instance currentI = split.next();
                    canopyBuilder.process(currentI);
                }

                canopyBuilder.finishedInput();
                m_results.add(canopyBuilder.getFinalizedClusterer());

                return m_results;
            }
        });

        // reduce
        List<Clusterer> canopies = canopyMap.collect();
        canopyMap.unpersist();
        canopyMap = null;
        m_finalClusterer = reduceCanopies(canopies, headerWithSummary);

        if (getAssignCanopiesToTrainingData()) {
            assignCanopiesToDataset(dataset, headerNoSummary);
        }

        dataset.unpersist();
    }

    @SuppressWarnings("unchecked")
    @Override
    public boolean runJobWithContext(JavaSparkContext sparkContext) throws IOException, DistributedWekaException {

        m_currentContext = sparkContext;
        setJobStatus(JobStatus.RUNNING);
        boolean success = true;

        if (m_env == null) {
            m_env = Environment.getSystemWide();
        }

        JavaRDD<Instance> dataSet = null;
        Instances headerWithSummary = null;
        if (getDataset(TRAINING_DATA) != null) {
            dataSet = ((Dataset<Instance>) getDataset(TRAINING_DATA)).getDataset();
            headerWithSummary = getDataset(TRAINING_DATA).getHeaderWithSummary();
            logMessage("RDD<Instance> dataset provided: " + dataSet.partitions().size() + " partitions.");
        }

        if (dataSet == null && headerWithSummary == null) {
            logMessage("Invoking ARFF Job...");
            m_arffHeaderJob.setEnvironment(m_env);
            m_arffHeaderJob.setLog(getLog());
            m_arffHeaderJob.setStatusMessagePrefix(m_statusMessagePrefix);
            m_arffHeaderJob.setCachingStrategy(getCachingStrategy());

            // no header was supplied, so the ARFF header job is necessary
            success = m_arffHeaderJob.runJobWithContext(sparkContext);

            if (!success) {
                setJobStatus(JobStatus.FAILED);
                statusMessage("Unable to continue - creating the ARFF header failed!");
                logMessage("Unable to continue - creating the ARFF header failed!");
                return false;
            }

            Dataset<Instance> d = (Dataset<Instance>) m_arffHeaderJob.getDataset(TRAINING_DATA);
            headerWithSummary = d.getHeaderWithSummary();
            dataSet = d.getDataset();
            setDataset(TRAINING_DATA, new Dataset<Instance>(dataSet, headerWithSummary));
            logMessage("Fetching RDD<Instance> dataset from ARFF job: " + dataSet.partitions().size()
                    + " partitions.");
        }

        Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);

        // Make sure that we save out to a subdirectory of the output
        // directory
        String outputPath = environmentSubstitute(m_sjConfig.getOutputDir());
        outputPath = addSubdirToPath(outputPath, OUTPUT_SUBDIR);
        String outputModel = addSubdirToPath(outputPath, "model");
        String outputCanopyAssignments = addSubdirToPath(outputPath, "canopyAssignments");

        // clean the output directory
        SparkJob.deleteDirectory(outputPath);

        // serialized input is assumed to already be randomized...
        if (getRandomizeAndStratify() /* && !getSerializedInput() */) {
            m_randomizeSparkJob.setDefaultToLastAttIfClassNotSpecified(false);
            m_randomizeSparkJob.setEnvironment(m_env);
            m_randomizeSparkJob.setLog(getLog());
            m_randomizeSparkJob.setStatusMessagePrefix(m_statusMessagePrefix);
            m_randomizeSparkJob.setCachingStrategy(getCachingStrategy());
            m_randomizeSparkJob.setDataset(TRAINING_DATA, new Dataset<Instance>(dataSet, headerWithSummary));

            if (!m_randomizeSparkJob.runJobWithContext(sparkContext)) {
                statusMessage("Unable to continue - randomization/stratification of input data failed!");
                logMessage("Unable to continue - randomization/stratification of input data failed!");
                return false;
            }

            logMessage("Runing Canopy job...");

            Dataset<Instance> d = (Dataset<Instance>) m_randomizeSparkJob.getDataset(TRAINING_DATA);
            dataSet = d.getDataset();
            headerWithSummary = d.getHeaderWithSummary();
            setDataset(TRAINING_DATA, new Dataset<Instance>(dataSet, headerWithSummary));
        }

        try {
            buildClusterer(dataSet, headerWithSummary);
            // m_dataSet = randomized;
        } catch (Exception e) {
            logMessage(e);
            throw new DistributedWekaException(e);
        }

        // a URI (e.g. hdfs://host/...) gets "/"; a local path gets File.separator
        outputModel += (outputModel.toLowerCase().contains("://") ? "/" : File.separator) + getModelFileName();

        if (m_finalClusterer != null) {
            WekaClassifierSparkJob.writeModelToDestination(m_finalClusterer, headerNoSummary, outputModel);
        }

        if (m_canopiesAssigned != null) {
            // save the canopy assignments as a text file
            m_canopiesAssigned.saveAsTextFile(outputCanopyAssignments);
        }

        setJobStatus(JobStatus.FINISHED);
        return true;
    }

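    /**
     * Get the training data with canopy assignments. Only available if the job
     * was run with canopy assignment turned on.
     *
     * @return the RDD of instances with canopy assignments
     * @throws DistributedWekaException if no assigned data is available
     */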
    public JavaRDD<InstanceWithCanopyAssignments> getDataWithCanopiesAssigned() throws DistributedWekaException {
        if (m_canopiesAssigned == null) {
            throw new DistributedWekaException("No data with canopies assigned is available. Did you turn"
                    + "on the option to assign canopies to the data?");
        }
        return m_canopiesAssigned.persist(StorageLevel.MEMORY_AND_DISK());
    }

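    /**
     * Write canopy assignments out to the file system
     *
     * @param outputPath the path to write to
     * @param assignments the canopy assignments to write
     * @throws IOException if a problem occurs
     */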
    protected void writeCanopyAssignments(String outputPath, JavaPairRDD<Integer, ?> assignments)
            throws IOException {
        String[] pathOnly = new String[1];
        Configuration conf = getFSConfigurationForPath(outputPath, pathOnly);

        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(pathOnly[0]);
        p = p.makeQualified(fs);

        assignments.saveAsNewAPIHadoopFile(pathOnly[0], NullWritable.class, Text.class,
                SparkJob.NoKeyTextOutputFormat.class, conf);
    }

    @Override
    public void run(Object toRun, String[] options) throws IllegalArgumentException {
        if (!(toRun instanceof CanopyClustererSparkJob)) {
            throw new IllegalArgumentException("Object to run is not a CanopyClustererSparkJob!");
        }

        try {
            CanopyClustererSparkJob wcsj = (CanopyClustererSparkJob) toRun;
            if (Utils.getFlag('h', options)) {
                String help = DistributedJob.makeOptionsStr(wcsj);
                System.err.println(help);
                System.exit(1);
            }

            wcsj.setOptions(options);
            wcsj.runJob();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

}
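
Example usage

A minimal sketch of how this job might be launched programmatically. The option names come directly from setOptions() above; the class name CanopyJobSketch and the option values are placeholder assumptions, and the input, output and Spark configuration options handled by the superclass are omitted here and would need to be supplied for a real run.

import weka.distributed.spark.CanopyClustererSparkJob;

public class CanopyJobSketch {

    public static void main(String[] args) throws Exception {
        CanopyClustererSparkJob job = new CanopyClustererSparkJob();

        // Options recognised by this job's setOptions(); values are placeholders
        job.setOptions(new String[] {
            "-model-file-name", "canopy.model", // written to output-path/canopy/model/
            "-max-reduce-canopies", "4",        // cap on canopies from the reduce phase
            "-t2-reduce", "-0.5",               // < 0 => multiplier for the T2 heuristic
            "-t1-reduce", "-1.5",               // < 0 => multiplier for T2
            "-assign-canopies",                 // also build the assignments RDD
            "-randomize"                        // shuffle (and stratify) the input first
        });

        // as in run() above, runJob() executes the configured job
        job.runJob();
    }
}

Alternatively, the run() method above is the command-line entry point: passing -h prints the full option list assembled by listOptions().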