Java tutorial: KMeansClustererSparkJob (distributed k-means clustering with Weka on Apache Spark)
/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * KMeansClustererSparkJob * Copyright (C) 2014 University of Waikato, Hamilton, New Zealand * */ package weka.distributed.spark; import distributed.core.DistributedJob; import distributed.core.DistributedJobConfig; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.storage.StorageLevel; import scala.Tuple2; import weka.clusterers.CentroidSketch; import weka.clusterers.Clusterer; import weka.clusterers.PreconstructedFilteredClusterer; import weka.clusterers.PreconstructedKMeans; import weka.clusterers.SimpleKMeans; import weka.core.Attribute; import weka.core.CommandlineRunnable; import weka.core.Environment; import weka.core.EuclideanDistance; import weka.core.Instance; import weka.core.Instances; import weka.core.NormalizableDistance; import weka.core.Option; import weka.core.OptionHandler; import weka.core.SelectedTag; import weka.core.Utils; import weka.core.stats.ArffSummaryNumericMetric; import weka.core.stats.NominalStats; import weka.distributed.CSVToARFFHeaderMapTask; import weka.distributed.CSVToARFFHeaderReduceTask; import weka.distributed.DistributedWekaException; import weka.distributed.KMeansMapTask; import weka.distributed.KMeansReduceTask; import weka.filters.Filter; import weka.gui.beans.ClustererProducer; import weka.gui.beans.TextProducer; import java.io.BufferedOutputStream; import java.io.File; import java.io.IOException; import java.io.ObjectOutputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.Enumeration; import java.util.Iterator; import java.util.List; import java.util.Vector; /** * Spark job for training a k-means clusterer with or without the k-means++ * (kmeans||) initialization procedure * * @author Mark Hall (mhall{[at]}pentaho{[dot]}com) * @version $Revision$ */ public class KMeansClustererSparkJob extends SparkJob implements CommandlineRunnable, TextProducer, ClustererProducer, OptionHandler { public static final String K_MEANS_MODEL = "k-means-model"; /** * The subdirectory of the output directory that this job saves its results to */ protected static final String OUTPUT_SUBDIR = "kmeans"; /** For serialization */ private static final long serialVersionUID = -7983704737099141085L; /** Name of the output model */ protected String m_modelName = "outputModel.model"; /** ARFF header job */ protected ArffHeaderSparkJob m_arffHeaderJob = new ArffHeaderSparkJob(); /** Options for the ARFF header job */ protected String m_wekaCsvToArffMapTaskOpts = ""; /** * True if the data is to be randomly shuffled */ protected boolean m_randomize; /** Options for the randomize/stratify job */ protected String m_randomizeJobOpts = ""; /** Options for the k-means map task */ protected String m_kMeansMapTaskOpts = ""; /** Randomize and stratify 
job */ protected RandomizedDataSparkJob m_randomizeSparkJob = new RandomizedDataSparkJob(); /** Maximum number of iterations to run */ protected String m_numIterations = "20"; /** Perform 10 runs of k-means in parallel */ protected String m_numRuns = "1"; /** Number of clusters to find */ protected String m_numClusters = "2"; /** Seed for the random number generator */ protected String m_randomSeed = "1"; /** Number of iterations for the k-means|| initialization */ protected String m_kMeansParallelInitSteps = "5"; /** Close enough to have converged? */ protected double m_convergenceTolerance = 1e-4; /** * Whether to display standard deviations of centroids in textual output of * final model */ protected boolean m_displayStdDevs; /** * Holds priming data for distance function (if k-means|| initialization is * run) */ protected Instances m_distanceFunctionPrimingData; /** Initialize with the k-means parallel routine? */ protected boolean m_initializeWithRandomCenters; /** Holds the final clustering model */ protected Clusterer m_finalClusterer; /** The header (sans summary attributes) used to train the clusterer with */ protected Instances m_trainingHeader; public KMeansClustererSparkJob() { super("k-means clusterer job", "Build a k-means clusterer"); } public static void main(String[] args) { KMeansClustererSparkJob kcsj = new KMeansClustererSparkJob(); kcsj.run(kcsj, args); } /** * Help information * * @return the help information for this job */ public String globalInfo() { return "Learns a k-means clustering using either standard random initialization " + "or k-means|| initialization"; } /** * Tip text for this property * * @return the tip text for this property */ public String initWithRandomCentroidsTipText() { return "Initialize with randomly selected centroids rather than use the " + "k-means|| initialization procedure"; } /** * Get whether to initialize with randomly selected centroids rather than * using the k-means|| initialization procedure. * * @return true if randomly selected initial centroids are to be used */ public boolean getInitWithRandomCentroids() { return m_initializeWithRandomCenters; } /** * Set whether to initialize with randomly selected centroids rather than * using the k-means|| initialization procedure. 
* * @param init true if randomly selected initial centroids are to be used */ public void setInitWithRandomCentroids(boolean init) { m_initializeWithRandomCenters = init; } /** * Tip text for this property * * @return the tip text for this property */ public String convergenceToleranceTipText() { return "Tolerance for convergence"; } /** * Get the convergence tolerance * * @return the convergence tolerance */ public double getConvergenceTolerance() { return m_convergenceTolerance; } /** * Set the convergence tolerance * * @param tol the convergence tolerance */ public void setConvergenceTolerance(double tol) { m_convergenceTolerance = tol; } /** * Tip text for this property * * @return tip text for this property */ public String modelFileNameTipText() { return "The name only (not full path) that the model should be saved to"; } /** * Get the name only for the model file * * @return the name only (not full path) that the model should be saved to */ public String getModelFileName() { return m_modelName; } /** * Set the name only for the model file * * @param m the name only (not full path) that the model should be saved to */ public void setModelFileName(String m) { m_modelName = m; } /** * Tip text for this property * * @return the tip text for this property */ public String randomlyShuffleDataTipText() { return "Randomly shuffle the order of the input data"; } /** * Get whether to randomly shuffle the order of the instances in the input * data before clustering * * @return true if the data should be randomly shuffled */ public boolean getRandomlyShuffleData() { return m_randomize; } /** * Set whether to randomly shuffle the order of the instances in the input * data before clustering * * @param r true if the data should be randomly shuffled */ public void setRandomlyShuffleData(boolean r) { m_randomize = r; } /** * Tip text for this property. * * @return the tip text for this property */ public String numClustersTipText() { return "The number of clusters to find"; } /** * Get the number of clusters to find * * @return the number of clusters to find */ public String getNumClusters() { return m_numClusters; } /** * Set the number of clusters to find * * @param numClusters the number of clusters to find */ public void setNumClusters(String numClusters) { m_numClusters = numClusters; } /** * Tip text for this property. * * @return the tip text for this property */ public String numRunsTipText() { return "The number of k-means runs to perform in parallel (best run is selected as final model)"; } /** * Get the number of k-means runs to perform in parallel * * @return the number of k-means runs to perform in parallel */ public String getNumRuns() { return m_numRuns; } /** * Set the number of k-means runs to perform in parallel * * @param numRuns the number of k-means runs to perform in parallel */ public void setNumRuns(String numRuns) { m_numRuns = numRuns; } /** * Tip text for this property. * * @return the tip text for this property */ public String numIterationsTipText() { return "The maximum number of k-means iterations to perform"; } /** * Get the maximum number of k-means iterations to perform * * @return the maximum number of iterations to perform */ public String getNumIterations() { return m_numIterations; } /** * Set the maximum number of k-means iterations to perform * * @param numIts the maximum number of iterations to perform */ public void setNumIterations(String numIts) { m_numIterations = numIts; } /** * Tip text for this property. 
* * @return the tip text for this property */ public String randomSeedTipText() { return "Seed for random number generation"; } /** * Get the seed for random number generation * * @return the seed for the random number generator */ public String getRandomSeed() { return m_randomSeed; } /** * Set the seed for random number generation * * @param seed the seed for the random number generator */ public void setRandomSeed(String seed) { m_randomSeed = seed; } public String getKMeansMapTaskOpts() { return m_kMeansMapTaskOpts; } public void setKMeansMapTaskOpts(String opts) { m_kMeansMapTaskOpts = opts; } /** * Tip text for this property. * * @return the tip text for this property */ public String kMeansParallelInitStepsTipText() { return "The number of iterations of the k-means|| initialization routine to perform. " + "Only applies if initialization with random centroids has not been turned on."; } /** * Get the number of iterations of the k-means|| initialization routine to * perform * * @return the number of iterations of the k-means|| init routine to perform */ public String getKMeansParallelInitSteps() { return m_kMeansParallelInitSteps; } /** * Set the number of iterations of the k-means|| initialization routine to * perform * * @param steps the number of iterations of the k-means|| init routine to * perform */ public void setKMeansParallelInitSteps(String steps) { m_kMeansParallelInitSteps = steps; } /** * Tip text for this property * * @return the tip text for this property */ public String displayCentroidStdDevsTipText() { return "Display centroid standard deviations in textual output of model"; } /** * Get whether to display the standard deviations of centroids in textual * output of the model * * @return true if standard deviations are to be displayed */ public boolean getDisplayCentroidStdDevs() { return m_displayStdDevs; } /** * Set whether to display the standard deviations of centroids in textual * output of the model * * @param d true if standard deviations are to be displayed */ public void setDisplayCentroidStdDevs(boolean d) { m_displayStdDevs = d; } /** * Set the options for the randomize/stratify task * * @param opts the options for the randomize task */ public void setRandomizeJobOptions(String opts) { m_randomizeJobOpts = opts; } /** * Get the options for the randomize/stratify task * * @return the options for the randomize task */ public String getRandomizedJobOptions() { return m_randomizeJobOpts; } /** * Get the options to the header job * * @return options to the header job */ public String getCSVMapTaskOptions() { return m_wekaCsvToArffMapTaskOpts; } /** * Set the options to the header job * * @param opts options to the header job */ public void setCSVMapTaskOptions(String opts) { m_wekaCsvToArffMapTaskOpts = opts; } @Override public Enumeration<Option> listOptions() { Vector<Option> result = new Vector<Option>(); result.add(new Option("\tCreate data splits with the order of the input instances\n\t" + "shuffled randomly. Also stratifies the data if the class\n\t" + "is nominal. Works in conjunction with -min-slices; can\n\t" + "alternatively use -num-instances-per-slice.", "randomize", 0, "-randomize")); result.add(new Option( "\tName of output model file. 
Model will be\n\t" + "written to output-path/k-means/model/<model name>", "model-file-name", 1, "-model-file-name <model-name>")); result.add(new Option("\tNumber of clusters to find (default = 2)", "num-clusters", 1, "-num-clusters <integer>")); result.add(new Option("\tMax number of k-means iterations (default = 20)", "num-iterations", 1, "-num-iterations <integer>")); result.add(new Option( "\tNumber of separately initialized runs of k-means to\n\t" + "perform in parallel (default = 1)", "num-runs", 1, "-num-runs <integer>")); result.add( new Option("\tTolerance for convergence (default = 1e-4)", "tolerance", 1, "-tolerance <double>")); result.add(new Option("\tInitialize with randomly selected centroids instead\n\t" + "of running k-means|| initialization.", "init-random", 0, "-init-random")); result.add(new Option("\tThe number of k-means|| initialization iterations to perform\n\t" + "if initializing with k-means||. (default = 5).", "init-kmeans-its", 1, "-init-kmeans-its <integer>")); result.add(new Option("\tDisplay std. deviations for centroids", "V", 0, "-V")); result.add(new Option("\tRandom seed (default 1).", "seed", 1, "-seed <integer>")); KMeansMapTask tempMapTask = new KMeansMapTask(); Enumeration<Option> mapOpts = tempMapTask.listOptions(); while (mapOpts.hasMoreElements()) { result.add(mapOpts.nextElement()); } result.add(new Option("", "", 0, "\nOptions specific to data randomization/stratification:")); RandomizedDataSparkJob tempRJob = new RandomizedDataSparkJob(); Enumeration<Option> randOpts = tempRJob.listOptions(); while (randOpts.hasMoreElements()) { result.add(randOpts.nextElement()); } return result.elements(); } /** * Get the options for this job only * * @return the options for this job only */ public String[] getJobOptionsOnly() { List<String> options = new ArrayList<String>(); options.add("-model-file-name"); options.add(getModelFileName()); if (getRandomlyShuffleData()) { options.add("-randomize"); } if (getInitWithRandomCentroids()) { options.add("-init-random"); } if (getDisplayCentroidStdDevs()) { options.add("-V"); } options.add("-num-clusters"); options.add(getNumClusters()); options.add("-num-iterations"); options.add(getNumIterations()); options.add("-num-runs"); options.add(getNumRuns()); options.add("-init-kmeans-its"); options.add(getKMeansParallelInitSteps()); options.add("-seed"); options.add(getRandomSeed()); options.add("-tolerance"); options.add("" + getConvergenceTolerance()); return options.toArray(new String[options.size()]); } @Override public String[] getOptions() { List<String> options = new ArrayList<String>(); for (String opt : getJobOptionsOnly()) { options.add(opt); } if (!DistributedJobConfig.isEmpty(getKMeansMapTaskOpts())) { try { String[] kMeansOpts = Utils.splitOptions(getKMeansMapTaskOpts()); for (String s : kMeansOpts) { options.add(s); } } catch (Exception ex) { ex.printStackTrace(); } } if (!DistributedJobConfig.isEmpty(getCSVMapTaskOptions())) { try { String[] csvOpts = Utils.splitOptions(getCSVMapTaskOptions()); for (String s : csvOpts) { options.add(s); } } catch (Exception e) { e.printStackTrace(); } } if (!DistributedJobConfig.isEmpty(getRandomizedJobOptions())) { try { String[] csvOpts = Utils.splitOptions(getRandomizedJobOptions()); for (String s : csvOpts) { options.add(s); } } catch (Exception e) { e.printStackTrace(); } } return options.toArray(new String[options.size()]); } @Override public void setOptions(String[] options) throws Exception { String modelFileName = Utils.getOption("model-file-name", options); if 
(!DistributedJobConfig.isEmpty(modelFileName)) { setModelFileName(modelFileName); } setRandomlyShuffleData(Utils.getFlag("randomize", options)); setInitWithRandomCentroids(Utils.getFlag("init-random", options)); String temp = Utils.getOption("num-clusters", options); if (!DistributedJobConfig.isEmpty(temp)) { setNumClusters(temp); } temp = Utils.getOption("num-iterations", options); if (!DistributedJobConfig.isEmpty(temp)) { setNumIterations(temp); } temp = Utils.getOption("num-runs", options); if (!DistributedJobConfig.isEmpty(temp)) { setNumRuns(temp); } temp = Utils.getOption("init-kmeans-its", options); if (!DistributedJobConfig.isEmpty(temp)) { setKMeansParallelInitSteps(temp); } temp = Utils.getOption("seed", options); if (!DistributedJobConfig.isEmpty(temp)) { setRandomSeed(temp); } temp = Utils.getOption("tolerance", options); if (!DistributedJobConfig.isEmpty(temp)) { setConvergenceTolerance(Double.parseDouble(temp)); } setDisplayCentroidStdDevs(Utils.getFlag('V', options)); KMeansMapTask tempKTask = new KMeansMapTask(); tempKTask.setOptions(options); String mapOpts = Utils.joinOptions(tempKTask.getOptions()); setKMeansMapTaskOpts(mapOpts); String[] optionsCopy = options.clone(); super.setOptions(options); // options for the randomize job m_randomizeSparkJob.setOptions(optionsCopy.clone()); String optsToRandomize = Utils.joinOptions(m_randomizeSparkJob.getOptions()); if (!DistributedJobConfig.isEmpty(optsToRandomize)) { setRandomizeJobOptions(optsToRandomize); } // options for the ARFF header job m_arffHeaderJob.setOptions(optionsCopy); String optsToCSVTask = Utils.joinOptions(m_arffHeaderJob.getOptions()); if (!DistributedJobConfig.isEmpty(optsToCSVTask)) { setCSVMapTaskOptions(optsToCSVTask); } } /** * Perform an iteration of k-means * * @param dataset the dataset to operate on * @param mapTasks the underlying map tasks to use - one for each separate run * of k-means that we're doing in parallel * @param converged array indicating which runs have converged * @param iterationNum the iteration number that we're up to * @param transformedHeaderNoSummary the header of the training data (sans * summary attributes) * @return a list of KMeansReduceTasks encapsulating the results of the * iteration for each active run of k-means * @throws DistributedWekaException if a problem occurs */ protected List<Tuple2<Integer, KMeansReduceTask>> performKMeansIteration(JavaRDD<Instance> dataset, final KMeansMapTask[] mapTasks, final boolean[] converged, final int iterationNum, final Instances transformedHeaderNoSummary) throws DistributedWekaException { final int numRuns = mapTasks.length; // keyed by run, a list of partial centroid summary instances // - one Instances object for each centroid (may be null if a // given centroid did not get any instances assigned to it) JavaPairRDD<Integer, List<Instances>> mapRuns = dataset .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, List<Instances>>() { /** * For serialization */ private static final long serialVersionUID = 6063661312796545915L; protected List<Tuple2<Integer, List<Instances>>> m_centroidStatsForRuns = new ArrayList<Tuple2<Integer, List<Instances>>>(); @Override public Iterable<Tuple2<Integer, List<Instances>>> call(Iterator<Instance> split) throws DistributedWekaException { while (split.hasNext()) { Instance current = split.next(); for (int k = 0; k < numRuns; k++) { if (!converged[k]) { mapTasks[k].processInstance(current); } } } for (int k = 0; k < numRuns; k++) { if (!converged[k]) { List<Instances> 
centroidStatsForRun = mapTasks[k].getCentroidStats(); m_centroidStatsForRuns .add(new Tuple2<Integer, List<Instances>>(k, centroidStatsForRun)); } } return m_centroidStatsForRuns; } }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns)) .persist(StorageLevel.MEMORY_AND_DISK()); mapRuns.count(); // Reduce. Need to aggregate all the cluster stats // for each run. Do we repartition into numRuns partitions and then // run another mapPartitions phase? With our custom partitioner this // should guarantee that a partition only contains the lists of instances // for one run. Can't use partitionByKey because CSVReduce is not // associative, and needs to see the whole list of summary instances // objects for one run, cluster# (need to run a separate reduce for // each cluster centroid within each run anyway). Then update the // final error for each centroid in each run and the total error // (sum of errors over centroids for a run) JavaPairRDD<Integer, KMeansReduceTask> reducedByRun = mapRuns.mapPartitionsToPair( new PairFlatMapFunction<Iterator<Tuple2<Integer, List<Instances>>>, Integer, KMeansReduceTask>() { /** * For serialization */ private static final long serialVersionUID = -747645603149767637L; protected List<Tuple2<Integer, KMeansReduceTask>> m_resultsForRun = new ArrayList<Tuple2<Integer, KMeansReduceTask>>(); @Override public Iterable<Tuple2<Integer, KMeansReduceTask>> call( Iterator<Tuple2<Integer, List<Instances>>> split) throws DistributedWekaException { List<List<Instances>> partialsForRun = new ArrayList<List<Instances>>(); int runNumber = -1; while (split.hasNext()) { Tuple2<Integer, List<Instances>> partial = split.next(); if (runNumber < 0) { runNumber = partial._1().intValue(); } else { if (partial._1().intValue() != runNumber) { throw new DistributedWekaException("[k-means] reduce phase: " + "was not expecting the run number to change within a " + "partition!"); } } partialsForRun.add(partial._2()); } KMeansReduceTask reducer = new KMeansReduceTask(); // size might be zero if we are operating on a partition for a // run that has already converged (in which case there will be no // data in this partition)... if (partialsForRun.size() > 0) { reducer.reduceClusters(runNumber, iterationNum, transformedHeaderNoSummary, partialsForRun); m_resultsForRun.add(new Tuple2<Integer, KMeansReduceTask>(runNumber, reducer)); } return m_resultsForRun; } }); List<Tuple2<Integer, KMeansReduceTask>> runResults = reducedByRun.collect(); mapRuns.unpersist(); reducedByRun.unpersist(); return runResults; } /** * Performs the k-means iterations for all runs in parallel * * @param dataset the dataset to find clusters on * @param headerWithSummary the header of the training data (including summary * attributes) * @param numIterations the maximum number of iterations to perform * @param numRuns the number of separate runs of k-means to perform in * parallel. 
The run with the smallest within cluster error becomes * the final model * @param numClusters the number of clusters to find * @return the final clusterer * @throws IOException if a problem occurs * @throws DistributedWekaException if a problem occurs */ protected Clusterer buildClusterer(JavaRDD<Instance> dataset, Instances headerWithSummary, int numIterations, final int numRuns, final int numClusters) throws IOException, DistributedWekaException { final Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary); m_trainingHeader = headerNoSummary; Instances tmpTrans = null; // one configured task per run final KMeansMapTask[] mapTasks = new KMeansMapTask[numRuns]; for (int i = 0; i < numRuns; i++) { mapTasks[i] = new KMeansMapTask(); try { mapTasks[i].setOptions(Utils.splitOptions(getKMeansMapTaskOpts())); } catch (Exception e) { throw new DistributedWekaException(e); } tmpTrans = mapTasks[i].init(headerWithSummary); } // header sans summary attributes after it has been through // any filters that the user has specified. This format // is needed by the KMeansReduceTask so that it can obtain // correct indexes for nominal values final Instances transformedHeaderNoSummary = tmpTrans; // initial centers List<Instances> centers = null; if (getInitWithRandomCentroids()) { centers = initializeWithRandomCenters(dataset, headerWithSummary, numRuns, numClusters); } else { centers = initializeWithKMeansParallel(dataset, headerWithSummary, numRuns, numClusters); } if (getDebug()) { for (int i = 0; i < numRuns; i++) { logMessage("[k-means] k-means" + (getInitWithRandomCentroids() ? "" : "||") + " start points Run " + i + ":\n" + centers.get(i)); } } final boolean[] converged = new boolean[numRuns]; double[] convergenceSqErr = new double[numRuns]; int[] numItsPerformed = new int[numRuns]; int numConverged = 0; // make a copy of the initial starting points List<Instances> initialCenters = new ArrayList<Instances>(centers); List<Tuple2<Integer, KMeansReduceTask>> runResults; KMeansReduceTask bestResult = null; int bestRunNum = -1; int finalNumIterations = -1; for (int i = 0; i < numIterations; i++) { final int iterationNum = i; // initialize each run's map task with it's respective current // cluster centers for (int j = 0; j < numRuns; j++) { mapTasks[j].setCentroids(centers.get(j)); } // perform an k-means iteration runResults = performKMeansIteration(dataset, mapTasks, converged, iterationNum, transformedHeaderNoSummary); // create new center list for (Tuple2<Integer, KMeansReduceTask> r : runResults) { int run = r._1().intValue(); if (converged[run]) { continue; } KMeansReduceTask runRes = r._2(); // if we've just finished the first iteration if (i == 0) { mapTasks[run].setDummyDistancePrimingData(runRes.getGlobalDistanceFunctionPrimingData()); } Instances newCentersForRun = runRes.getCentroidsForRun(); if (getDebug()) { logMessage("[k-means] centers for run " + run + " iteration: " + (i + 1) + "\n" + newCentersForRun); logMessage( "[k-means] Total within cluster error: " + runRes.getTotalWithinClustersError() + "\n"); } if (i < numIterations - 1) { // check for convergence - if we dropped a centroid (because it became // empty) then we'll check for convergence in the next iteration if (newCentersForRun.numInstances() == centers.get(run).numInstances()) { boolean changed = false; double totalDist = 0; for (int k = 0; k < newCentersForRun.numInstances(); k++) { double dist = mapTasks[run].distance(newCentersForRun.instance(k), centers.get(run).instance(k)); if 
(m_debug) { logMessage("[k-means] Run " + run + " convergence distance: " + dist); } totalDist += dist; if (dist > m_convergenceTolerance) { changed = true; } } if (!changed) { logMessage("[k-means] Run: " + run + " converged in " + (i + 1) + " iterations."); List<Instances> centroidSummaries = runRes.getAggregatedCentroidSummaries(); if (m_debug) { for (Instances sum : centroidSummaries) { System.err.println(sum); } } converged[run] = true; convergenceSqErr[run] = runRes.getTotalWithinClustersError(); numItsPerformed[run] = i + 1; numConverged++; if (bestResult == null) { bestResult = runRes; bestRunNum = run; finalNumIterations = bestResult.getIterationNumber(); } else { if (runRes.getTotalWithinClustersError() < bestResult .getTotalWithinClustersError()) { bestResult = runRes; bestRunNum = run; finalNumIterations = bestResult.getIterationNumber(); } } } else if (i > 2 && bestResult != null) { // try to stop slowly converging runs - that will probably // never beat the current best - from dragging the job out double remainingIts = numIterations - i; // TODO should probably keep a running average of the // improvement in squared error per run double projectedImprovement = remainingIts * totalDist; double currentSqErr = runRes.getTotalWithinClustersError(); if ((bestResult.getTotalWithinClustersError() + m_convergenceTolerance) < (currentSqErr - projectedImprovement)) { if (getDebug()) { logMessage("[k-means] aborting run " + run + " as its current within clust. error (" + currentSqErr + ") " + "is unlikely to beat the current best run (" + bestResult.getTotalWithinClustersError() + ") within " + remainingIts + " iterations"); } converged[run] = true; convergenceSqErr[run] = currentSqErr; numItsPerformed[run] = -(i + 1); numConverged++; } } } } centers.set(run, newCentersForRun); } // check for convergence of *all* remaining runs and break if (numConverged == numRuns || i == numIterations - 1) { // scan for best for (Tuple2<Integer, KMeansReduceTask> r : runResults) { int run = r._1().intValue(); KMeansReduceTask runRes = r._2(); if (bestResult == null) { bestResult = runRes; bestRunNum = run; finalNumIterations = bestResult.getIterationNumber(); } else { if (runRes.getTotalWithinClustersError() < bestResult.getTotalWithinClustersError()) { bestResult = runRes; bestRunNum = run; finalNumIterations = bestResult.getIterationNumber(); } } } break; } } Clusterer finalClusterer = makeFinalClusterer(bestResult, mapTasks[0].getPreprocessingFilters(), initialCenters.get(bestRunNum), finalNumIterations); System.err.println(finalClusterer); if (numRuns > 1) { for (int i = 0; i < numRuns; i++) { System.err.println("Run " + i + "" + (numItsPerformed[i] < 0 ? " halted after " + -numItsPerformed[i] : " converged after " + numItsPerformed[i]) + " iterations. Within cluster sum of sq. 
err: " + convergenceSqErr[i]); } } return finalClusterer; } /** * Write a clusterer to the output directory * * @param finalClusterer the cluster to write * @param header the header of the data (sans summary attributes) used to * train the clusterer * @param outputPath the output path to write to * @throws IOException if a problem occurs */ protected void writeClustererToDestination(Clusterer finalClusterer, Instances header, String outputPath) throws IOException { OutputStream os = openFileForWrite(outputPath); ObjectOutputStream oos = null; try { BufferedOutputStream bos = new BufferedOutputStream(os); oos = new ObjectOutputStream(bos); oos.writeObject(finalClusterer); if (header != null) { oos.writeObject(header); } } finally { if (oos != null) { oos.flush(); oos.close(); } } } /** * Constructs the final clusterer object once iteration has completed. This * will encapsulate the results from the best run of k-means. * * @param best the results of the best run of k-means * @param preprocess any preprocessing filter(s) in play * @param initialStartingPoints the initial starting points for the best run * of k-means * @param finalNumIterations the final number of iterations executed by the * best run of k-means * @return the final clusterer object * @throws DistributedWekaException if a problem occurs */ protected Clusterer makeFinalClusterer(KMeansReduceTask best, Filter preprocess, Instances initialStartingPoints, int finalNumIterations) throws DistributedWekaException { Clusterer finalClusterer = null; PreconstructedKMeans finalKMeans = new PreconstructedKMeans(); // global priming data for the distance function (this will be in // the transformed space if we're using preprocessing filters) Instances globalPrimingData = best.getGlobalDistanceFunctionPrimingData(); NormalizableDistance dist = new EuclideanDistance(); dist.setInstances(globalPrimingData); finalKMeans.setClusterCentroids(best.getCentroidsForRun()); finalKMeans.setFinalNumberOfIterations(finalNumIterations + 1); finalKMeans.setDisplayStdDevs(getDisplayCentroidStdDevs()); if (initialStartingPoints != null) { finalKMeans.setInitialStartingPoints(initialStartingPoints); } try { finalKMeans.setDistanceFunction(dist); finalKMeans.setClusterStats(best.getAggregatedCentroidSummaries()); } catch (Exception e) { logMessage(e); throw new DistributedWekaException(e); } if (!getInitWithRandomCentroids()) { finalKMeans.setInitializationMethod( new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION)); } finalClusterer = finalKMeans; if (preprocess != null) { PreconstructedFilteredClusterer fc = new PreconstructedFilteredClusterer(); fc.setFilter(preprocess); fc.setClusterer(finalKMeans); finalClusterer = fc; } return finalClusterer; } /** * Perform the k-means|| initialization process * * @param dataset the dataset to operate on * @param headerWithSummary the header of the data, with summary attributes * @param numRuns the number of separate runs of k-means to be performed in * parallel * @param numClusters the number of clusters to generate * @return a list of Instances objects, where each Instances object contains * the starting points for one run of k-means * @throws IOException if a problem occurs * @throws DistributedWekaException if a problem occurs */ protected List<Instances> initializeWithKMeansParallel(JavaRDD<Instance> dataset, Instances headerWithSummary, final int numRuns, int numClusters) throws IOException, DistributedWekaException { int numSteps = 
Integer.parseInt(environmentSubstitute(getKMeansParallelInitSteps())); // random seed option int randomSeed = 1; if (!DistributedJobConfig.isEmpty(getRandomSeed())) { try { randomSeed = Integer.parseInt(environmentSubstitute(getRandomSeed())); } catch (NumberFormatException ex) { // don't fuss } } // 1) start with 1 randomly chosen point for each run // 2) run sketch for x iterations (aggregating reservoirs for each // run at the end of each iteration (i.e. reservoirs for run 1 // on each split of the data, reservoirs for run 2, etc.) // 3) Get final sketch for each run // 4) Weight each point in each sketch by the number of points // in the data that cluster to it // 5) Run local KMeans on data weighted data to obtain final k // starting centers // Step 1: start with 1 randomly chosen point for each run List<Instances> randomSingleCenters = initializeWithRandomCenters(dataset, headerWithSummary, numRuns, 1); // Step 2: run sketch for x iterations (aggregating reservoirs for each // run at the end of each iteration (i.e. reservoirs for run 1 // on each split of the data, reservoirs for run 2, etc.) Instances tmpTrans = null; // one configured task per run (we'll use this for an initial distance // function and for step 4 where we need to cluster all the points to // get cluster sizes final KMeansMapTask[] mapTasks = new KMeansMapTask[numRuns]; for (int i = 0; i < numRuns; i++) { mapTasks[i] = new KMeansMapTask(); try { mapTasks[i].setOptions(Utils.splitOptions(getKMeansMapTaskOpts())); } catch (Exception e) { throw new DistributedWekaException(e); } tmpTrans = mapTasks[i].init(headerWithSummary); } // transformed header (has passed through filters) final Instances transformedHeaderNoSummary = tmpTrans; NormalizableDistance distanceFunc = mapTasks[0].getDistanceFunction(); final CentroidSketch[] sketches = new CentroidSketch[numRuns]; // initialize sketches for (int i = 0; i < numRuns; i++) { try { // apply any filters Instances transformedStartSketch = randomSingleCenters.get(i); // mapTasks[0].applyFilters(randomSingleCenters.get(i)); sketches[i] = new CentroidSketch(transformedStartSketch, distanceFunc, 2 * numClusters, randomSeed + i); } catch (Exception ex) { logMessage(ex); throw new DistributedWekaException(ex); } } // this is used when processing instances in partitions to // ensure that each instance from the data set gets // filtered appropriately final KMeansMapTask forFilteringOnly = mapTasks[0]; for (int i = 0; i < numSteps; i++) { logMessage("[k-means] Running iteration " + (i + 1) + " of k-means|| initialization procedure."); final int iterationNum = i; // keyed by run, a list of partial sketches // - one CentroidSketch object for each run in each partition JavaPairRDD<Integer, CentroidSketch> mapRuns = dataset .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, CentroidSketch>() { /** * For serialization */ private static final long serialVersionUID = 6063661312796545915L; protected List<Tuple2<Integer, CentroidSketch>> m_centroidSketchesForRuns = new ArrayList<Tuple2<Integer, CentroidSketch>>(); @Override public Iterable<Tuple2<Integer, CentroidSketch>> call(Iterator<Instance> split) throws DistributedWekaException { while (split.hasNext()) { Instance current = split.next(); try { // make sure it goes through any filters first! 
current = forFilteringOnly.applyFilters(current); } catch (Exception ex) { throw new DistributedWekaException(ex); } for (int k = 0; k < numRuns; k++) { sketches[k].process(current, iterationNum == 0); } } for (int k = 0; k < numRuns; k++) { m_centroidSketchesForRuns.add(new Tuple2<Integer, CentroidSketch>(k, sketches[k])); } return m_centroidSketchesForRuns; } }).sortByKey().partitionBy(new IntegerKeyPartitioner(numRuns)) .persist(StorageLevel.MEMORY_AND_DISK()); mapRuns.count(); // Each partion of mapRuns now contains partials for just one run. // Here we aggregate the partials per run JavaPairRDD<Integer, CentroidSketch> reducedByRun = mapRuns.mapPartitionsToPair( new PairFlatMapFunction<Iterator<Tuple2<Integer, CentroidSketch>>, Integer, CentroidSketch>() { /** For serialization */ private static final long serialVersionUID = 7689178383188695493L; protected List<Tuple2<Integer, CentroidSketch>> m_resultsForRun = new ArrayList<Tuple2<Integer, CentroidSketch>>(); @Override public Iterable<Tuple2<Integer, CentroidSketch>> call( Iterator<Tuple2<Integer, CentroidSketch>> split) throws DistributedWekaException { int runNumber = -1; CentroidSketch initial = null; List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>(); while (split.hasNext()) { Tuple2<Integer, CentroidSketch> partial = split.next(); if (runNumber < 0) { runNumber = partial._1().intValue(); } else { if (partial._1().intValue() != runNumber) { throw new DistributedWekaException("[k-means] k-means|| initialization: " + "was not expecting the run number to change within " + "a partition!"); } } if (initial == null) { initial = partial._2(); } else { try { initial.aggregateReservoir(partial._2().getReservoirSample()); } catch (Exception e) { throw new DistributedWekaException(e); } } // get all the distance functions and // compute priming data that has global // min and maxes. if (iterationNum == 0) { // only need to determine global distance function // priming data once (i.e. in the first iteration of // the k-means|| process) distsForRun.add(partial._2().getDistanceFunction()); } } // update the distance function with global numeric // attribute ranges if (distsForRun.size() > 0) { Instances distancePrimingData = KMeansReduceTask .computeDistancePrimingDataFromDistanceFunctions(distsForRun, transformedHeaderNoSummary); initial.getDistanceFunction().setInstances(distancePrimingData); } m_resultsForRun.add(new Tuple2<Integer, CentroidSketch>(runNumber, initial)); return m_resultsForRun; } }); List<Tuple2<Integer, CentroidSketch>> runResults = reducedByRun.collect(); mapRuns.unpersist(); mapRuns = null; for (Tuple2<Integer, CentroidSketch> r : runResults) { int runNum = r._1().intValue(); sketches[runNum] = r._2(); // add the current contents of the reservoir to the sketch // for each run try { sketches[runNum].addReservoirToCurrentSketch(); if (m_debug) { logMessage("[k-means] Iteration: " + i + " - number of instances in sketch: " + sketches[runNum].getCurrentSketch().numInstances() + "\n" + sketches[runNum].getCurrentSketch()); } } catch (Exception ex) { logMessage(ex); throw new DistributedWekaException(ex); } } reducedByRun.unpersist(); } // perform and aggregate clustering using the final sketch results // so that we can find out how many points are assigned to // each instance in the sketch. 
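// Sanity check (see the existing comment above about global min and maxes): during the first k-means|| pass each sketch's distance function was primed with a two-instance dataset holding the global per-attribute minimums and maximums, and the code below verifies that this priming data is present before the weighting step.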
Instances globalPriming = sketches[0].getDistanceFunction().getInstances(); if (globalPriming.numInstances() != 2) { logMessage("[k-means] Error: as expecting a two instance " + "(global priming data) dataset to be set in the distance function " + "in each sketch!"); throw new DistributedWekaException("Was expecting a two instance (global priming data)" + " dataset to be set in the distance function in each sketch!"); } for (int i = 0; i < numRuns; i++) { // set sketches as centers for map tasks // in preparation for clustering (so that we can) // find out how many training points get assigned to // each center mapTasks[i].setCentroids(sketches[i].getCurrentSketch()); mapTasks[i].setDummyDistancePrimingData(globalPriming); } // 3 & 4) Get final sketch for each run and weight each point in // the sketch by the number of training instances that cluster to it List<Tuple2<Integer, KMeansReduceTask>> clusterAssignments = performKMeansIteration(dataset, mapTasks, new boolean[numRuns], 1, transformedHeaderNoSummary); List<Instances> finalStartPointsForRuns = new ArrayList<Instances>(); for (int i = 0; i < numRuns; i++) { int rN = clusterAssignments.get(i)._1().intValue(); List<Instances> centroidSummaries = clusterAssignments.get(i)._2().getAggregatedCentroidSummaries(); Instances sketchForRun = sketches[i].getCurrentSketch(); // empty clusters shouldn't be a problem - in // one iteration each sketch member should at minimum // have itself assigned (i.e. count >= 1). NOTE: The only exception // could occur if the sketch contains duplicate instances. However, // this shouldn't happen within a single WeightedReservoirSampling // as candidate instances with weight 0 (i.e. distance 0 to the sketch // in this case) are never added to the sketch. if (centroidSummaries.size() != sketchForRun.numInstances()) { logMessage("[k-means] Error: was expecting as " + "many summary headers as \n" + "there are center candidates in the sketch for run " + rN); throw new DistributedWekaException("Was expecting as many summary headers as " + "there are center candidates in the sketch for run " + rN); } for (int j = 0; j < sketchForRun.numInstances(); j++) { Instance centerCandidate = sketchForRun.instance(j); Instances centerStats = centroidSummaries.get(j); double weightForCandidate = -1.0; // now grab the first summary attribute and get count for (int k = 0; k < sketchForRun.numAttributes(); k++) { if (sketchForRun.attribute(k).isNumeric()) { Attribute statsAtt = centerStats .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + sketchForRun.attribute(k).name()); weightForCandidate = ArffSummaryNumericMetric.COUNT.valueFromAttribute(statsAtt) + ArffSummaryNumericMetric.MISSING.valueFromAttribute(statsAtt); break; } else if (sketchForRun.attribute(k).isNominal()) { Attribute statsAtt = centerStats .attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + sketchForRun.attribute(k).name()); NominalStats ns = NominalStats.attributeToStats(statsAtt); weightForCandidate = 0; for (String s : ns.getLabels()) { weightForCandidate += ns.getCount(s); } weightForCandidate += ns.getNumMissing(); } } if (weightForCandidate < 0) { logMessage("[k-means] Error: unable to compute the " + "number of training instances " + "assigned to sketch member " + j + " in run " + i); throw new DistributedWekaException("Unable to compute the number of training instances " + "assigned to sketch member " + j + " in run " + i); } // finally - set the weight centerCandidate.setWeight(weightForCandidate); } if (m_debug) { 
logMessage("Final weighted sketch (run " + i + ") prior to local KMeans:\n" + sketchForRun); } // now run standard k-means on the weighted sketch to // (hopefully) get the requested number of start points SimpleKMeans localKMeans = new SimpleKMeans(); try { localKMeans.setNumClusters(numClusters); localKMeans.setInitializationMethod( new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION)); localKMeans.buildClusterer(sketchForRun); finalStartPointsForRuns.add(localKMeans.getClusterCentroids()); } catch (Exception ex) { logMessage(ex); throw new DistributedWekaException(ex); } } m_distanceFunctionPrimingData = globalPriming; return finalStartPointsForRuns; } /** * Initialize by randomly selecting instances from the dataset * * @param dataset the dataset to operate on * @param headerWithSummary the header of the data, with summary attributes * @param numRuns the number of runs of k-means to perform in parallel * @param numClusters the number of clusters to find * @return a list of Instances objects, where each Instances object contains * the randomly selected start points for one run of k-means * @throws IOException if a problem occurs * @throws DistributedWekaException if a problem occurs */ protected List<Instances> initializeWithRandomCenters(JavaRDD<Instance> dataset, Instances headerWithSummary, int numRuns, int numClusters) throws IOException, DistributedWekaException { Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary); // sample all runs worth of initial centers in one hit // take twice as many as needed in case there are duplicates int seed = 1; if (!DistributedJobConfig.isEmpty(getRandomSeed())) { try { seed = Integer.parseInt(environmentSubstitute(getRandomSeed())); } catch (NumberFormatException e) { // don't complain } } // oversample for > 1 cluster per run, so that we have some options if there // are duplicates in the list. numClusters == 1 will be used when seeding // the k-means|| initialization process int oversampleFactor = numClusters > 1 ? 
2 : 1; List<Instance> centerList = dataset.takeSample(true, oversampleFactor * numRuns * numClusters, seed); // make sure that start points and header have been through any filters KMeansMapTask forFilteringOnly = new KMeansMapTask(); try { forFilteringOnly.setOptions(Utils.splitOptions(environmentSubstitute(getKMeansMapTaskOpts()))); // initialize sketches headerNoSummary = forFilteringOnly.init(headerWithSummary); for (int i = 0; i < centerList.size(); i++) { Instance filtered = forFilteringOnly.applyFilters(centerList.get(i)); centerList.set(i, filtered); } } catch (Exception ex) { logMessage(ex); throw new DistributedWekaException(ex); } List<Instances> centreCandidates = KMeansMapTask.assignStartPointsFromList(numRuns, numClusters, centerList, headerNoSummary); return centreCandidates; } @Override public boolean runJobWithContext(JavaSparkContext sparkContext) throws IOException, DistributedWekaException { m_currentContext = sparkContext; setJobStatus(JobStatus.RUNNING); boolean success = true; if (m_env == null) { m_env = Environment.getSystemWide(); } JavaRDD<Instance> dataSet = null; Instances headerWithSummary = null; if (getDataset(TRAINING_DATA) != null) { dataSet = ((Dataset<Instance>) getDataset(TRAINING_DATA)).getDataset(); headerWithSummary = getDataset(TRAINING_DATA).getHeaderWithSummary(); logMessage("[k-means] RDD<Instance> dataset provided: " + dataSet.partitions().size() + " partitions."); } if (dataSet == null && headerWithSummary == null) { logMessage("[k-means] invoking ARFF Job..."); m_arffHeaderJob.setEnvironment(m_env); m_arffHeaderJob.setLog(getLog()); m_arffHeaderJob.setStatusMessagePrefix(m_statusMessagePrefix); m_arffHeaderJob.setCachingStrategy(getCachingStrategy()); // header job necessary? success = m_arffHeaderJob.runJobWithContext(sparkContext); if (!success) { setJobStatus(JobStatus.FAILED); statusMessage("Unable to continue - creating the ARFF header failed!"); logMessage("[k-means] unable to continue - creating the ARFF header failed!"); return false; } Dataset<Instance> d = (Dataset<Instance>) m_arffHeaderJob.getDataset(TRAINING_DATA); headerWithSummary = d.getHeaderWithSummary(); dataSet = d.getDataset(); setDataset(TRAINING_DATA, new Dataset<Instance>(dataSet, headerWithSummary)); } int numClusters = 2; if (!DistributedJobConfig.isEmpty(getNumClusters())) { try { numClusters = Integer.parseInt(environmentSubstitute(getNumClusters())); } catch (NumberFormatException e) { // ignore } } int numRuns = 1; if (!DistributedJobConfig.isEmpty(getNumRuns())) { try { numRuns = Integer.parseInt(environmentSubstitute(getNumRuns())); } catch (NumberFormatException e) { // ignore } } int numIterations = 20; if (!DistributedJobConfig.isEmpty(getNumIterations())) { try { numIterations = Integer.parseInt(environmentSubstitute(getNumIterations())); } catch (NumberFormatException e) { // ignore } } Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary); // Make sure that we save out to a subdirectory of the output // directory String outputPath = environmentSubstitute(m_sjConfig.getOutputDir()); outputPath = addSubdirToPath(outputPath, OUTPUT_SUBDIR); Clusterer finalClusterer = null; // serialized input is assumed to already be randomized... 
if (getRandomlyShuffleData() /* && !getSerializedInput() */) { m_randomizeSparkJob.setDefaultToLastAttIfClassNotSpecified(false); m_randomizeSparkJob.setEnvironment(m_env); m_randomizeSparkJob.setLog(getLog()); m_randomizeSparkJob.setStatusMessagePrefix(m_statusMessagePrefix); m_randomizeSparkJob.setCachingStrategy(getCachingStrategy()); m_randomizeSparkJob.setDataset(TRAINING_DATA, new Dataset(dataSet, headerWithSummary)); if (!m_randomizeSparkJob.runJobWithContext(sparkContext)) { statusMessage("Unable to continue - random shuffling of " + "input data failed!"); logMessage("[k-means] unable to continue - random shuffling of input " + "data failed!"); return false; } Dataset<Instance> d = (Dataset<Instance>) m_randomizeSparkJob.getDataset(TRAINING_DATA); dataSet = d.getDataset(); headerWithSummary = d.getHeaderWithSummary(); setDataset(TRAINING_DATA, new Dataset<Instance>(dataSet, headerWithSummary)); // m_dataSet = randomized; } m_finalClusterer = buildClusterer(dataSet, headerWithSummary, numIterations, numRuns, numClusters); // pass on the model (in case EM clustering is being executed downstream // from us) getDataset(TRAINING_DATA).setAdditionalDataElement(K_MEANS_MODEL, m_finalClusterer); outputPath += (outputPath.toLowerCase().contains("://") ? "/" : File.separator) + getModelFileName(); writeClustererToDestination(m_finalClusterer, headerNoSummary, outputPath); setJobStatus(JobStatus.FINISHED); return success; } @Override public Clusterer getClusterer() { return m_finalClusterer; } @Override public Instances getTrainingHeader() { return m_trainingHeader; } @Override public String getText() { return m_finalClusterer != null ? m_finalClusterer.toString() : "Clusterer not built yet!"; } @Override public void run(Object toRun, String[] options) throws IllegalArgumentException { if (!(toRun instanceof KMeansClustererSparkJob)) { throw new IllegalArgumentException("Object to run is not a KMeansClustererSparkJob!"); } try { KMeansClustererSparkJob kcsj = (KMeansClustererSparkJob) toRun; if (Utils.getFlag('h', options)) { String help = DistributedJob.makeOptionsStr(kcsj); System.err.println(help); System.exit(1); } kcsj.setOptions(options); kcsj.runJob(); } catch (Exception ex) { ex.printStackTrace(); } } }
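Usage sketch: one plausible way to drive this job programmatically, using only methods defined above (the option setters, runJob(), getText() and getClusterer()). It assumes the distributedWekaSpark classes are on the classpath. The options that select the input dataset, output directory and Spark master are defined by the SparkJob superclass and its configuration object, which are not shown in this file, so they appear below only as a hypothetical placeholder.

import weka.clusterers.Clusterer;
import weka.distributed.spark.KMeansClustererSparkJob;

public class KMeansClustererSparkJobExample {

  public static void main(String[] args) throws Exception {
    KMeansClustererSparkJob job = new KMeansClustererSparkJob();

    // Job-specific settings; each mirrors a command-line flag from listOptions()
    job.setNumClusters("5");               // -num-clusters 5
    job.setNumRuns("4");                   // -num-runs 4 (run with lowest within-cluster error wins)
    job.setNumIterations("50");            // -num-iterations 50
    job.setRandomSeed("42");               // -seed 42
    job.setConvergenceTolerance(1e-4);     // -tolerance 1e-4
    job.setRandomlyShuffleData(true);      // -randomize
    job.setInitWithRandomCentroids(false); // keep the k-means|| initialization
    job.setKMeansParallelInitSteps("5");   // -init-kmeans-its 5
    job.setModelFileName("kmeans.model");  // written under the job's "kmeans" output subdirectory

    // Hypothetical: input data, output directory and Spark master are supplied
    // through options defined by the SparkJob superclass (not shown in this
    // file), e.g. via job.setOptions(otherOptions).

    job.runJob(); // builds the clusterer and writes the serialized model

    System.out.println(job.getText());    // textual description of the final model
    Clusterer model = job.getClusterer(); // the trained clusterer, ready for reuse
  }
}

The same settings can equally be passed as command-line arguments to main(), for example "-num-clusters 5 -num-runs 4 -num-iterations 50 -seed 42 -randomize", since run() simply calls setOptions() followed by runJob().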