Example usage for org.apache.mahout.clustering.topdown.postprocessor ClusterOutputPostProcessorDriver run

List of usage examples for org.apache.mahout.clustering.topdown.postprocessor ClusterOutputPostProcessorDriver run

Introduction

In this page you can find the example usage for org.apache.mahout.clustering.topdown.postprocessor ClusterOutputPostProcessorDriver run.

Prototype

public static void run(Path input, Path output, boolean runSequential)
        throws IOException, InterruptedException, ClassNotFoundException 

Source Link

Document

Post processes the output of clustering algorithms and groups them into respective clusters.

Usage

From source file: root.benchmark.ReutersBenchmarkJob.java

License: Apache License

/**
 * {@inheritDoc}/*  w w  w .  j a  v  a2s  .com*/
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path inputDirectoryPath = new Path(inputDirectory);
    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = { "-i", inputDirectory, "-o", workingDirectory + vectorizationOutputDirectory,
            "-x", exclusionThreshold, "-mdf", minimumDocumentFrequency, };
    System.out.println();
    ToolRunner.run(conf, new ReutersVectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] canopyArgs = { "-i", workingDirectory + vectorizationOutputDirectory + "/vectorFiles/tf-vectors",
            "-o", workingDirectory + canopyOutputDirectory, "-dm", distanceMetric, "-t1", threshold1, "-t2",
            threshold2 };
    ToolRunner.run(conf, new CanopyJob(), canopyArgs);

    String[] kmeansArgs = { "-i", workingDirectory + vectorizationOutputDirectory + "/vectorFiles/tf-vectors",
            "-o", workingDirectory + kmeansOutputDirectory, "-c",
            workingDirectory + canopyOutputDirectory + "/clusters-0-final", "-dm", distanceMetric, "-cd",
            convergenceDelta, "-mIter", numIterations, "-k", "3" };
    ToolRunner.run(conf, new KMeansJob(), kmeansArgs);

    Path input = new Path(workingDirectory + kmeansOutputDirectory);
    Path output = new Path(workingDirectory + outputDirectory);
    boolean sequential = false;

    ClusterOutputPostProcessorDriver.run(input, output, sequential);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Image");
    conf.setStrings(CONF_PREFIX + "Iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "Levels", convergenceDelta);
    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    return 0;

}