Example usage for org.apache.mahout.common.commandline DefaultOptionCreator SEQUENTIAL_METHOD

List of usage examples for org.apache.mahout.common.commandline DefaultOptionCreator SEQUENTIAL_METHOD

Introduction

On this page you can find example usages for org.apache.mahout.common.commandline DefaultOptionCreator SEQUENTIAL_METHOD.

Prototype

String SEQUENTIAL_METHOD

To view the source code for org.apache.mahout.common.commandline DefaultOptionCreator SEQUENTIAL_METHOD, click the Source Link below.

Click Source Link

Usage

From source file:com.netease.news.text.SequenceFilesFromDirectory.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // Register the tool's options: the execution method (sequential vs.
    // mapreduce) and whether to overwrite an existing output directory.
    addOptions();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
        // Argument parsing failed (or --help was requested); signal error.
        return -1;
    }

    Map<String, String> jobOptions = parseOptions();
    Path outputPath = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        // Start from a clean output directory when --overwrite is given.
        HadoopUtil.delete(getConf(), outputPath);
    }

    // Default to the mapreduce method when none is specified explicitly.
    String method = getOption(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD);
    if (method.equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
        runSequential(getConf(), getInputPath(), outputPath, jobOptions);
    } else {
        runMapReduce(getInputPath(), outputPath);
    }

    return 0;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

/**
 * Parses the k-means command-line options, seeds the initial centroids, and
 * launches the clustering iterations.
 *
 * @param args raw command-line arguments
 * @return 0 on success, -1 when argument parsing fails
 * @throws Exception if seeding or the clustering run fails
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.clustersInOption()
            .withDescription(
                    "The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
                            + "If k is also specified, then a random set of vectors will be selected"
                            + " and written out to this path first")
            .create());
    addOption(DefaultOptionCreator.numClustersOption()
            .withDescription(
                    "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
                            + " as the Centroid and written to the clusters input path.")
            .create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.clusteringOption().create());
    addOption(DefaultOptionCreator.methodOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    // FIX: ensure a Configuration exists BEFORE it is first used. The original
    // code only performed this check near the end of the method, after
    // getConf() had already been passed to HadoopUtil.delete() and the
    // memcached/seed helpers.
    if (getConf() == null) {
        setConf(new Configuration());
    }

    Path input = getInputPath();
    Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
    Path output = getOutputPath();
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
        // Fall back to squared Euclidean distance when no measure was given.
        measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    Configuration conf = getConf();
    // clustersIn is used as host file
    MemCachedUtil.configHelper(conf, clusters.toUri().getPath());
    int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
    MemKMeansUtil.kmeansConfigHelper(conf, k);

    // create the seeds
    log.info("Create seeds.");
    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
        // FIX: reuse the already-parsed k rather than re-parsing the option.
        MemRandomSeedGenerator.buildRandom(getConf(), input, k, measure);
    }
    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
            .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);

    // run iteration
    run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
            runSequential);
    return 0;
}

From source file:org.qcri.pca.SPCADriver.java

/**
 * The sampling rate that is used for computing the reconstruction error
 *///from ww w . ja  va2 s.c o  m

/**
 * Parses the SPCA command-line options and launches the computation.
 *
 * <p>Required options: rows, cols, and the number of principal components.
 * Optional options: split factor (default 1), error-sampling rate (default
 * derived from the row count), max iterations (default 3), and normalization
 * flag (default 0, i.e. no normalization).
 *
 * @param args raw command-line arguments
 * @return 0 on success, -1 when argument parsing fails
 * @throws Exception if the computation fails
 * @throws IOException if no Hadoop configuration is present
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    // FIX: typo in help text ("paralelism" -> "parallelism")
    addOption(SPLITFACTOROPTION, "sf", "Split each block to increase parallelism");
    addOption(ERRSAMPLE, "errSampleRate", "Sampling rate for computing the error (0-1]");
    addOption(MAXITER, "maxIter", "Maximum number of iterations before terminating, the default is 3");
    // FIX: doubled space in help text ("be  normalized" -> "be normalized")
    addOption(NORMALIZEOPTION, "normalize",
            "Choose whether you want the input matrix to be normalized or not, 1 means normalize, 0 means don't normalize");
    if (parseArguments(args) == null) {
        return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int splitFactor;
    final int normalize;
    final int maxIterations;
    final float errSampleRate;
    if (hasOption(SPLITFACTOROPTION)) {
        // The option is known to be present here, so no fallback default is
        // needed (the original redundantly re-supplied "1").
        splitFactor = Integer.parseInt(getOption(SPLITFACTOROPTION));
    } else {
        splitFactor = 1;
    }
    if (hasOption(ERRSAMPLE)) {
        errSampleRate = Float.parseFloat(getOption(ERRSAMPLE));
    } else {
        // Default sampling rate: 1 for up to 9999 rows, then shrink by a
        // factor of 10 for each additional digit in the row count.
        int length = String.valueOf(nRows).length();
        if (length <= 4) {
            errSampleRate = 1;
        } else {
            errSampleRate = (float) (1 / Math.pow(10, length - 4));
        }
        log.warn("error sampling rate set to:  errRate=" + errSampleRate);
    }

    if (hasOption(MAXITER)) {
        maxIterations = Integer.parseInt(getOption(MAXITER));
    } else {
        maxIterations = 3;
    }
    if (hasOption(NORMALIZEOPTION)) {
        normalize = Integer.parseInt(getOption(NORMALIZEOPTION));
    } else {
        normalize = 0;
    }

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
            .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    run(conf, input, output, nRows, nCols, nPCs, splitFactor, errSampleRate, maxIterations, normalize,
            runSequential);
    return 0;
}