Example usage for org.apache.mahout.common.commandline DefaultOptionCreator methodOption

Introduction

This page lists example usages of DefaultOptionCreator.methodOption() from the org.apache.mahout.common.commandline package.

Prototype

public static DefaultOptionBuilder methodOption() 

Document

Returns a default command line option for specification of sequential or parallel operation.
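The examples below all share one pattern: register the option while building the job's command line, then read it back through DefaultOptionCreator.METHOD_OPTION to choose an execution path. Here is a minimal sketch of that pattern (the driver class name MethodOptionDemo is hypothetical; the API calls follow the Mahout AbstractJob usage shown in the examples below):

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;

// Hypothetical driver showing the common register-then-dispatch pattern.
public class MethodOptionDemo extends AbstractJob {

    @Override
    public int run(String[] args) throws Exception {
        addInputOption();
        addOutputOption();
        // Registers the --method option (sequential or mapreduce).
        addOption(DefaultOptionCreator.methodOption().create());

        if (parseArguments(args) == null) {
            return -1;
        }

        boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
                .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
        if (runSequential) {
            // ... run the sequential implementation ...
        } else {
            // ... run the MapReduce implementation ...
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new MethodOptionDemo(), args);
    }
}

Invoked with --method sequential (short form -xm), such a job takes the sequential branch; otherwise it runs as MapReduce, which is the option's default in Mahout's DefaultOptionCreator.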

Usage

From source file:com.netease.news.text.SequenceFilesFromDirectory.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addOptions();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Map<String, String> options = parseOptions();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }

    if (getOption(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD)
            .equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
        runSequential(getConf(), getInputPath(), output, options);
    } else {
        runMapReduce(getInputPath(), output);
    }

    return 0;
}
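Note that this example reads the option through the two-argument getOption(key, defaultValue) overload, passing DefaultOptionCreator.MAPREDUCE_METHOD as the fallback, so the job dispatches to MapReduce whenever --method is not supplied on the command line.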

From source file:com.netease.news.text.SequenceFilesFromDirectory.java

License:Apache License

/**
 * Override this method in order to add additional options to the command
 * line of the SequenceFilesFromDirectory job. Do not forget to call
 * super.addOptions(); otherwise all standard options (input/output dirs etc.)
 * will not be available.
 */
protected void addOptions() {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
    addOption(FILE_FILTER_CLASS_OPTION[0], FILE_FILTER_CLASS_OPTION[1],
            "The name of the class to use for file parsing. Default: " + PREFIX_ADDITION_FILTER,
            PREFIX_ADDITION_FILTER);
    addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
    addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
            "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
}
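A minimal sketch of the kind of override the javadoc above asks for (the subclass name and the extra option are hypothetical):

// Hypothetical subclass adding one extra option while keeping the defaults.
public class CustomSequenceFilesJob extends SequenceFilesFromDirectory {

    @Override
    protected void addOptions() {
        super.addOptions(); // keeps input/output, overwrite, method, chunk size, etc.
        addOption("myExtraFlag", "mef", "A hypothetical extra option", "someDefault");
    }
}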

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Configure this instance based on the command-line arguments contained within provided array.
 * Calls {@link #validate()} to ensure consistency of configuration.
 *
 * @return true if the arguments were parsed successfully and execution should proceed.
 * @throws Exception if there is a problem parsing the command-line arguments or the particular
 *                   combination would violate class invariants.
 */
private boolean parseArgs(String[] args) throws Exception {

    addInputOption();
    addOption("trainingOutput", "tr", "The training data output directory", false);
    addOption("testOutput", "te", "The test data output directory", false);
    addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
    addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
    addOption("splitLocation", "sl",
            "Location for start of test data expressed as a percentage of the input file "
                    + "size (0=start, 50=middle, 100=end",
            false);
    addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
    addOption("randomSelectionPct", "rp",
            "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false);
    addOption("charset", "c",
            "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)",
            false);
    addOption(buildOption("sequenceFiles", "seq",
            "Set if the input files are sequence files.  Default is false", false, false, "false"));
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    //TODO: extend this to sequential mode
    addOption("keepPct", "k",
            "The percentage of total data to keep in map-reduce mode, the rest will be ignored.  "
                    + "Default is 100%",
            false);
    addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);

    if (parseArguments(args) == null) {
        return false;
    }

    try {
        inputDirectory = getInputPath();

        useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION)
                .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD);

        if (useMapRed) {
            if (!hasOption("randomSelectionPct")) {
                throw new OptionException(getCLIOption("randomSelectionPct"),
                        "must set randomSelectionPct when mapRed option is used");
            }
            if (!hasOption("mapRedOutputDir")) {
                throw new OptionException(getCLIOption("mapRedOutputDir"),
                        "mapRedOutputDir must be set when mapRed option is used");
            }
            mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
            if (hasOption("keepPct")) {
                keepPct = Integer.parseInt(getOption("keepPct"));
            }
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(getConf(), mapRedOutputDirectory);
            }
        } else {
            if (!hasOption("trainingOutput") || !hasOption("testOutput")) {
                throw new OptionException(getCLIOption("trainingOutput"),
                        "trainingOutput and testOutput must be set if mapRed option is not used");
            }
            if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct")
                    && !hasOption("randomSelectionSize")) {
                throw new OptionException(getCLIOption("testSplitSize"),
                        "must set one of test split size/percentage or randomSelectionSize/percentage");
            }

            trainingOutputDirectory = new Path(getOption("trainingOutput"));
            testOutputDirectory = new Path(getOption("testOutput"));
            FileSystem fs = trainingOutputDirectory.getFileSystem(getConf());
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(fs.getConf(), trainingOutputDirectory);
                HadoopUtil.delete(fs.getConf(), testOutputDirectory);
            }
            fs.mkdirs(trainingOutputDirectory);
            fs.mkdirs(testOutputDirectory);
        }

        if (hasOption("charset")) {
            charset = Charset.forName(getOption("charset"));
        }

        if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
            throw new OptionException(getCLIOption("testSplitPct"),
                    "must have either split size or split percentage " + "option, not BOTH");
        }

        if (hasOption("testSplitSize")) {
            setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
        }

        if (hasOption("testSplitPct")) {
            setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
        }

        if (hasOption("splitLocation")) {
            setSplitLocation(Integer.parseInt(getOption("splitLocation")));
        }

        if (hasOption("randomSelectionSize")) {
            setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
        }

        if (hasOption("randomSelectionPct")) {
            setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
        }

        useSequence = hasOption("sequenceFiles");

    } catch (OptionException e) {
        log.error("Command-line option Exception", e);
        CommandLineUtil.printHelp(getGroup());
        return false;
    }

    validate();
    return true;
}
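This parseArgs() method also shows the usual validation split for the method option: map-reduce mode requires randomSelectionPct and mapRedOutputDir, while sequential mode requires explicit training/test output directories plus at least one of the test-split sizing options (with testSplitSize and testSplitPct being mutually exclusive). Any violation surfaces as an OptionException, which is caught, logged, and answered with the printed help text.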

From source file:com.twitter.algebra.matrix.multiply.MultiplicationDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(BPATH, "bPath", "path to matrix B");
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(PARTITIONSOPTION, "parts", "Number of partitions in principal components");
    if (parseArguments(args) == null) {
        return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    final Path bPath = new Path(getOption(BPATH));
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int nColPartitions = Integer.parseInt(getOption(PARTITIONSOPTION));

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }

    run(conf, input, bPath, output, nRows, nCols, nPCs, nColPartitions);
    return 0;
}

From source file:com.twitter.algebra.nmf.NMFDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(PARTITIONSOPTION, "parts", "Number of partitions in principal components");
    addOption(SAMPLE_RATE, SAMPLE_RATE, "sample rate for error calculation");
    if (parseArguments(args) == null) {
        return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int nColPartitions = Integer.parseInt(getOption(PARTITIONSOPTION));

    alpha1 = Float.parseFloat(getOption(ALPHA1, "0.01"));
    alpha2 = Float.parseFloat(getOption(ALPHA2, "1"));
    lambda1 = Float.parseFloat(getOption(LAMBDA1, "0.01"));
    lambda2 = Float.parseFloat(getOption(LAMBDA2, "0"));

    sampleRate = Float.parseFloat(getOption(SAMPLE_RATE, "0.0001f"));

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    MIN_ERROR_CHANGE = conf.getLong(MIN_ERROR_CHANGE_STR, Long.MAX_VALUE);
    MAX_ROUNDS = conf.getInt(MAX_ROUNDS_STR, 100);

    run(conf, input, output, nRows, nCols, nPCs, nColPartitions);
    return 0;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.clustersInOption()
            .withDescription(
                    "The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
                            + "If k is also specified, then a random set of vectors will be selected"
                            + " and written out to this path first")
            .create());
    addOption(DefaultOptionCreator.numClustersOption()
            .withDescription(
                    "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
                            + " as the Centroid and written to the clusters input path.")
            .create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.clusteringOption().create());
    addOption(DefaultOptionCreator.methodOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Path input = getInputPath();
    Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
    Path output = getOutputPath();
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
        measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    Configuration conf = getConf();
    // clustersIn is used as host file
    MemCachedUtil.configHelper(conf, clusters.toUri().getPath());
    int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
    MemKMeansUtil.kmeansConfigHelper(conf, k);

    // create the seeds
    log.info("Create seeds.");
    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
        MemRandomSeedGenerator.buildRandom(getConf(), input,
                Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
    }
    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
            .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    if (getConf() == null) {
        setConf(new Configuration());
    }

    // run iteration
    run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
            runSequential);
    return 0;
}

From source file:org.qcri.pca.SPCADriver.java

/**
 * The sampling rate that is used for computing the reconstruction error
 */

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(SPLITFACTOROPTION, "sf", "Split each block to increase parallelism");
    addOption(ERRSAMPLE, "errSampleRate", "Sampling rate for computing the error (0-1]");
    addOption(MAXITER, "maxIter", "Maximum number of iterations before terminating, the default is 3");
    addOption(NORMALIZEOPTION, "normalize",
            "Choose whether you want the input matrix to be  normalized or not, 1 means normalize, 0 means don't normalize");
    if (parseArguments(args) == null) {
        return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int splitFactor;
    final int normalize;
    final int maxIterations;
    final float errSampleRate;
    if (hasOption(SPLITFACTOROPTION))
        splitFactor = Integer.parseInt(getOption(SPLITFACTOROPTION, "1"));
    else
        splitFactor = 1;
    if (hasOption(ERRSAMPLE))
        errSampleRate = Float.parseFloat(getOption(ERRSAMPLE));
    else {
        int length = String.valueOf(nRows).length();
        if (length <= 4)
            errSampleRate = 1;
        else
            errSampleRate = (float) (1 / Math.pow(10, length - 4));
        log.warn("error sampling rate set to:  errRate=" + errSampleRate);
    }

    if (hasOption(MAXITER))
        maxIterations = Integer.parseInt(getOption(MAXITER));
    else
        maxIterations = 3;
    if (hasOption(NORMALIZEOPTION))
        normalize = Integer.parseInt(getOption(NORMALIZEOPTION));
    else
        normalize = 0;

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
            .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    run(conf, input, output, nRows, nCols, nPCs, splitFactor, errSampleRate, maxIterations, normalize,
            runSequential);
    return 0;
}