Example usage for org.apache.mahout.common.commandline DefaultOptionCreator methodOption

Introduction

This page lists example usages of DefaultOptionCreator.methodOption() from the org.apache.mahout.common.commandline package.

Prototype

public static DefaultOptionBuilder methodOption() 

Document

Returns a default command line option for specification of sequential or parallel operation.
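The examples below all share one pattern: register the option while building the job's command line, then read it back through DefaultOptionCreator.METHOD_OPTION to choose an execution path. Here is a minimal sketch of that pattern (the driver class name MethodOptionDemo is hypothetical; the API calls follow the Mahout AbstractJob usage shown in the examples below):

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;

// Hypothetical driver showing the common register-then-dispatch pattern.
public class MethodOptionDemo extends AbstractJob {

    @Override
    public int run(String[] args) throws Exception {
        addInputOption();
        addOutputOption();
        // Registers the --method option (sequential or mapreduce).
        addOption(DefaultOptionCreator.methodOption().create());

        if (parseArguments(args) == null) {
            return -1;
        }

        boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
                .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
        if (runSequential) {
            // ... run the sequential implementation ...
        } else {
            // ... run the MapReduce implementation ...
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new MethodOptionDemo(), args);
    }
}

Invoked with --method sequential (short form -xm), such a job takes the sequential branch; otherwise it runs as MapReduce, which is the option's default in Mahout's DefaultOptionCreator.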

Usage

From source file:com.netease.news.text.SequenceFilesFromDirectory.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addOptions();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Map<String, String> options = parseOptions();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }

    if (getOption(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD)
            .equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
        runSequential(getConf(), getInputPath(), output, options);
    } else {
        runMapReduce(getInputPath(), output);
    }

    return 0;
}
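Note that this example reads the option through the two-argument getOption(key, defaultValue) overload, passing DefaultOptionCreator.MAPREDUCE_METHOD as the fallback, so the job dispatches to MapReduce whenever --method is not supplied on the command line.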

From source file:com.netease.news.text.SequenceFilesFromDirectory.java

License:Apache License

/**
 * Override this method in order to add additional options to the command
 * line of the SequenceFilesFromDirectory job. Do not forget to call
 * super.addOptions(); otherwise all standard options (input/output dirs etc.)
 * will not be available.
 */
protected void addOptions() {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
    addOption(FILE_FILTER_CLASS_OPTION[0], FILE_FILTER_CLASS_OPTION[1],
            "The name of the class to use for file parsing. Default: " + PREFIX_ADDITION_FILTER,
            PREFIX_ADDITION_FILTER);
    addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
    addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
            "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
}
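A minimal sketch of the kind of override the javadoc above asks for (the subclass name and the extra option are hypothetical):

// Hypothetical subclass adding one extra option while keeping the defaults.
public class CustomSequenceFilesJob extends SequenceFilesFromDirectory {

    @Override
    protected void addOptions() {
        super.addOptions(); // keeps input/output, overwrite, method, chunk size, etc.
        addOption("myExtraFlag", "mef", "A hypothetical extra option", "someDefault");
    }
}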

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Configure this instance based on the command-line arguments contained within provided array.
 * Calls {@link #validate()} to ensure consistency of configuration.
 *
 * @return true if the arguments were parsed successfully and execution should proceed.
 * @throws Exception if there is a problem parsing the command-line arguments or the particular
 *                   combination would violate class invariants.
 */
private boolean parseArgs(String[] args) throws Exception {

    addInputOption();
    addOption("trainingOutput", "tr", "The training data output directory", false);
    addOption("testOutput", "te", "The test data output directory", false);
    addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
    addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
    addOption("splitLocation", "sl",
            "Location for start of test data expressed as a percentage of the input file "
                    + "size (0=start, 50=middle, 100=end",
            false);
    addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
    addOption("randomSelectionPct", "rp",
            "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false);
    addOption("charset", "c",
            "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)",
            false);
    addOption(buildOption("sequenceFiles", "seq",
            "Set if the input files are sequence files.  Default is false", false, false, "false"));
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    //TODO: extend this to sequential mode
    addOption("keepPct", "k",
            "The percentage of total data to keep in map-reduce mode, the rest will be ignored.  "
                    + "Default is 100%",
            false);
    addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);

    if (parseArguments(args) == null) {
        return false;
    }

    try {
        inputDirectory = getInputPath();

        useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION)
                .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD);

        if (useMapRed) {
            if (!hasOption("randomSelectionPct")) {
                throw new OptionException(getCLIOption("randomSelectionPct"),
                        "must set randomSelectionPct when mapRed option is used");
            }
            if (!hasOption("mapRedOutputDir")) {
                throw new OptionException(getCLIOption("mapRedOutputDir"),
                        "mapRedOutputDir must be set when mapRed option is used");
            }
            mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
            if (hasOption("keepPct")) {
                keepPct = Integer.parseInt(getOption("keepPct"));
            }
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(getConf(), mapRedOutputDirectory);
            }
        } else {
            if (!hasOption("trainingOutput") || !hasOption("testOutput")) {
                throw new OptionException(getCLIOption("trainingOutput"),
                        "trainingOutput and testOutput must be set if mapRed option is not used");
            }
            if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct")
                    && !hasOption("randomSelectionSize")) {
                throw new OptionException(getCLIOption("testSplitSize"),
                        "must set one of test split size/percentage or randomSelectionSize/percentage");
            }

            trainingOutputDirectory = new Path(getOption("trainingOutput"));
            testOutputDirectory = new Path(getOption("testOutput"));
            FileSystem fs = trainingOutputDirectory.getFileSystem(getConf());
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(fs.getConf(), trainingOutputDirectory);
                HadoopUtil.delete(fs.getConf(), testOutputDirectory);
            }
            fs.mkdirs(trainingOutputDirectory);
            fs.mkdirs(testOutputDirectory);
        }

        if (hasOption("charset")) {
            charset = Charset.forName(getOption("charset"));
        }

        if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
            throw new OptionException(getCLIOption("testSplitPct"),
                    "must have either split size or split percentage " + "option, not BOTH");
        }

        if (hasOption("testSplitSize")) {
            setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
        }

        if (hasOption("testSplitPct")) {
            setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
        }

        if (hasOption("splitLocation")) {
            setSplitLocation(Integer.parseInt(getOption("splitLocation")));
        }

        if (hasOption("randomSelectionSize")) {
            setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
        }

        if (hasOption("randomSelectionPct")) {
            setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
        }

        useSequence = hasOption("sequenceFiles");

    } catch (OptionException e) {
        log.error("Command-line option Exception", e);
        CommandLineUtil.printHelp(getGroup());
        return false;
    }

    validate();
    return true;
}
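This parseArgs() method also shows the usual validation split for the method option: map-reduce mode requires randomSelectionPct and mapRedOutputDir, while sequential mode requires explicit training/test output directories plus at least one of the test-split sizing options (with testSplitSize and testSplitPct being mutually exclusive). Any violation surfaces as an OptionException, which is caught, logged, and answered with the printed help text.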

From source file:com.twitter.algebra.matrix.multiply.MultiplicationDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(BPATH, "bPath", "path to matrix B");
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(PARTITIONSOPTION, "parts", "Number of partitions in principal components");
    if (parseArguments(args) == null) {
        return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    final Path bPath = new Path(getOption(BPATH));
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int nColPartitions = Integer.parseInt(getOption(PARTITIONSOPTION));

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }

    run(conf, input, bPath, output, nRows, nCols, nPCs, nColPartitions);
    return 0;
}

From source file:com.twitter.algebra.nmf.NMFDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(PARTITIONSOPTION, "parts", "Number of partitions in principal components");
    addOption(SAMPLE_RATE, SAMPLE_RATE, "sample rate for error calculation");
    if (parseArguments(args) == null) {
        return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int nColPartitions = Integer.parseInt(getOption(PARTITIONSOPTION));

    alpha1 = Float.parseFloat(getOption(ALPHA1, "0.01"));
    alpha2 = Float.parseFloat(getOption(ALPHA2, "1"));
    lambda1 = Float.parseFloat(getOption(LAMBDA1, "0.01"));
    lambda2 = Float.parseFloat(getOption(LAMBDA2, "0"));

    sampleRate = Float.parseFloat(getOption(SAMPLE_RATE, "0.0001f"));

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    MIN_ERROR_CHANGE = conf.getLong(MIN_ERROR_CHANGE_STR, Long.MAX_VALUE);
    MAX_ROUNDS = conf.getInt(MAX_ROUNDS_STR, 100);

    run(conf, input, output, nRows, nCols, nPCs, nColPartitions);
    return 0;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.clustersInOption()
            .withDescription(
                    "The input centroids, as Vectors.  Must be a SequenceFile of Writable, Cluster/Canopy.  "
                            + "If k is also specified, then a random set of vectors will be selected"
                            + " and written out to this path first")
            .create());
    addOption(DefaultOptionCreator.numClustersOption()
            .withDescription(
                    "The k in k-Means.  If specified, then a random selection of k Vectors will be chosen"
                            + " as the Centroid and written to the clusters input path.")
            .create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.clusteringOption().create());
    addOption(DefaultOptionCreator.methodOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Path input = getInputPath();
    Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
    Path output = getOutputPath();
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
        measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    Configuration conf = getConf();
    // clustersIn is used as host file
    MemCachedUtil.configHelper(conf, clusters.toUri().getPath());
    int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
    MemKMeansUtil.kmeansConfigHelper(conf, k);

    // create the seeds
    log.info("Create seeds.");
    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
        MemRandomSeedGenerator.buildRandom(getConf(), input,
                Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
    }
    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
            .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    if (getConf() == null) {
        setConf(new Configuration());
    }

    // run iteration
    run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
            runSequential);
    return 0;
}

From source file:org.qcri.pca.SPCADriver.java

/**
 * The sampling rate that is used for computing the reconstruction error
 */

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(SPLITFACTOROPTION, "sf", "Split each block to increase parallelism");
    addOption(ERRSAMPLE, "errSampleRate", "Sampling rate for computing the error (0-1]");
    addOption(MAXITER, "maxIter", "Maximum number of iterations before terminating, the default is 3");
    addOption(NORMALIZEOPTION, "normalize",
            "Choose whether you want the input matrix to be  normalized or not, 1 means normalize, 0 means don't normalize");
    if (parseArguments(args) == null) {
        return -1;
    }
    Path input = getInputPath();
    Path output = getOutputPath();
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int splitFactor;
    final int normalize;
    final int maxIterations;
    final float errSampleRate;
    if (hasOption(SPLITFACTOROPTION))
        splitFactor = Integer.parseInt(getOption(SPLITFACTOROPTION, "1"));
    else
        splitFactor = 1;
    if (hasOption(ERRSAMPLE))
        errSampleRate = Float.parseFloat(getOption(ERRSAMPLE));
    else {
        int length = String.valueOf(nRows).length();
        if (length <= 4)
            errSampleRate = 1;
        else
            errSampleRate = (float) (1 / Math.pow(10, length - 4));
        log.warn("error sampling rate set to:  errRate=" + errSampleRate);
    }

    if (hasOption(MAXITER))
        maxIterations = Integer.parseInt(getOption(MAXITER));
    else
        maxIterations = 3;
    if (hasOption(NORMALIZEOPTION))
        normalize = Integer.parseInt(getOption(NORMALIZEOPTION));
    else
        normalize = 0;

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
            .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    run(conf, input, output, nRows, nCols, nPCs, splitFactor, errSampleRate, maxIterations, normalize,
            runSequential);
    return 0;
}