List of usage examples for org.apache.mahout.common.commandline DefaultOptionCreator METHOD_OPTION
String METHOD_OPTION
To view the source code for org.apache.mahout.common.commandline DefaultOptionCreator METHOD_OPTION, click the Source link.
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
@Override public int run(String[] args) throws Exception { addOptions();// w w w .j a v a 2 s .c o m addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); if (parseArguments(args) == null) { return -1; } Map<String, String> options = parseOptions(); Path output = getOutputPath(); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } if (getOption(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD) .equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) { runSequential(getConf(), getInputPath(), output, options); } else { runMapReduce(getInputPath(), output); } return 0; }
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/** * Configure this instance based on the command-line arguments contained within provided array. * Calls {@link #validate()} to ensure consistency of configuration. * * @return true if the arguments were parsed successfully and execution should proceed. * @throws Exception if there is a problem parsing the command-line arguments or the particular * combination would violate class invariants. *///from w ww . ja v a 2s. co m private boolean parseArgs(String[] args) throws Exception { addInputOption(); addOption("trainingOutput", "tr", "The training data output directory", false); addOption("testOutput", "te", "The test data output directory", false); addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false); addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false); addOption("splitLocation", "sl", "Location for start of test data expressed as a percentage of the input file " + "size (0=start, 50=middle, 100=end", false); addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false); addOption("randomSelectionPct", "rp", "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false); addOption("charset", "c", "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)", false); addOption(buildOption("sequenceFiles", "seq", "Set if the input files are sequence files. Default is false", false, false, "false")); addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); //TODO: extend this to sequential mode addOption("keepPct", "k", "The percentage of total data to keep in map-reduce mode, the rest will be ignored. 
" + "Default is 100%", false); addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false); if (parseArguments(args) == null) { return false; } try { inputDirectory = getInputPath(); useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION) .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD); if (useMapRed) { if (!hasOption("randomSelectionPct")) { throw new OptionException(getCLIOption("randomSelectionPct"), "must set randomSelectionPct when mapRed option is used"); } if (!hasOption("mapRedOutputDir")) { throw new OptionException(getCLIOption("mapRedOutputDir"), "mapRedOutputDir must be set when mapRed option is used"); } mapRedOutputDirectory = new Path(getOption("mapRedOutputDir")); if (hasOption("keepPct")) { keepPct = Integer.parseInt(getOption("keepPct")); } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), mapRedOutputDirectory); } } else { if (!hasOption("trainingOutput") || !hasOption("testOutput")) { throw new OptionException(getCLIOption("trainingOutput"), "trainingOutput and testOutput must be set if mapRed option is not used"); } if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct") && !hasOption("randomSelectionSize")) { throw new OptionException(getCLIOption("testSplitSize"), "must set one of test split size/percentage or randomSelectionSize/percentage"); } trainingOutputDirectory = new Path(getOption("trainingOutput")); testOutputDirectory = new Path(getOption("testOutput")); FileSystem fs = trainingOutputDirectory.getFileSystem(getConf()); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(fs.getConf(), trainingOutputDirectory); HadoopUtil.delete(fs.getConf(), testOutputDirectory); } fs.mkdirs(trainingOutputDirectory); fs.mkdirs(testOutputDirectory); } if (hasOption("charset")) { charset = Charset.forName(getOption("charset")); } if (hasOption("testSplitSize") && hasOption("testSplitPct")) { throw new 
OptionException(getCLIOption("testSplitPct"), "must have either split size or split percentage " + "option, not BOTH"); } if (hasOption("testSplitSize")) { setTestSplitSize(Integer.parseInt(getOption("testSplitSize"))); } if (hasOption("testSplitPct")) { setTestSplitPct(Integer.parseInt(getOption("testSplitPct"))); } if (hasOption("splitLocation")) { setSplitLocation(Integer.parseInt(getOption("splitLocation"))); } if (hasOption("randomSelectionSize")) { setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize"))); } if (hasOption("randomSelectionPct")) { setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct"))); } useSequence = hasOption("sequenceFiles"); } catch (OptionException e) { log.error("Command-line option Exception", e); CommandLineUtil.printHelp(getGroup()); return false; } validate(); return true; }
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();// w w w. j a va 2s .c om addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); addOption(DefaultOptionCreator.clustersInOption() .withDescription( "The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " + "If k is also specified, then a random set of vectors will be selected" + " and written out to this path first") .create()); addOption(DefaultOptionCreator.numClustersOption() .withDescription( "The k in k-Means. If specified, then a random selection of k Vectors will be chosen" + " as the Centroid and written to the clusters input path.") .create()); addOption(DefaultOptionCreator.convergenceOption().create()); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(DefaultOptionCreator.clusteringOption().create()); addOption(DefaultOptionCreator.methodOption().create()); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); Path output = getOutputPath(); String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); if (measureClass == null) { measureClass = SquaredEuclideanDistanceMeasure.class.getName(); } double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); Configuration conf = getConf(); // clustersIn is used as host file MemCachedUtil.configHelper(conf, clusters.toUri().getPath()); int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); MemKMeansUtil.kmeansConfigHelper(conf, k); // 
create the seeds log.info("Create seeds."); if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { MemRandomSeedGenerator.buildRandom(getConf(), input, Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure); } boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION) .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD); if (getConf() == null) { setConf(new Configuration()); } // run iteration run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, runSequential); return 0; }
From source file:org.qcri.pca.SPCADriver.java
/** * The sampling rate that is used for computing the reconstruction error */// ww w . j a v a 2 s .c o m @Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.methodOption().create()); addOption(ROWSOPTION, "rows", "Number of rows"); addOption(COLSOPTION, "cols", "Number of cols"); addOption(PRINCIPALSOPTION, "pcs", "Number of principal components"); addOption(SPLITFACTOROPTION, "sf", "Split each block to increase paralelism"); addOption(ERRSAMPLE, "errSampleRate", "Sampling rate for computing the error (0-1]"); addOption(MAXITER, "maxIter", "Maximum number of iterations before terminating, the default is 3"); addOption(NORMALIZEOPTION, "normalize", "Choose whether you want the input matrix to be normalized or not, 1 means normalize, 0 means don't normalize"); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); final int nRows = Integer.parseInt(getOption(ROWSOPTION)); final int nCols = Integer.parseInt(getOption(COLSOPTION)); final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION)); final int splitFactor; final int normalize; final int maxIterations; final float errSampleRate; if (hasOption(SPLITFACTOROPTION)) splitFactor = Integer.parseInt(getOption(SPLITFACTOROPTION, "1")); else splitFactor = 1; if (hasOption(ERRSAMPLE)) errSampleRate = Float.parseFloat(getOption(ERRSAMPLE)); else { int length = String.valueOf(nRows).length(); if (length <= 4) errSampleRate = 1; else errSampleRate = (float) (1 / Math.pow(10, length - 4)); log.warn("error sampling rate set to: errRate=" + errSampleRate); } if (hasOption(MAXITER)) maxIterations = Integer.parseInt(getOption(MAXITER)); else maxIterations = 3; if (hasOption(NORMALIZEOPTION)) normalize = Integer.parseInt(getOption(NORMALIZEOPTION)); else normalize = 0; Configuration conf = getConf(); if (conf == null) { throw new IOException("No Hadoop configuration present"); } boolean 
runSequential = getOption(DefaultOptionCreator.METHOD_OPTION) .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD); run(conf, input, output, nRows, nCols, nPCs, splitFactor, errSampleRate, maxIterations, normalize, runSequential); return 0; }