List of usage examples for org.apache.mahout.common.commandline.DefaultOptionCreator.methodOption()
public static DefaultOptionBuilder methodOption()
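methodOption() builds the standard --method command-line option, whose value selects between Mahout's sequential (in-process) and mapreduce execution paths. Every example below follows the same pattern inside an AbstractJob subclass: register the option, parse the arguments, then branch on the parsed value. A minimal sketch of that pattern, with a hypothetical driver class and placeholder branch bodies:

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;

// Hypothetical driver showing the usual methodOption() idiom.
public class MethodOptionSketch extends AbstractJob {

    @Override
    public int run(String[] args) throws Exception {
        addInputOption();
        addOutputOption();
        // Registers the --method option (sequential vs. mapreduce).
        addOption(DefaultOptionCreator.methodOption().create());
        if (parseArguments(args) == null) {
            return -1;
        }
        // Read the value, falling back to the mapreduce default.
        boolean sequential = getOption(DefaultOptionCreator.METHOD_OPTION,
                DefaultOptionCreator.MAPREDUCE_METHOD)
                .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
        if (sequential) {
            // ... run the single-process implementation ...
        } else {
            // ... submit the MapReduce job(s) ...
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new MethodOptionSketch(), args);
    }
}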
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addOptions();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Map<String, String> options = parseOptions();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }

    if (getOption(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD)
            .equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
        runSequential(getConf(), getInputPath(), output, options);
    } else {
        runMapReduce(getInputPath(), output);
    }
    return 0;
}
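With this run() method, the choice between runSequential() and runMapReduce() is made entirely by the --method argument, defaulting to mapreduce when the flag is absent, while --overwrite clears the output path before either branch runs. A hypothetical programmatic invocation (the input and output paths are placeholders; Mahout's CLI also accepts the short forms -i, -o, -xm, and -ow):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import com.netease.news.text.SequenceFilesFromDirectory;

// Hypothetical launcher for the tool above.
public class SeqDirLauncher {
    public static void main(String[] args) throws Exception {
        String[] toolArgs = {"--input", "/data/docs", "--output", "/data/seq",
                "--method", "sequential", "--overwrite"};
        ToolRunner.run(new Configuration(), new SequenceFilesFromDirectory(), toolArgs);
    }
}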
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
/**
 * Override this method in order to add additional options to the command
 * line of the SequenceFilesFromDirectory job. Do not forget to call super(),
 * otherwise all standard options (input/output dirs etc.) will not be
 * available.
 */
protected void addOptions() {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
    addOption(FILE_FILTER_CLASS_OPTION[0], FILE_FILTER_CLASS_OPTION[1],
        "The name of the class to use for file parsing. Default: " + PREFIX_ADDITION_FILTER,
        PREFIX_ADDITION_FILTER);
    addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
    addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
        "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
}
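The Javadoc's caveat is the whole point of this hook: a subclass that overrides addOptions() without delegating to super would silently lose the standard input/output, overwrite, and method options. A hypothetical subclass adding one extra flag:

import com.netease.news.text.SequenceFilesFromDirectory;

// Hypothetical subclass; the "language" option is illustrative only.
public class FilteredSequenceFilesFromDirectory extends SequenceFilesFromDirectory {
    @Override
    protected void addOptions() {
        super.addOptions(); // keeps -i/-o, --overwrite, --method, chunk size, charset, ...
        addOption("language", "lang", "Only ingest documents in this language", "en");
    }
}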
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/**
 * Configure this instance based on the command-line arguments contained within the provided array.
 * Calls {@link #validate()} to ensure consistency of configuration.
 *
 * @return true if the arguments were parsed successfully and execution should proceed.
 * @throws Exception if there is a problem parsing the command-line arguments or the particular
 *                   combination would violate class invariants.
 */
private boolean parseArgs(String[] args) throws Exception {
    addInputOption();
    addOption("trainingOutput", "tr", "The training data output directory", false);
    addOption("testOutput", "te", "The test data output directory", false);
    addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
    addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
    addOption("splitLocation", "sl",
        "Location for start of test data expressed as a percentage of the input file "
            + "size (0=start, 50=middle, 100=end)", false);
    addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
    addOption("randomSelectionPct", "rp",
        "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false);
    addOption("charset", "c",
        "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)", false);
    addOption(buildOption("sequenceFiles", "seq",
        "Set if the input files are sequence files. Default is false", false, false, "false"));
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    // TODO: extend this to sequential mode
    addOption("keepPct", "k",
        "The percentage of total data to keep in map-reduce mode, the rest will be ignored. "
            + "Default is 100%", false);
    addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);

    if (parseArguments(args) == null) {
        return false;
    }

    try {
        inputDirectory = getInputPath();
        useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION)
            .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD);

        if (useMapRed) {
            if (!hasOption("randomSelectionPct")) {
                throw new OptionException(getCLIOption("randomSelectionPct"),
                    "must set randomSelectionPct when mapRed option is used");
            }
            if (!hasOption("mapRedOutputDir")) {
                throw new OptionException(getCLIOption("mapRedOutputDir"),
                    "mapRedOutputDir must be set when mapRed option is used");
            }
            mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
            if (hasOption("keepPct")) {
                keepPct = Integer.parseInt(getOption("keepPct"));
            }
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(getConf(), mapRedOutputDirectory);
            }
        } else {
            if (!hasOption("trainingOutput") || !hasOption("testOutput")) {
                throw new OptionException(getCLIOption("trainingOutput"),
                    "trainingOutput and testOutput must be set if mapRed option is not used");
            }
            if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct")
                    && !hasOption("randomSelectionSize")) {
                throw new OptionException(getCLIOption("testSplitSize"),
                    "must set one of test split size/percentage or randomSelectionSize/percentage");
            }

            trainingOutputDirectory = new Path(getOption("trainingOutput"));
            testOutputDirectory = new Path(getOption("testOutput"));
            FileSystem fs = trainingOutputDirectory.getFileSystem(getConf());
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(fs.getConf(), trainingOutputDirectory);
                HadoopUtil.delete(fs.getConf(), testOutputDirectory);
            }
            fs.mkdirs(trainingOutputDirectory);
            fs.mkdirs(testOutputDirectory);
        }

        if (hasOption("charset")) {
            charset = Charset.forName(getOption("charset"));
        }

        if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
            throw new OptionException(getCLIOption("testSplitPct"),
                "must have either split size or split percentage " + "option, not BOTH");
        }
        if (hasOption("testSplitSize")) {
            setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
        }
        if (hasOption("testSplitPct")) {
            setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
        }
        if (hasOption("splitLocation")) {
            setSplitLocation(Integer.parseInt(getOption("splitLocation")));
        }
        if (hasOption("randomSelectionSize")) {
            setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
        }
        if (hasOption("randomSelectionPct")) {
            setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
        }
        useSequence = hasOption("sequenceFiles");
    } catch (OptionException e) {
        log.error("Command-line option Exception", e);
        CommandLineUtil.printHelp(getGroup());
        return false;
    }

    validate();
    return true;
}
From source file:com.twitter.algebra.matrix.multiply.MultiplicationDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(BPATH, "bPath", "path to matrix B");
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(PARTITIONSOPTION, "parts", "Number of partitions in principal components");

    if (parseArguments(args) == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    final Path bPath = new Path(getOption(BPATH));
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int nColPartitions = Integer.parseInt(getOption(PARTITIONSOPTION));

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    run(conf, input, bPath, output, nRows, nCols, nPCs, nColPartitions);
    return 0;
}
From source file:com.twitter.algebra.nmf.NMFDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(PARTITIONSOPTION, "parts", "Number of partitions in principal components");
    addOption(SAMPLE_RATE, SAMPLE_RATE, "sample rate for error calculation");

    if (parseArguments(args) == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int nColPartitions = Integer.parseInt(getOption(PARTITIONSOPTION));

    alpha1 = Float.parseFloat(getOption(ALPHA1, "0.01"));
    alpha2 = Float.parseFloat(getOption(ALPHA2, "1"));
    lambda1 = Float.parseFloat(getOption(LAMBDA1, "0.01"));
    lambda2 = Float.parseFloat(getOption(LAMBDA2, "0"));
    sampleRate = Float.parseFloat(getOption(SAMPLE_RATE, "0.0001f"));

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    MIN_ERROR_CHANGE = conf.getLong(MIN_ERROR_CHANGE_STR, Long.MAX_VALUE);
    MAX_ROUNDS = conf.getInt(MAX_ROUNDS_STR, 100);

    run(conf, input, output, nRows, nCols, nPCs, nColPartitions);
    return 0;
}
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.clustersInOption()
        .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. "
            + "If k is also specified, then a random set of vectors will be selected"
            + " and written out to this path first")
        .create());
    addOption(DefaultOptionCreator.numClustersOption()
        .withDescription("The k in k-Means. If specified, then a random selection of k Vectors will be chosen"
            + " as the Centroid and written to the clusters input path.")
        .create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.clusteringOption().create());
    addOption(DefaultOptionCreator.methodOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Path input = getInputPath();
    Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
    Path output = getOutputPath();
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
        measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    Configuration conf = getConf();
    // clustersIn is used as host file
    MemCachedUtil.configHelper(conf, clusters.toUri().getPath());
    int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
    MemKMeansUtil.kmeansConfigHelper(conf, k);

    // create the seeds
    log.info("Create seeds.");
    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
        MemRandomSeedGenerator.buildRandom(getConf(), input,
            Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
    }

    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
        .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    if (getConf() == null) {
        setConf(new Configuration());
    }

    // run iteration
    run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering,
        runSequential);
    return 0;
}
From source file:org.qcri.pca.SPCADriver.java
License:Apache License
/**
 * The sampling rate that is used for computing the reconstruction error
 */
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(ROWSOPTION, "rows", "Number of rows");
    addOption(COLSOPTION, "cols", "Number of cols");
    addOption(PRINCIPALSOPTION, "pcs", "Number of principal components");
    addOption(SPLITFACTOROPTION, "sf", "Split each block to increase parallelism");
    addOption(ERRSAMPLE, "errSampleRate", "Sampling rate for computing the error (0-1]");
    addOption(MAXITER, "maxIter", "Maximum number of iterations before terminating, the default is 3");
    addOption(NORMALIZEOPTION, "normalize",
        "Choose whether you want the input matrix to be normalized or not, 1 means normalize, 0 means don't normalize");

    if (parseArguments(args) == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    final int nRows = Integer.parseInt(getOption(ROWSOPTION));
    final int nCols = Integer.parseInt(getOption(COLSOPTION));
    final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION));
    final int splitFactor;
    final int normalize;
    final int maxIterations;
    final float errSampleRate;

    if (hasOption(SPLITFACTOROPTION)) {
        splitFactor = Integer.parseInt(getOption(SPLITFACTOROPTION, "1"));
    } else {
        splitFactor = 1;
    }
    if (hasOption(ERRSAMPLE)) {
        errSampleRate = Float.parseFloat(getOption(ERRSAMPLE));
    } else {
        int length = String.valueOf(nRows).length();
        if (length <= 4) {
            errSampleRate = 1;
        } else {
            errSampleRate = (float) (1 / Math.pow(10, length - 4));
        }
        log.warn("error sampling rate set to: errRate=" + errSampleRate);
    }
    if (hasOption(MAXITER)) {
        maxIterations = Integer.parseInt(getOption(MAXITER));
    } else {
        maxIterations = 3;
    }
    if (hasOption(NORMALIZEOPTION)) {
        normalize = Integer.parseInt(getOption(NORMALIZEOPTION));
    } else {
        normalize = 0;
    }

    Configuration conf = getConf();
    if (conf == null) {
        throw new IOException("No Hadoop configuration present");
    }
    boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION)
        .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
    run(conf, input, output, nRows, nCols, nPCs, splitFactor, errSampleRate, maxIterations, normalize,
        runSequential);
    return 0;
}