List of usage examples for org.apache.mahout.common.commandline DefaultOptionCreator SEQUENTIAL_METHOD
String SEQUENTIAL_METHOD
To view the source code for org.apache.mahout.common.commandline.DefaultOptionCreator.SEQUENTIAL_METHOD, click the Source Link below each example.
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
/**
 * Command-line entry point: converts a directory of text files into a
 * Hadoop SequenceFile, either sequentially or via MapReduce.
 *
 * @param args raw command-line arguments
 * @return 0 on success, -1 if argument parsing fails
 * @throws Exception if conversion or output deletion fails
 */
@Override
public int run(String[] args) throws Exception {
    // Register the standard options plus execution-method and overwrite flags.
    addOptions();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Map<String, String> dirOptions = parseOptions();
    Path outputPath = getOutputPath();

    // Honor --overwrite by removing any previous output first.
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), outputPath);
    }

    // Default to MapReduce unless the user explicitly asked for sequential.
    String method = getOption(DefaultOptionCreator.METHOD_OPTION,
            DefaultOptionCreator.MAPREDUCE_METHOD);
    boolean sequential = method.equals(DefaultOptionCreator.SEQUENTIAL_METHOD);

    if (sequential) {
        runSequential(getConf(), getInputPath(), outputPath, dirOptions);
    } else {
        runMapReduce(getInputPath(), outputPath);
    }
    return 0;
}
From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();//from w ww . j a v a2s .c o m addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); addOption(DefaultOptionCreator.clustersInOption() .withDescription( "The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy. " + "If k is also specified, then a random set of vectors will be selected" + " and written out to this path first") .create()); addOption(DefaultOptionCreator.numClustersOption() .withDescription( "The k in k-Means. If specified, then a random selection of k Vectors will be chosen" + " as the Centroid and written to the clusters input path.") .create()); addOption(DefaultOptionCreator.convergenceOption().create()); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(DefaultOptionCreator.clusteringOption().create()); addOption(DefaultOptionCreator.methodOption().create()); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); Path output = getOutputPath(); String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); if (measureClass == null) { measureClass = SquaredEuclideanDistanceMeasure.class.getName(); } double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION)); int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION)); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); Configuration conf = getConf(); // clustersIn is used as host file MemCachedUtil.configHelper(conf, clusters.toUri().getPath()); int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)); MemKMeansUtil.kmeansConfigHelper(conf, 
k); // create the seeds log.info("Create seeds."); if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) { MemRandomSeedGenerator.buildRandom(getConf(), input, Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure); } boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION) .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD); if (getConf() == null) { setConf(new Configuration()); } // run iteration run(getConf(), input, clusters, output, measure, convergenceDelta, maxIterations, runClustering, runSequential); return 0; }
From source file:org.qcri.pca.SPCADriver.java
/** * The sampling rate that is used for computing the reconstruction error *///from ww w . ja va2 s.c o m @Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.methodOption().create()); addOption(ROWSOPTION, "rows", "Number of rows"); addOption(COLSOPTION, "cols", "Number of cols"); addOption(PRINCIPALSOPTION, "pcs", "Number of principal components"); addOption(SPLITFACTOROPTION, "sf", "Split each block to increase paralelism"); addOption(ERRSAMPLE, "errSampleRate", "Sampling rate for computing the error (0-1]"); addOption(MAXITER, "maxIter", "Maximum number of iterations before terminating, the default is 3"); addOption(NORMALIZEOPTION, "normalize", "Choose whether you want the input matrix to be normalized or not, 1 means normalize, 0 means don't normalize"); if (parseArguments(args) == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); final int nRows = Integer.parseInt(getOption(ROWSOPTION)); final int nCols = Integer.parseInt(getOption(COLSOPTION)); final int nPCs = Integer.parseInt(getOption(PRINCIPALSOPTION)); final int splitFactor; final int normalize; final int maxIterations; final float errSampleRate; if (hasOption(SPLITFACTOROPTION)) splitFactor = Integer.parseInt(getOption(SPLITFACTOROPTION, "1")); else splitFactor = 1; if (hasOption(ERRSAMPLE)) errSampleRate = Float.parseFloat(getOption(ERRSAMPLE)); else { int length = String.valueOf(nRows).length(); if (length <= 4) errSampleRate = 1; else errSampleRate = (float) (1 / Math.pow(10, length - 4)); log.warn("error sampling rate set to: errRate=" + errSampleRate); } if (hasOption(MAXITER)) maxIterations = Integer.parseInt(getOption(MAXITER)); else maxIterations = 3; if (hasOption(NORMALIZEOPTION)) normalize = Integer.parseInt(getOption(NORMALIZEOPTION)); else normalize = 0; Configuration conf = getConf(); if (conf == null) { throw new IOException("No Hadoop configuration present"); } boolean 
runSequential = getOption(DefaultOptionCreator.METHOD_OPTION) .equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD); run(conf, input, output, nRows, nCols, nPCs, splitFactor, errSampleRate, maxIterations, normalize, runSequential); return 0; }