Example usage for org.apache.mahout.common.commandline DefaultOptionCreator overwriteOption

Introduction

On this page you can find example usage of org.apache.mahout.common.commandline.DefaultOptionCreator.overwriteOption().

Prototype

public static DefaultOptionBuilder overwriteOption() 

Document

Returns a default command-line option for overwriting the output directory.
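
Below is a minimal sketch of the typical pattern, not taken from any of the examples on this page (the MyDriver class name and the placeholder job logic are illustrative assumptions): register the option, parse the arguments, and clear the output directory when the overwrite flag is passed.

import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;

public class MyDriver extends AbstractJob {

    @Override
    public int run(String[] args) throws Exception {
        addInputOption();
        addOutputOption();
        // register the overwrite flag
        addOption(DefaultOptionCreator.overwriteOption().create());

        if (parseArguments(args) == null) {
            return -1;
        }
        // if the overwrite flag was passed, delete any previous output before running
        if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
            HadoopUtil.delete(getConf(), getOutputPath());
        }
        // ... job-specific logic would go here ...
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new MyDriver(), args);
    }
}

A driver like this would then be launched through Hadoop's ToolRunner; passing --overwrite on the command line triggers the HadoopUtil.delete call shown above, as the real examples below do.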

Usage

From source file: chapter5.KMeanSample.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.numClustersOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, String> argMap = parseArguments(args);
    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
        measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
        int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
        run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
    } else {
        double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
        double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
        run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
    }
    return 0;
}

From source file: cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> argMap = parseArguments(args);
    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(new Configuration(), output);
    }
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    run(input, output, measure, t1, t2);
    return 0;
}

From source file: com.elex.dmp.lda.CVB0Driver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", "0");
    addOption(DefaultOptionCreator.overwriteOption().create());

    addOption(NUM_TOPICS, "k", "Number of topics to learn", true);
    addOption(NUM_TERMS, "nt", "Vocabulary size", false);
    addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", "0.0001");
    addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", "0.0001");
    addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false);
    addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false);
    addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false);
    addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", "10");
    addOption(RANDOM_SEED, "seed", "Random seed", false);
    addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", "0");
    addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", "4");
    addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", "1");
    addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning",
            "10");
    addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation", "10");
    addOption(buildOption(BACKFILL_PERPLEXITY, null, "enable backfilling of missing perplexity values", false,
            false, null));

    if (parseArguments(args) == null) {
        return -1;
    }

    int numTopics = Integer.parseInt(getOption(NUM_TOPICS));
    Path inputPath = getInputPath();
    Path topicModelOutputPath = getOutputPath();
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    int iterationBlockSize = Integer.parseInt(getOption(ITERATION_BLOCK_SIZE));
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    double alpha = Double.parseDouble(getOption(DOC_TOPIC_SMOOTHING));
    double eta = Double.parseDouble(getOption(TERM_TOPIC_SMOOTHING));
    int numTrainThreads = Integer.parseInt(getOption(NUM_TRAIN_THREADS));
    int numUpdateThreads = Integer.parseInt(getOption(NUM_UPDATE_THREADS));
    int maxItersPerDoc = Integer.parseInt(getOption(MAX_ITERATIONS_PER_DOC));
    Path dictionaryPath = hasOption(DICTIONARY) ? new Path(getOption(DICTIONARY)) : null;
    int numTerms = hasOption(NUM_TERMS) ? Integer.parseInt(getOption(NUM_TERMS))
            : getNumTerms(getConf(), dictionaryPath);
    Path docTopicOutputPath = hasOption(DOC_TOPIC_OUTPUT) ? new Path(getOption(DOC_TOPIC_OUTPUT)) : null;
    Path modelTempPath = hasOption(MODEL_TEMP_DIR) ? new Path(getOption(MODEL_TEMP_DIR))
            : getTempPath("topicModelState");
    long seed = hasOption(RANDOM_SEED) ? Long.parseLong(getOption(RANDOM_SEED)) : System.nanoTime() % 10000;
    float testFraction = hasOption(TEST_SET_FRACTION) ? Float.parseFloat(getOption(TEST_SET_FRACTION)) : 0.0f;
    int numReduceTasks = Integer.parseInt(getOption(NUM_REDUCE_TASKS));
    boolean backfillPerplexity = hasOption(BACKFILL_PERPLEXITY);

    return run(getConf(), inputPath, topicModelOutputPath, numTopics, numTerms, alpha, eta, maxIterations,
            iterationBlockSize, convergenceDelta, dictionaryPath, docTopicOutputPath, modelTempPath, seed,
            testFraction, numTrainThreads, numUpdateThreads, maxItersPerDoc, numReduceTasks,
            backfillPerplexity);
}

From source file: com.eniyitavsiye.mahoutx.hadoop.Job.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.numClustersOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> argMap = parseArguments(args);
    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
        measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
        int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
        run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
    } else {
        double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
        double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
        run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
    }
    return 0;
}

From source file: com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("model", "m", "The path to the model built during training", true);
    addOption(
            buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
    addOption("labelIndex", "l", "The path to the location of the label index", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    boolean complementary = hasOption("testComplementary");
    boolean sequential = hasOption("runSequential");
    if (sequential) {
        FileSystem fs = FileSystem.get(getConf());
        NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
        AbstractNaiveBayesClassifier classifier;
        if (complementary) {
            classifier = new ComplementaryNaiveBayesClassifier(model);
        } else {
            classifier = new StandardNaiveBayesClassifier(model);
        }
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class,
                VectorWritable.class);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, getInputPath(), getConf());
        Text key = new Text();
        VectorWritable vw = new VectorWritable();
        while (reader.next(key, vw)) {
            writer.append(new Text(SLASH.split(key.toString())[1]),
                    new VectorWritable(classifier.classifyFull(vw.get())));
        }
        writer.close();
        reader.close();
    } else {
        boolean succeeded = runMapReduce(parsedArgs);
        if (!succeeded) {
            return -1;
        }
    }

    //load the labels
    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

    //loop over the results and create the confusion matrix
    SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>(
            getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
    analyzeResults(labelMap, dirIterable, analyzer);

    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
    return 0;
}

From source file: com.missionsky.scp.dataanalysis.mahout.TestNaiveBayesDriver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("model", "m", "The path to the model built during training", true);
    addOption(
            buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
    addOption("labelIndex", "l", "The path to the location of the label index", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    boolean complementary = hasOption("testComplementary");
    boolean sequential = hasOption("runSequential");
    if (sequential) {
        FileSystem fs = FileSystem.get(getConf());
        NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
        AbstractNaiveBayesClassifier classifier;
        if (complementary) {
            classifier = new ComplementaryNaiveBayesClassifier(model);
        } else {
            classifier = new StandardNaiveBayesClassifier(model);
        }
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class,
                VectorWritable.class);
        Reader reader = new Reader(fs, getInputPath(), getConf());
        Text key = new Text();
        VectorWritable vw = new VectorWritable();
        while (reader.next(key, vw)) {
            writer.append(new Text(SLASH.split(key.toString())[1]),
                    new VectorWritable(classifier.classifyFull(vw.get())));
        }
        writer.close();
        reader.close();
    } else {
        boolean succeeded = runMapReduce(parsedArgs);
        if (!succeeded) {
            return -1;
        }
    }

    //load the labels
    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

    //loop over the results and create the confusion matrix
    SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>(
            getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
    analyzeResults(labelMap, dirIterable, analyzer);

    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
    return 0;
}

From source file: com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption(LABELS, "l", "comma-separated list of labels to include in training", false);

    addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(
            buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
    }
    Path labPath;
    String labPathStr = getOption(LABEL_INDEX);
    if (labPathStr != null) {
        labPath = new Path(labPathStr);
    } else {
        labPath = getTempPath(LABEL_INDEX);
    }
    long labelSize = createLabelIndex(labPath);
    float alphaI = Float.parseFloat(getOption(ALPHA_I));
    boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY));

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());

    //add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
            SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
            VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }
    //sum up all the weights from the previous step, per label and per feature
    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    //put the per label and per feature vectors into the cache
    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

    //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors --
    // TODO: add reference here to the part of the Rennie paper that discusses this
    Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
            SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    thetaSummer.setCombinerClass(VectorSumReducer.class);
    thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
    thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
    /* TODO(robinanil): Enable this when thetanormalization works.
    succeeded = thetaSummer.waitForCompletion(true);
    if (!succeeded) {
      return -1;
    }*/

    //validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(getOutputPath(), getConf());

    return 0;
}

From source file: com.netease.news.text.SequenceFilesFromDirectory.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addOptions();
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    if (parseArguments(args) == null) {
        return -1;
    }

    Map<String, String> options = parseOptions();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }

    if (getOption(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD)
            .equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) {
        runSequential(getConf(), getInputPath(), output, options);
    } else {
        runMapReduce(getInputPath(), output);
    }

    return 0;
}

From source file: com.netease.news.text.SequenceFilesFromDirectory.java

License: Apache License

/**
 * Override this method in order to add additional options to the command
 * line of the SequenceFilesFromDirectory job. Do not forget to call super(),
 * otherwise all standard options (input/output dirs etc.) will not be
 * available.
 */
protected void addOptions() {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
    addOption(FILE_FILTER_CLASS_OPTION[0], FILE_FILTER_CLASS_OPTION[1],
            "The name of the class to use for file parsing. Default: " + PREFIX_ADDITION_FILTER,
            PREFIX_ADDITION_FILTER);
    addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
    addOption(CHARSET_OPTION[0], CHARSET_OPTION[1],
            "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
}
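
As the Javadoc above notes, a subclass that adds its own options must call super. A minimal, hypothetical sketch (the MySequenceFilesJob class name and the extra option are illustrative only, not part of the source above):

public class MySequenceFilesJob extends SequenceFilesFromDirectory {

    @Override
    protected void addOptions() {
        // keep the standard options (input/output dirs, --overwrite, --method, chunk size, ...)
        super.addOptions();
        // then register any job-specific options
        addOption("language", "lang", "Language of the input documents", "en");
    }
}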

From source file: com.netease.news.utils.SplitInput.java

License: Apache License

/**
 * Configure this instance based on the command-line arguments contained within the provided array.
 * Calls {@link #validate()} to ensure consistency of configuration.
 *
 * @return true if the arguments were parsed successfully and execution should proceed.
 * @throws Exception if there is a problem parsing the command-line arguments or the particular
 *                   combination would violate class invariants.
 */
private boolean parseArgs(String[] args) throws Exception {

    addInputOption();
    addOption("trainingOutput", "tr", "The training data output directory", false);
    addOption("testOutput", "te", "The test data output directory", false);
    addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
    addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
    addOption("splitLocation", "sl",
            "Location for start of test data expressed as a percentage of the input file "
                    + "size (0=start, 50=middle, 100=end",
            false);
    addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
    addOption("randomSelectionPct", "rp",
            "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false);
    addOption("charset", "c",
            "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)",
            false);
    addOption(buildOption("sequenceFiles", "seq",
            "Set if the input files are sequence files.  Default is false", false, false, "false"));
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    //TODO: extend this to sequential mode
    addOption("keepPct", "k",
            "The percentage of total data to keep in map-reduce mode, the rest will be ignored.  "
                    + "Default is 100%",
            false);
    addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);

    if (parseArguments(args) == null) {
        return false;
    }

    try {
        inputDirectory = getInputPath();

        useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION)
                .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD);

        if (useMapRed) {
            if (!hasOption("randomSelectionPct")) {
                throw new OptionException(getCLIOption("randomSelectionPct"),
                        "must set randomSelectionPct when mapRed option is used");
            }
            if (!hasOption("mapRedOutputDir")) {
                throw new OptionException(getCLIOption("mapRedOutputDir"),
                        "mapRedOutputDir must be set when mapRed option is used");
            }
            mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
            if (hasOption("keepPct")) {
                keepPct = Integer.parseInt(getOption("keepPct"));
            }
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(getConf(), mapRedOutputDirectory);
            }
        } else {
            if (!hasOption("trainingOutput") || !hasOption("testOutput")) {
                throw new OptionException(getCLIOption("trainingOutput"),
                        "trainingOutput and testOutput must be set if mapRed option is not used");
            }
            if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct")
                    && !hasOption("randomSelectionSize")) {
                throw new OptionException(getCLIOption("testSplitSize"),
                        "must set one of test split size/percentage or randomSelectionSize/percentage");
            }

            trainingOutputDirectory = new Path(getOption("trainingOutput"));
            testOutputDirectory = new Path(getOption("testOutput"));
            FileSystem fs = trainingOutputDirectory.getFileSystem(getConf());
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(fs.getConf(), trainingOutputDirectory);
                HadoopUtil.delete(fs.getConf(), testOutputDirectory);
            }
            fs.mkdirs(trainingOutputDirectory);
            fs.mkdirs(testOutputDirectory);
        }

        if (hasOption("charset")) {
            charset = Charset.forName(getOption("charset"));
        }

        if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
            throw new OptionException(getCLIOption("testSplitPct"),
                    "must have either split size or split percentage " + "option, not BOTH");
        }

        if (hasOption("testSplitSize")) {
            setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
        }

        if (hasOption("testSplitPct")) {
            setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
        }

        if (hasOption("splitLocation")) {
            setSplitLocation(Integer.parseInt(getOption("splitLocation")));
        }

        if (hasOption("randomSelectionSize")) {
            setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
        }

        if (hasOption("randomSelectionPct")) {
            setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
        }

        useSequence = hasOption("sequenceFiles");

    } catch (OptionException e) {
        log.error("Command-line option Exception", e);
        CommandLineUtil.printHelp(getGroup());
        return false;
    }

    validate();
    return true;
}