Example usage for org.apache.commons.cli2 CommandLine getValue

List of usage examples for org.apache.commons.cli2 CommandLine getValue

Introduction

In this page you can find the example usage for org.apache.commons.cli2 CommandLine getValue.

Prototype

Object getValue(final Option option) throws IllegalStateException;

Source Link

Document

Retrieves the single Argument value associated with the specified Option

Usage

From source file:com.tamingtext.util.SplitInput.java

/** Configure this instance based on the command-line arguments contained within provided array. 
 * Calls {@link #validate()} to ensure consistency of configuration.
 * //from  w w  w . ja v a  2  s  . c om
 * @return true if the arguments were parsed successfully and execution should proceed.
 * @throws Exception if there is a problem parsing the command-line arguments or the particular
 *   combination would violate class invariants.
 */
public boolean parseArgs(String[] args) throws Exception {

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt = obuilder.withLongName("inputDir").withRequired(true)
            .withArgument(abuilder.withName("inputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory").withShortName("i").create();

    Option trainingOutputDirOpt = obuilder.withLongName("trainingOutputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The training data output directory").withShortName("tr").create();

    Option testOutputDirOpt = obuilder.withLongName("testOutputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The test data output directory").withShortName("te").create();

    Option testSplitSizeOpt = obuilder.withLongName("testSplitSize").withRequired(false)
            .withArgument(abuilder.withName("splitSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The number of documents held back as test data for each category")
            .withShortName("ss").create();

    Option testSplitPctOpt = obuilder.withLongName("testSplitPct").withRequired(false)
            .withArgument(abuilder.withName("splitPct").withMinimum(1).withMaximum(1).create())
            .withDescription("The percentage of documents held back as test data for each category")
            .withShortName("sp").create();

    Option splitLocationOpt = obuilder.withLongName("splitLocation").withRequired(false)
            .withArgument(abuilder.withName("splitLoc").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "Location for start of test data expressed as a percentage of the input file size (0=start, 50=middle, 100=end")
            .withShortName("sl").create();

    Option randomSelectionSizeOpt = obuilder.withLongName("randomSelectionSize").withRequired(false)
            .withArgument(abuilder.withName("randomSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The number of itemr to be randomly selected as test data ").withShortName("rs")
            .create();

    Option randomSelectionPctOpt = obuilder.withLongName("randomSelectionPct").withRequired(false)
            .withArgument(abuilder.withName("randomPct").withMinimum(1).withMaximum(1).create())
            .withDescription("Percentage of items to be randomly selected as test data ").withShortName("rp")
            .create();

    Option charsetOpt = obuilder.withLongName("charset").withRequired(true)
            .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
            .withDescription("The name of the character encoding of the input files").withShortName("c")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(trainingOutputDirOpt)
            .withOption(testOutputDirOpt).withOption(testSplitSizeOpt).withOption(testSplitPctOpt)
            .withOption(splitLocationOpt).withOption(randomSelectionSizeOpt).withOption(randomSelectionPctOpt)
            .withOption(charsetOpt).create();

    try {

        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return false;
        }

        inputDirectory = new Path((String) cmdLine.getValue(inputDirOpt));
        trainingOutputDirectory = new Path((String) cmdLine.getValue(trainingOutputDirOpt));
        testOutputDirectory = new Path((String) cmdLine.getValue(testOutputDirOpt));

        charset = Charset.forName((String) cmdLine.getValue(charsetOpt));

        if (cmdLine.hasOption(testSplitSizeOpt) && cmdLine.hasOption(testSplitPctOpt)) {
            throw new OptionException(testSplitSizeOpt,
                    "must have either split size or split percentage option, not BOTH");
        } else if (!cmdLine.hasOption(testSplitSizeOpt) && !cmdLine.hasOption(testSplitPctOpt)) {
            throw new OptionException(testSplitSizeOpt,
                    "must have either split size or split percentage option");
        }

        if (cmdLine.hasOption(testSplitSizeOpt)) {
            setTestSplitSize(Integer.parseInt((String) cmdLine.getValue(testSplitSizeOpt)));
        }

        if (cmdLine.hasOption(testSplitPctOpt)) {
            setTestSplitPct(Integer.parseInt((String) cmdLine.getValue(testSplitPctOpt)));
        }

        if (cmdLine.hasOption(splitLocationOpt)) {
            setSplitLocation(Integer.parseInt((String) cmdLine.getValue(splitLocationOpt)));
        }

        if (cmdLine.hasOption(randomSelectionSizeOpt)) {
            setTestRandomSelectionSize(Integer.parseInt((String) cmdLine.getValue(randomSelectionSizeOpt)));
        }

        if (cmdLine.hasOption(randomSelectionPctOpt)) {
            setTestRandomSelectionPct(Integer.parseInt((String) cmdLine.getValue(randomSelectionPctOpt)));
        }

        fs.mkdirs(trainingOutputDirectory);
        fs.mkdirs(testOutputDirectory);

    } catch (OptionException e) {
        log.error("Command-line option Exception", e);
        CommandLineUtil.printHelp(group);
        return false;
    }

    validate();
    return true;
}

From source file:com.ml.hadoop.nlp.SparseVectorsFromSequenceFiles.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option dictionaryPathOpt = obuilder.withLongName("dictionaryPath")
            .withArgument(abuilder.withName("dictionaryPath").withMinimum(1).withMaximum(1).create())
            .withDescription("Dictionary path for update TFIDF").withShortName("dp").create();

    Option docFrequencyPathOpt = obuilder.withLongName("docFrequencyPath")
            .withArgument(abuilder.withName("docFrequencyPath").withMinimum(1).withMaximum(1).create())
            .withDescription("Doc frequency path for update TFIDF").withShortName("dfp").create();

    Option tfVectorsPathOpt = obuilder.withLongName("tfVectorsPath")
            .withArgument(abuilder.withName("tfVectorsPath").withMinimum(1).withMaximum(1).create())
            .withDescription("TF Vectors path").withShortName("tfvp").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF , TFIDF or TFIDF_UPDATE")
            .withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(/*from   w w  w  . j  ava  2 s .  c om*/
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
                            + "it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                            + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less "
                            + "than 0 no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(dictionaryPathOpt).withOption(docFrequencyPathOpt).withOption(tfVectorsPathOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Changed... Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        //default process tfidf:1, tf:2, update tfidf:3
        int processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = 2;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = 1;
            } else if ("tfidf_update".equalsIgnoreCase(wString)) {
                processIdf = 3;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = 1;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }
        log.info("Tokenizing documents in {}", inputDir);
        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0 || maxDFPercent > 0.00;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
        log.info("Creating Term Frequency Vectors, prune {}", shouldPrune);

        String dictionaryPath = null;
        if (cmdLine.hasOption(dictionaryPathOpt)) {
            dictionaryPath = (String) cmdLine.getValue(dictionaryPathOpt);
            log.info("begin dic path {}", dictionaryPath);
        }

        if (processIdf == 1) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else if (processIdf == 3) {
            log.info("begin update term----------------");
            DictionaryVectorizer.createUpdateTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    maxNGramSize, dictionaryPath, norm, logNormalize, reduceTasks, sequentialAccessOutput,
                    namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }

        String docFrequencyPaths = null;
        if (cmdLine.hasOption(dictionaryPathOpt)) {
            docFrequencyPaths = (String) cmdLine.getValue(docFrequencyPathOpt);
            log.info("doc frequency path {}", docFrequencyPaths);
        }
        String tfVectorsPaths = null;
        if (cmdLine.hasOption(tfVectorsPathOpt)) {
            tfVectorsPaths = (String) cmdLine.getValue(tfVectorsPathOpt);
            log.info("tf vectors path {}", tfVectorsPaths);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (processIdf == 1) {
            log.info("Calculating IDF");
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
            log.info("...docFrequencyPathBase {}, docFrequencyFile {}", docFrequenciesFeatures.getFirst()[0],
                    docFrequenciesFeatures.getFirst()[1]);
        } else if (processIdf == 3) {
            // load docFrequency path
            List<Path> docFrequencyChunks = Lists.newArrayList();
            String[] paths = docFrequencyPaths.split(",");

            long featureCount = 0;
            for (String path : paths) {
                int splitPos = path.lastIndexOf("/");
                String docFrequencyPathBase = path.substring(0, splitPos);
                String docFrequencyFile = path.substring(splitPos + 1, path.length());
                log.info("docFrequencyPathBase {}, docFrequencyFile {}", docFrequencyPathBase,
                        docFrequencyFile);
                Path docFrequencyPath = new Path(docFrequencyPathBase, docFrequencyFile);
                docFrequencyChunks.add(docFrequencyPath);

                /*for (Pair<IntWritable, LongWritable> record
                         : new SequenceFileIterable<IntWritable, LongWritable>(docFrequencyPath, true, conf)) {
                     featureCount = Math.max(record.getFirst().get(), featureCount);
                 }*/
            }
            featureCount = 107623;
            featureCount++;

            long vectorCount = Long.MAX_VALUE;
            /*Path tfDirPath = new Path(tfVectorsPaths + "/part-r-00000");
            int i = 0;
            for (Pair<Text, VectorWritable> record
                     : new SequenceFileIterable<Text, VectorWritable>(tfDirPath, true, conf)) {
               i++;
             }
            if (i > 0) {
               vectorCount = i;
            }*/
            vectorCount = 80000;
            //read docFrequencyFile to get featureCount and vectorCount
            Long[] counts = { featureCount, vectorCount };
            log.info("featureCount {}, vectorCount------------------ {}", featureCount, vectorCount);
            docFrequenciesFeatures = new Pair<Long[], List<Path>>(counts, docFrequencyChunks);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            if (maxDFSigma >= 0.0) {
                Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
                Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

                // Calculate the standard deviation
                double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
                maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
            }

            long maxDFThreshold = (long) (vectorCount * (maxDF / 100.0f));

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            log.info("Pruning");
            if (processIdf == 1 || processIdf == 3) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf == 1 || processIdf == 3) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file:com.digitalpebble.behemoth.mahout.SparseVectorsFromBehemoth.java

public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option typeNameOpt = obuilder.withLongName("typeToken").withRequired(false)
            .withArgument(abuilder.withName("typeToken").withMinimum(1).withMaximum(1).create())
            .withDescription("The annotation type for Tokens").withShortName("t").create();

    Option featureNameOpt = obuilder.withLongName("featureName").withRequired(false)
            .withArgument(abuilder.withName("featureName").withMinimum(1).withMaximum(1).create())
            .withDescription(/*  w  w  w .j a v a2s . c om*/
                    "The name of the feature containing the token values, uses the text if unspecified")
            .withShortName("f").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                            + "  Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                            + "will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();

    Option labelMDOpt = obuilder.withLongName("labelMDKey").withRequired(false)
            .withArgument(abuilder.withName("label_md_key").create())
            .withDescription("Document metadata holding the label").withShortName("label").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(typeNameOpt)
            .withOption(featureNameOpt).withOption(analyzerNameOpt).withOption(chunkSizeOpt)
            .withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt).withOption(maxDFSigmaOpt)
            .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
            .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
            .withOption(helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt)
            .withOption(logNormalizeOpt).withOption(labelMDOpt).create();
    CommandLine cmdLine = null;
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        if (!cmdLine.hasOption(inputDirOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        if (!cmdLine.hasOption(outputDirOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return -1;
    }

    Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
    Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

    int chunkSize = 100;
    if (cmdLine.hasOption(chunkSizeOpt)) {
        chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
    }
    int minSupport = 2;
    if (cmdLine.hasOption(minSupportOpt)) {
        String minSupportString = (String) cmdLine.getValue(minSupportOpt);
        minSupport = Integer.parseInt(minSupportString);
    }

    int maxNGramSize = 1;

    if (cmdLine.hasOption(maxNGramSizeOpt)) {
        try {
            maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
        } catch (NumberFormatException ex) {
            log.warn("Could not parse ngram size option");
        }
    }
    log.info("Maximum n-gram size is: {}", maxNGramSize);

    if (cmdLine.hasOption(overwriteOutput)) {
        HadoopUtil.delete(getConf(), outputDir);
    }

    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
    if (cmdLine.hasOption(minLLROpt)) {
        minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
    }
    log.info("Minimum LLR value: {}", minLLRValue);

    int reduceTasks = 1;
    if (cmdLine.hasOption(numReduceTasksOpt)) {
        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
    }
    log.info("Number of reduce tasks: {}", reduceTasks);

    Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
    if (cmdLine.hasOption(analyzerNameOpt)) {
        String className = cmdLine.getValue(analyzerNameOpt).toString();
        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
        // try instantiating it, b/c there isn't any point in setting it
        // if
        // you can't instantiate it
        ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
    }

    String type = null;
    String featureName = "";
    if (cmdLine.hasOption(typeNameOpt)) {
        type = cmdLine.getValue(typeNameOpt).toString();
        Object tempFN = cmdLine.getValue(featureNameOpt);
        if (tempFN != null) {
            featureName = tempFN.toString();
            log.info("Getting tokens from " + type + "." + featureName.toString());
        } else
            log.info("Getting tokens from " + type);
    }

    boolean processIdf;

    if (cmdLine.hasOption(weightOpt)) {
        String wString = cmdLine.getValue(weightOpt).toString();
        if ("tf".equalsIgnoreCase(wString)) {
            processIdf = false;
        } else if ("tfidf".equalsIgnoreCase(wString)) {
            processIdf = true;
        } else {
            throw new OptionException(weightOpt);
        }
    } else {
        processIdf = true;
    }

    int minDf = 1;
    if (cmdLine.hasOption(minDFOpt)) {
        minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
    }
    int maxDFPercent = 99;
    if (cmdLine.hasOption(maxDFPercentOpt)) {
        maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
    }
    double maxDFSigma = -1.0;
    if (cmdLine.hasOption(maxDFSigmaOpt)) {
        maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
    }

    float norm = PartialVectorMerger.NO_NORMALIZING;
    if (cmdLine.hasOption(powerOpt)) {
        String power = cmdLine.getValue(powerOpt).toString();
        if ("INF".equals(power)) {
            norm = Float.POSITIVE_INFINITY;
        } else {
            norm = Float.parseFloat(power);
        }
    }

    boolean logNormalize = false;
    if (cmdLine.hasOption(logNormalizeOpt)) {
        logNormalize = true;
    }

    String labelMDKey = null;
    if (cmdLine.hasOption(labelMDOpt)) {
        labelMDKey = cmdLine.getValue(labelMDOpt).toString();
    }

    Configuration conf = getConf();
    Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

    // no annotation type degfin
    if (type != null) {
        BehemothDocumentProcessor.tokenizeDocuments(inputDir, type, featureName, tokenizedPath);
    }
    // no annotation type defined : rely on Lucene's analysers
    else {
        BehemothDocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);
    }
    boolean sequentialAccessOutput = false;
    if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
        sequentialAccessOutput = true;
    }

    boolean namedVectors = false;
    if (cmdLine.hasOption(namedVectorOpt)) {
        namedVectors = true;
    }
    boolean shouldPrune = maxDFSigma >= 0.0;
    String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
            : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

    try {
        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }
        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; // if we are pruning by std dev, then
                                   // this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }

        // dump labels?
        if (labelMDKey != null) {
            conf.set(BehemothDocumentProcessor.MD_LABEL, labelMDKey);
            BehemothDocumentProcessor.dumpLabels(inputDir, new Path(outputDir, "labels"), conf);
        }
    } catch (RuntimeException e) {
        Log.error("Exception caught", e);
        return -1;
    }

    return 0;
}

From source file:org.apache.mahout.avro.text.AvroDocumentsFromDirectory.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser p = new GenericOptionsParser(conf, args);
    args = p.getRemainingArgs();// w  w  w.jav a 2  s .  c  o  m

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option parentOpt = obuilder.withLongName("parent").withRequired(true)
            .withArgument(abuilder.withName("parent").withMinimum(1).withMaximum(1).create())
            .withDescription("Parent dir containing the documents").withShortName("p").create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in documents. Defaults to " + DEFAULT_CHUNK_SIZE)
            .withShortName("chunk").create();

    Option keyPrefixOpt = obuilder.withLongName("keyPrefix")
            .withArgument(abuilder.withName("keyPrefix").withMinimum(1).withMaximum(1).create())
            .withDescription("The prefix to be prepended to the key").withShortName("prefix").create();

    Option charsetOpt = obuilder.withLongName("charset").withRequired(true)
            .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
            .withDescription("The name of the character encoding of the input files").withShortName("c")
            .create();

    Group group = gbuilder.withName("Options").withOption(keyPrefixOpt).withOption(chunkSizeOpt)
            .withOption(charsetOpt).withOption(outputDirOpt).withOption(parentOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    File parentDir = new File((String) cmdLine.getValue(parentOpt));
    String outputDir = (String) cmdLine.getValue(outputDirOpt);

    int documentsPerChunk = DEFAULT_CHUNK_SIZE;
    if (cmdLine.hasOption(chunkSizeOpt)) {
        documentsPerChunk = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
    }

    String prefix = "";
    if (cmdLine.hasOption(keyPrefixOpt)) {
        prefix = (String) cmdLine.getValue(keyPrefixOpt);
    }
    Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
    AvroDocumentsFromDirectory dir = new AvroDocumentsFromDirectory();
    dir.createAvroDocuments(conf, parentDir, outputDir, prefix, documentsPerChunk, charset);
}

From source file:org.apache.mahout.avro.text.mapred.WikipediaToAvroDocuments.java

/**
 * Takes in two arguments://from  w w  w.j  a  v a  2 s. c o  m
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents
 * live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the
 * classifier as a {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 * 
 * @param args
 *          The args
 */
public int run(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory path").withShortName("i").create();

    Option dirOutputPathOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory Path").withShortName("o").create();

    Option categoriesOpt = obuilder.withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c").create();

    Option exactMatchOpt = obuilder.withLongName("exactMatch")
            .withDescription("If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e").create();

    Option allOpt = obuilder.withLongName("all").withDescription("If set, Select all files. Default is false")
            .withShortName("all").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    try {
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return 0;
        }

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        String catFile = "";
        if (cmdLine.hasOption(categoriesOpt)) {
            catFile = (String) cmdLine.getValue(categoriesOpt);
        }

        boolean all = false;
        if (cmdLine.hasOption(allOpt)) {
            all = true;
        }
        runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return 0;
    }

    return 1;
}

From source file:org.apache.mahout.benchmark.VectorBenchmarks.java

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option vectorSizeOpt = obuilder.withLongName("vectorSize").withRequired(false)
            .withArgument(abuilder.withName("vs").withDefault(1000000).create())
            .withDescription("Cardinality of the vector. Default: 1000000").withShortName("vs").create();
    Option numNonZeroOpt = obuilder.withLongName("numNonZero").withRequired(false)
            .withArgument(abuilder.withName("nz").withDefault(1000).create())
            .withDescription("Size of the vector. Default: 1000").withShortName("nz").create();
    Option numVectorsOpt = obuilder.withLongName("numVectors").withRequired(false)
            .withArgument(abuilder.withName("nv").withDefault(25).create())
            .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create();
    Option numClustersOpt = obuilder.withLongName("numClusters").withRequired(false)
            .withArgument(abuilder.withName("nc").withDefault(0).create())
            .withDescription(//from w w w. j a  va 2 s  . co m
                    "Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0")
            .withShortName("nc").create();
    Option numOpsOpt = obuilder.withLongName("numOps").withRequired(false)
            .withArgument(abuilder.withName("numOps").withDefault(10).create())
            .withDescription("Number of operations to do per timer. "
                    + "E.g In distance measure, the distance is calculated numOps times"
                    + " and the total time is measured. Default: 10")
            .withShortName("no").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt)
            .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelpWithGenericOptions(group);
            return;
        }

        int cardinality = 1000000;
        if (cmdLine.hasOption(vectorSizeOpt)) {
            cardinality = Integer.parseInt((String) cmdLine.getValue(vectorSizeOpt));

        }

        int numClusters = 0;
        if (cmdLine.hasOption(numClustersOpt)) {
            numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt));
        }

        int numNonZero = 1000;
        if (cmdLine.hasOption(numNonZeroOpt)) {
            numNonZero = Integer.parseInt((String) cmdLine.getValue(numNonZeroOpt));
        }

        int numVectors = 25;
        if (cmdLine.hasOption(numVectorsOpt)) {
            numVectors = Integer.parseInt((String) cmdLine.getValue(numVectorsOpt));

        }

        int numOps = 10;
        if (cmdLine.hasOption(numOpsOpt)) {
            numOps = Integer.parseInt((String) cmdLine.getValue(numOpsOpt));

        }
        VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps);
        runBenchmark(mark);

        // log.info("\n{}", mark);
        log.info("\n{}", mark.asCsvString());
    } catch (OptionException e) {
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.cf.taste.example.TasteOptionParser.java

/**
 * Parse the given command line arguments.
 * @param args the arguments as given to the application.
 * @return the input file if a file was given on the command line, null otherwise.
 */// w  ww  .ja v  a 2 s .c o  m
public static File getRatings(String[] args) throws OptionException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(false).withShortName("i")
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The Path for input data directory.").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return null;
    }

    return cmdLine.hasOption(inputOpt) ? new File(cmdLine.getValue(inputOpt).toString()) : null;
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor.java

/**
 * Execute a bayes classification job. Input and output path are parsed from the input parameters.
 * /*from w w  w  .j  av  a  2  s .co  m*/
 * @param args
 *          input parameters.
 * @param job
 *          the job to execute.
 * @throws Exception
 *           any exception thrown at job execution.
 * */
public static void execute(String[] args, BayesJob job) throws IOException {
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = DefaultOptionCreator.inputOption().create();
    Option outputOpt = DefaultOptionCreator.outputOption().create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(helpOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path input = new Path(cmdLine.getValue(inputOpt).toString());
        Path output = new Path(cmdLine.getValue(outputOpt).toString());

        BayesParameters bayesParams = new BayesParameters();
        bayesParams.setGramSize(1);
        job.runJob(input, output, bayesParams);
    } catch (OptionException e) {
        log.error(e.getMessage());
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups.java

public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Option parentOpt = obuilder.withLongName("parent").withRequired(true)
            .withArgument(abuilder.withName("parent").withMinimum(1).withMaximum(1).create())
            .withDescription("Parent dir containing the newsgroups").withShortName("p").create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName").withRequired(true)
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option charsetOpt = obuilder.withLongName("charset").withRequired(true)
            .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
            .withDescription("The name of the character encoding of the input files").withShortName("c")
            .create();//from w w  w .jav  a  2s  .  c  o m

    Group group = gbuilder.withName("Options").withOption(analyzerNameOpt).withOption(charsetOpt)
            .withOption(outputDirOpt).withOption(parentOpt).withOption(helpOpt).create();
    try {

        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File parentDir = new File((String) cmdLine.getValue(parentOpt));
        File outputDir = new File((String) cmdLine.getValue(outputDirOpt));
        String analyzerName = (String) cmdLine.getValue(analyzerNameOpt);
        Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
        Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
        // parent dir contains dir by category
        if (!parentDir.exists()) {
            throw new FileNotFoundException("Can't find input directory " + parentDir);
        }
        File[] categoryDirs = parentDir.listFiles();
        for (File dir : categoryDirs) {
            if (dir.isDirectory()) {
                if (!outputDir.exists() && !outputDir.mkdirs()) {
                    throw new IllegalStateException("Can't create output directory");
                }

                File outputFile = new File(outputDir, dir.getName() + ".txt");
                BayesFileFormatter.collapse(dir.getName(), analyzer, dir, charset, outputFile);
            }
        }
    } catch (OptionException e) {
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.classifier.bayes.TestClassifier.java

public static void main(String[] args) throws IOException, InvalidDatastoreException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option pathOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
            .withDescription("The path on HDFS as defined by the -source parameter").withShortName("m")
            .create();/*from w ww. j  a  va2s .c  o  m*/

    Option dirOpt = obuilder.withLongName("testDir").withRequired(true)
            .withArgument(abuilder.withName("testDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory where test documents resides in").withShortName("d").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option encodingOpt = obuilder.withLongName("encoding")
            .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
            .withDescription("The file encoding.  Defaults to UTF-8").withShortName("e").create();

    Option defaultCatOpt = obuilder.withLongName("defaultCat")
            .withArgument(abuilder.withName("defaultCat").withMinimum(1).withMaximum(1).create())
            .withDescription("The default category Default Value: unknown").withShortName("default").create();

    Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Size of the n-gram. Default Value: 1").withShortName("ng").create();

    Option alphaOpt = obuilder.withLongName("alpha").withRequired(false)
            .withArgument(abuilder.withName("a").withMinimum(1).withMaximum(1).create())
            .withDescription("Smoothing parameter Default Value: 1.0").withShortName("a").create();

    Option verboseOutputOpt = obuilder.withLongName("verbose").withRequired(false)
            .withDescription("Output which values were correctly and incorrectly classified").withShortName("v")
            .create();

    Option typeOpt = obuilder.withLongName("classifierType").withRequired(false)
            .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
            .withDescription("Type of classifier: bayes|cbayes. Default Value: bayes").withShortName("type")
            .create();

    Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(false)
            .withArgument(abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of model: hdfs").withShortName("source").create();

    Option methodOpt = obuilder.withLongName("method").withRequired(false)
            .withArgument(abuilder.withName("method").withMinimum(1).withMaximum(1).create())
            .withDescription("Method of Classification: sequential|mapreduce. Default Value: mapreduce")
            .withShortName("method").create();

    Option confusionMatrixOpt = obuilder.withLongName("confusionMatrix").withRequired(false)
            .withArgument(abuilder.withName("confusionMatrix").withMinimum(1).withMaximum(1).create())
            .withDescription("Export ConfusionMatrix as SequenceFile").withShortName("cm").create();

    Group group = gbuilder.withName("Options").withOption(defaultCatOpt).withOption(dirOpt)
            .withOption(encodingOpt).withOption(gramSizeOpt).withOption(pathOpt).withOption(typeOpt)
            .withOption(dataSourceOpt).withOption(helpOpt).withOption(methodOpt).withOption(verboseOutputOpt)
            .withOption(alphaOpt).withOption(confusionMatrixOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        BayesParameters params = new BayesParameters();
        // Setting all default values
        int gramSize = 1;

        String modelBasePath = (String) cmdLine.getValue(pathOpt);

        if (cmdLine.hasOption(gramSizeOpt)) {
            gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt));

        }

        String classifierType = "bayes";
        if (cmdLine.hasOption(typeOpt)) {
            classifierType = (String) cmdLine.getValue(typeOpt);
        }

        String dataSource = "hdfs";
        if (cmdLine.hasOption(dataSourceOpt)) {
            dataSource = (String) cmdLine.getValue(dataSourceOpt);
        }

        String defaultCat = "unknown";
        if (cmdLine.hasOption(defaultCatOpt)) {
            defaultCat = (String) cmdLine.getValue(defaultCatOpt);
        }

        String encoding = "UTF-8";
        if (cmdLine.hasOption(encodingOpt)) {
            encoding = (String) cmdLine.getValue(encodingOpt);
        }

        String alphaI = "1.0";
        if (cmdLine.hasOption(alphaOpt)) {
            alphaI = (String) cmdLine.getValue(alphaOpt);
        }

        boolean verbose = cmdLine.hasOption(verboseOutputOpt);

        String testDirPath = (String) cmdLine.getValue(dirOpt);

        String classificationMethod = "mapreduce";
        if (cmdLine.hasOption(methodOpt)) {
            classificationMethod = (String) cmdLine.getValue(methodOpt);
        }

        String confusionMatrixFile = null;
        if (cmdLine.hasOption(confusionMatrixOpt)) {
            confusionMatrixFile = (String) cmdLine.getValue(confusionMatrixOpt);
        }

        params.setGramSize(gramSize);
        params.set("verbose", Boolean.toString(verbose));
        params.setBasePath(modelBasePath);
        params.set("classifierType", classifierType);
        params.set("dataSource", dataSource);
        params.set("defaultCat", defaultCat);
        params.set("encoding", encoding);
        params.set("alpha_i", alphaI);
        params.set("testDirPath", testDirPath);
        params.set("confusionMatrix", confusionMatrixFile);

        if ("sequential".equalsIgnoreCase(classificationMethod)) {
            classifySequential(params);
        } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) {
            classifyParallel(params);
        }
    } catch (OptionException e) {
        CommandLineUtil.printHelp(group);
    }
}