Example usage for org.apache.commons.cli2 CommandLine hasOption

Introduction

In this page you can find the example usage for org.apache.commons.cli2 CommandLine hasOption.

Prototype

boolean hasOption(final Option option);

Source Link

Document

Detects the presence of an option in this CommandLine.

Usage

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromTokenizedDoc.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(//from www  .j  a v  a 2 s.  co  m
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                            + "  Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                            + "will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it
            // if
            // you can't instantiate it
            ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }

        /* modification starts here */
        Configuration conf = getConf();
        //         Path tokenizedPath = new Path(outputDir,
        //               DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //         DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
        //               tokenizedPath, conf);
        Path tokenizedPath = inputDir;
        /* end modification */

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }
        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent;// if we are pruning by std dev, then this
        // will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0D, conf);
            maxDF = (int) (maxDFSigma * stdDev);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file:com.netease.news.vectorizer.SparseVectorsFromSequenceFiles.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(/*  w  w  w .  j a va2  s  . c o  m*/
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
                            + "it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                            + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less "
                            + "than 0 no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = IKAnalyzer.class;
        //      Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }
        log.info("Tokenizing documents in {}", inputDir);
        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom
        // to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0 || maxDFPercent > 0.00;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
        log.info("Creating Term Frequency Vectors");
        if (processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            log.info("Calculating IDF");
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            if (maxDFSigma >= 0.0) {
                Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
                Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

                // Calculate the standard deviation
                double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
                maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
            }

            long maxDFThreshold = (long) (vectorCount * (maxDF / 100.0f));

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            log.info("Pruning");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file:com.tamingtext.util.SplitInput.java

/** Configure this instance based on the command-line arguments contained within provided array. 
 * Calls {@link #validate()} to ensure consistency of configuration.
 * //  www . java  2 s . co m
 * @return true if the arguments were parsed successfully and execution should proceed.
 * @throws Exception if there is a problem parsing the command-line arguments or the particular
 *   combination would violate class invariants.
 */
public boolean parseArgs(String[] args) throws Exception {

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt = obuilder.withLongName("inputDir").withRequired(true)
            .withArgument(abuilder.withName("inputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory").withShortName("i").create();

    Option trainingOutputDirOpt = obuilder.withLongName("trainingOutputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The training data output directory").withShortName("tr").create();

    Option testOutputDirOpt = obuilder.withLongName("testOutputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The test data output directory").withShortName("te").create();

    Option testSplitSizeOpt = obuilder.withLongName("testSplitSize").withRequired(false)
            .withArgument(abuilder.withName("splitSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The number of documents held back as test data for each category")
            .withShortName("ss").create();

    Option testSplitPctOpt = obuilder.withLongName("testSplitPct").withRequired(false)
            .withArgument(abuilder.withName("splitPct").withMinimum(1).withMaximum(1).create())
            .withDescription("The percentage of documents held back as test data for each category")
            .withShortName("sp").create();

    Option splitLocationOpt = obuilder.withLongName("splitLocation").withRequired(false)
            .withArgument(abuilder.withName("splitLoc").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "Location for start of test data expressed as a percentage of the input file size (0=start, 50=middle, 100=end")
            .withShortName("sl").create();

    Option randomSelectionSizeOpt = obuilder.withLongName("randomSelectionSize").withRequired(false)
            .withArgument(abuilder.withName("randomSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The number of itemr to be randomly selected as test data ").withShortName("rs")
            .create();

    Option randomSelectionPctOpt = obuilder.withLongName("randomSelectionPct").withRequired(false)
            .withArgument(abuilder.withName("randomPct").withMinimum(1).withMaximum(1).create())
            .withDescription("Percentage of items to be randomly selected as test data ").withShortName("rp")
            .create();

    Option charsetOpt = obuilder.withLongName("charset").withRequired(true)
            .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
            .withDescription("The name of the character encoding of the input files").withShortName("c")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputDirOpt).withOption(trainingOutputDirOpt)
            .withOption(testOutputDirOpt).withOption(testSplitSizeOpt).withOption(testSplitPctOpt)
            .withOption(splitLocationOpt).withOption(randomSelectionSizeOpt).withOption(randomSelectionPctOpt)
            .withOption(charsetOpt).create();

    try {

        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return false;
        }

        inputDirectory = new Path((String) cmdLine.getValue(inputDirOpt));
        trainingOutputDirectory = new Path((String) cmdLine.getValue(trainingOutputDirOpt));
        testOutputDirectory = new Path((String) cmdLine.getValue(testOutputDirOpt));

        charset = Charset.forName((String) cmdLine.getValue(charsetOpt));

        if (cmdLine.hasOption(testSplitSizeOpt) && cmdLine.hasOption(testSplitPctOpt)) {
            throw new OptionException(testSplitSizeOpt,
                    "must have either split size or split percentage option, not BOTH");
        } else if (!cmdLine.hasOption(testSplitSizeOpt) && !cmdLine.hasOption(testSplitPctOpt)) {
            throw new OptionException(testSplitSizeOpt,
                    "must have either split size or split percentage option");
        }

        if (cmdLine.hasOption(testSplitSizeOpt)) {
            setTestSplitSize(Integer.parseInt((String) cmdLine.getValue(testSplitSizeOpt)));
        }

        if (cmdLine.hasOption(testSplitPctOpt)) {
            setTestSplitPct(Integer.parseInt((String) cmdLine.getValue(testSplitPctOpt)));
        }

        if (cmdLine.hasOption(splitLocationOpt)) {
            setSplitLocation(Integer.parseInt((String) cmdLine.getValue(splitLocationOpt)));
        }

        if (cmdLine.hasOption(randomSelectionSizeOpt)) {
            setTestRandomSelectionSize(Integer.parseInt((String) cmdLine.getValue(randomSelectionSizeOpt)));
        }

        if (cmdLine.hasOption(randomSelectionPctOpt)) {
            setTestRandomSelectionPct(Integer.parseInt((String) cmdLine.getValue(randomSelectionPctOpt)));
        }

        fs.mkdirs(trainingOutputDirectory);
        fs.mkdirs(testOutputDirectory);

    } catch (OptionException e) {
        log.error("Command-line option Exception", e);
        CommandLineUtil.printHelp(group);
        return false;
    }

    validate();
    return true;
}

From source file:com.ml.hadoop.nlp.SparseVectorsFromSequenceFiles.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option dictionaryPathOpt = obuilder.withLongName("dictionaryPath")
            .withArgument(abuilder.withName("dictionaryPath").withMinimum(1).withMaximum(1).create())
            .withDescription("Dictionary path for update TFIDF").withShortName("dp").create();

    Option docFrequencyPathOpt = obuilder.withLongName("docFrequencyPath")
            .withArgument(abuilder.withName("docFrequencyPath").withMinimum(1).withMaximum(1).create())
            .withDescription("Doc frequency path for update TFIDF").withShortName("dfp").create();

    Option tfVectorsPathOpt = obuilder.withLongName("tfVectorsPath")
            .withArgument(abuilder.withName("tfVectorsPath").withMinimum(1).withMaximum(1).create())
            .withDescription("TF Vectors path").withShortName("tfvp").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF , TFIDF or TFIDF_UPDATE")
            .withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(/* ww  w. j  av  a  2s .  co  m*/
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
                            + "it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                            + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less "
                            + "than 0 no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(dictionaryPathOpt).withOption(docFrequencyPathOpt).withOption(tfVectorsPathOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Changed... Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        //default process tfidf:1, tf:2, update tfidf:3
        int processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = 2;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = 1;
            } else if ("tfidf_update".equalsIgnoreCase(wString)) {
                processIdf = 3;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = 1;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }
        log.info("Tokenizing documents in {}", inputDir);
        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0 || maxDFPercent > 0.00;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
        log.info("Creating Term Frequency Vectors, prune {}", shouldPrune);

        String dictionaryPath = null;
        if (cmdLine.hasOption(dictionaryPathOpt)) {
            dictionaryPath = (String) cmdLine.getValue(dictionaryPathOpt);
            log.info("begin dic path {}", dictionaryPath);
        }

        if (processIdf == 1) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else if (processIdf == 3) {
            log.info("begin update term----------------");
            DictionaryVectorizer.createUpdateTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    maxNGramSize, dictionaryPath, norm, logNormalize, reduceTasks, sequentialAccessOutput,
                    namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }

        String docFrequencyPaths = null;
        if (cmdLine.hasOption(dictionaryPathOpt)) {
            docFrequencyPaths = (String) cmdLine.getValue(docFrequencyPathOpt);
            log.info("doc frequency path {}", docFrequencyPaths);
        }
        String tfVectorsPaths = null;
        if (cmdLine.hasOption(tfVectorsPathOpt)) {
            tfVectorsPaths = (String) cmdLine.getValue(tfVectorsPathOpt);
            log.info("tf vectors path {}", tfVectorsPaths);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (processIdf == 1) {
            log.info("Calculating IDF");
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
            log.info("...docFrequencyPathBase {}, docFrequencyFile {}", docFrequenciesFeatures.getFirst()[0],
                    docFrequenciesFeatures.getFirst()[1]);
        } else if (processIdf == 3) {
            // load docFrequency path
            List<Path> docFrequencyChunks = Lists.newArrayList();
            String[] paths = docFrequencyPaths.split(",");

            long featureCount = 0;
            for (String path : paths) {
                int splitPos = path.lastIndexOf("/");
                String docFrequencyPathBase = path.substring(0, splitPos);
                String docFrequencyFile = path.substring(splitPos + 1, path.length());
                log.info("docFrequencyPathBase {}, docFrequencyFile {}", docFrequencyPathBase,
                        docFrequencyFile);
                Path docFrequencyPath = new Path(docFrequencyPathBase, docFrequencyFile);
                docFrequencyChunks.add(docFrequencyPath);

                /*for (Pair<IntWritable, LongWritable> record
                         : new SequenceFileIterable<IntWritable, LongWritable>(docFrequencyPath, true, conf)) {
                     featureCount = Math.max(record.getFirst().get(), featureCount);
                 }*/
            }
            featureCount = 107623;
            featureCount++;

            long vectorCount = Long.MAX_VALUE;
            /*Path tfDirPath = new Path(tfVectorsPaths + "/part-r-00000");
            int i = 0;
            for (Pair<Text, VectorWritable> record
                     : new SequenceFileIterable<Text, VectorWritable>(tfDirPath, true, conf)) {
               i++;
             }
            if (i > 0) {
               vectorCount = i;
            }*/
            vectorCount = 80000;
            //read docFrequencyFile to get featureCount and vectorCount
            Long[] counts = { featureCount, vectorCount };
            log.info("featureCount {}, vectorCount------------------ {}", featureCount, vectorCount);
            docFrequenciesFeatures = new Pair<Long[], List<Path>>(counts, docFrequencyChunks);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            if (maxDFSigma >= 0.0) {
                Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
                Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

                // Calculate the standard deviation
                double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
                maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
            }

            long maxDFThreshold = (long) (vectorCount * (maxDF / 100.0f));

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            log.info("Pruning");
            if (processIdf == 1 || processIdf == 3) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf == 1 || processIdf == 3) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file:my.mahout.AbstractJob.java

/**
 * Obtain input and output directories from command-line options or hadoop
 *  properties. If {@code addInputOption} or {@code addOutputOption}
 *  has been called, this method will throw an {@code OptionException} if
 *  no source (command-line or property) for that value is present. 
 *  Otherwise, {@code inputPath} or {@code outputPath} will be
 *  non-null only if specified as a hadoop property. Command-line options
 *  take precedence over hadoop properties.
 *
 * @throws IllegalArgumentException if either inputOption is present,
 *   and neither {@code --input} nor {@code -Dmapred.input dir} are
 *   specified or outputOption is present and neither {@code --output}
 *   nor {@code -Dmapred.output.dir} are specified.
 *//*from  w ww  .  j  a  v a 2 s. c om*/
protected void parseDirectories(CommandLine cmdLine, boolean inputOptional, boolean outputOptional) {

    Configuration conf = getConf();

    if (inputOption != null && cmdLine.hasOption(inputOption)) {
        this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
        this.inputFile = new File(cmdLine.getValue(inputOption).toString());
    }
    if (inputPath == null && conf.get("mapred.input.dir") != null) {
        this.inputPath = new Path(conf.get("mapred.input.dir"));
    }

    if (outputOption != null && cmdLine.hasOption(outputOption)) {
        this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
        this.outputFile = new File(cmdLine.getValue(outputOption).toString());
    }
    if (outputPath == null && conf.get("mapred.output.dir") != null) {
        this.outputPath = new Path(conf.get("mapred.output.dir"));
    }

    Preconditions.checkArgument(inputOptional || inputOption == null || inputPath != null,
            "No input specified or -Dmapred.input.dir must be provided to specify input directory");
    Preconditions.checkArgument(outputOptional || outputOption == null || outputPath != null,
            "No output specified:  or -Dmapred.output.dir must be provided to specify output directory");
}

From source file:com.gsinnovations.howdah.AbstractJob.java

/** Parse the arguments specified based on the options defined using the
 *  various <code>addOption</code> methods. If -h is specified or an
 *  exception is encountered print help and return null. Has the
 *  side effect of setting inputPath and outputPath
 *  if <code>addInputOption</code> or <code>addOutputOption</code>
 *  or <code>mapred.input.dir</code> or <code>mapred.output.dir</code>
 *  are present in the Configuration.//ww  w  .j  a va  2s  . c  o m
 *
 * @return a Map<String,Sting> containing options and their argument values.
 *  The presence of a flag can be tested using <code>containsKey</code>, while
 *  argument values can be retrieved using <code>get(optionName</code>. The
 *  names used for keys are the option name parameter prefixed by '--'.
 *
 *
 */
public Map<String, String> parseArguments(String[] args) {

    Option helpOpt = addOption(DefaultOptionCreator.helpOption());
    addOption("tempDir", null, "Intermediate output directory", "temp");
    addOption("startPhase", null, "First phase to run", "0");
    addOption("endPhase", null, "Last phase to run", String.valueOf(Integer.MAX_VALUE));

    GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific Options:");

    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    CommandLine cmdLine;
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        cmdLine = parser.parse(args);

    } catch (OptionException e) {
        log.error(e.getMessage());
        CommandLineUtil.printHelpWithGenericOptions(group);
        return null;
    }

    if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelpWithGenericOptions(group);
        return null;
    }

    try {
        parseDirectories(cmdLine);
    } catch (IllegalArgumentException e) {
        log.error(e.getMessage());
        CommandLineUtil.printHelpWithGenericOptions(group);
        return null;
    }

    argMap = new TreeMap<String, String>();
    maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));

    log.info("Command line arguments: {}", argMap);
    return argMap;
}

From source file:com.digitalpebble.behemoth.mahout.SparseVectorsFromBehemoth.java

public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option typeNameOpt = obuilder.withLongName("typeToken").withRequired(false)
            .withArgument(abuilder.withName("typeToken").withMinimum(1).withMaximum(1).create())
            .withDescription("The annotation type for Tokens").withShortName("t").create();

    Option featureNameOpt = obuilder.withLongName("featureName").withRequired(false)
            .withArgument(abuilder.withName("featureName").withMinimum(1).withMaximum(1).create())
            .withDescription(/*from w  w w.  j ava2s .  c o m*/
                    "The name of the feature containing the token values, uses the text if unspecified")
            .withShortName("f").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                            + "  Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                            + "will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();

    Option labelMDOpt = obuilder.withLongName("labelMDKey").withRequired(false)
            .withArgument(abuilder.withName("label_md_key").create())
            .withDescription("Document metadata holding the label").withShortName("label").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(typeNameOpt)
            .withOption(featureNameOpt).withOption(analyzerNameOpt).withOption(chunkSizeOpt)
            .withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt).withOption(maxDFSigmaOpt)
            .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
            .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
            .withOption(helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt)
            .withOption(logNormalizeOpt).withOption(labelMDOpt).create();
    CommandLine cmdLine = null;
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        if (!cmdLine.hasOption(inputDirOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        if (!cmdLine.hasOption(outputDirOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return -1;
    }

    Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
    Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

    int chunkSize = 100;
    if (cmdLine.hasOption(chunkSizeOpt)) {
        chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
    }
    int minSupport = 2;
    if (cmdLine.hasOption(minSupportOpt)) {
        String minSupportString = (String) cmdLine.getValue(minSupportOpt);
        minSupport = Integer.parseInt(minSupportString);
    }

    int maxNGramSize = 1;

    if (cmdLine.hasOption(maxNGramSizeOpt)) {
        try {
            maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
        } catch (NumberFormatException ex) {
            log.warn("Could not parse ngram size option");
        }
    }
    log.info("Maximum n-gram size is: {}", maxNGramSize);

    if (cmdLine.hasOption(overwriteOutput)) {
        HadoopUtil.delete(getConf(), outputDir);
    }

    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
    if (cmdLine.hasOption(minLLROpt)) {
        minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
    }
    log.info("Minimum LLR value: {}", minLLRValue);

    int reduceTasks = 1;
    if (cmdLine.hasOption(numReduceTasksOpt)) {
        reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
    }
    log.info("Number of reduce tasks: {}", reduceTasks);

    Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
    if (cmdLine.hasOption(analyzerNameOpt)) {
        String className = cmdLine.getValue(analyzerNameOpt).toString();
        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
        // try instantiating it, b/c there isn't any point in setting it
        // if
        // you can't instantiate it
        ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
    }

    String type = null;
    String featureName = "";
    if (cmdLine.hasOption(typeNameOpt)) {
        type = cmdLine.getValue(typeNameOpt).toString();
        Object tempFN = cmdLine.getValue(featureNameOpt);
        if (tempFN != null) {
            featureName = tempFN.toString();
            log.info("Getting tokens from " + type + "." + featureName.toString());
        } else
            log.info("Getting tokens from " + type);
    }

    boolean processIdf;

    if (cmdLine.hasOption(weightOpt)) {
        String wString = cmdLine.getValue(weightOpt).toString();
        if ("tf".equalsIgnoreCase(wString)) {
            processIdf = false;
        } else if ("tfidf".equalsIgnoreCase(wString)) {
            processIdf = true;
        } else {
            throw new OptionException(weightOpt);
        }
    } else {
        processIdf = true;
    }

    int minDf = 1;
    if (cmdLine.hasOption(minDFOpt)) {
        minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
    }
    int maxDFPercent = 99;
    if (cmdLine.hasOption(maxDFPercentOpt)) {
        maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
    }
    double maxDFSigma = -1.0;
    if (cmdLine.hasOption(maxDFSigmaOpt)) {
        maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
    }

    float norm = PartialVectorMerger.NO_NORMALIZING;
    if (cmdLine.hasOption(powerOpt)) {
        String power = cmdLine.getValue(powerOpt).toString();
        if ("INF".equals(power)) {
            norm = Float.POSITIVE_INFINITY;
        } else {
            norm = Float.parseFloat(power);
        }
    }

    boolean logNormalize = false;
    if (cmdLine.hasOption(logNormalizeOpt)) {
        logNormalize = true;
    }

    String labelMDKey = null;
    if (cmdLine.hasOption(labelMDOpt)) {
        labelMDKey = cmdLine.getValue(labelMDOpt).toString();
    }

    Configuration conf = getConf();
    Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

    // no annotation type degfin
    if (type != null) {
        BehemothDocumentProcessor.tokenizeDocuments(inputDir, type, featureName, tokenizedPath);
    }
    // no annotation type defined : rely on Lucene's analysers
    else {
        BehemothDocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);
    }
    boolean sequentialAccessOutput = false;
    if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
        sequentialAccessOutput = true;
    }

    boolean namedVectors = false;
    if (cmdLine.hasOption(namedVectorOpt)) {
        namedVectors = true;
    }
    boolean shouldPrune = maxDFSigma >= 0.0;
    String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
            : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

    try {
        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }
        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; // if we are pruning by std dev, then
                                   // this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }

        // dump labels?
        if (labelMDKey != null) {
            conf.set(BehemothDocumentProcessor.MD_LABEL, labelMDKey);
            BehemothDocumentProcessor.dumpLabels(inputDir, new Path(outputDir, "labels"), conf);
        }
    } catch (RuntimeException e) {
        Log.error("Exception caught", e);
        return -1;
    }

    return 0;
}

From source file:my.mahout.AbstractJob.java

/**
 *
 * @param args  The args to parse/* w w w  .  ja  v a2s.c o  m*/
 * @param inputOptional if false, then the input option, if set, need not be present.  If true and input is an option
 *                      and there is no input, then throw an error
 * @param outputOptional if false, then the output option, if set, need not be present.  If true and output is an
 *                       option and there is no output, then throw an error
 * @return the args parsed into a map.
 */
public Map<String, List<String>> parseArguments(String[] args, boolean inputOptional, boolean outputOptional)
        throws IOException {
    Option helpOpt = addOption(DefaultOptionCreator.helpOption());
    addOption("tempDir", null, "Intermediate output directory", "temp");
    addOption("startPhase", null, "First phase to run", "0");
    addOption("endPhase", null, "Last phase to run", String.valueOf(Integer.MAX_VALUE));

    GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific Options:");

    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    group = gBuilder.create();

    CommandLine cmdLine;
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        cmdLine = parser.parse(args);

    } catch (OptionException e) {
        log.error(e.getMessage());
        CommandLineUtil.printHelpWithGenericOptions(group, e);
        return null;
    }

    if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelpWithGenericOptions(group);
        return null;
    }

    try {
        parseDirectories(cmdLine, inputOptional, outputOptional);
    } catch (IllegalArgumentException e) {
        log.error(e.getMessage());
        CommandLineUtil.printHelpWithGenericOptions(group);
        return null;
    }

    argMap = new TreeMap<String, List<String>>();
    maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));

    this.tempPath = new Path(getOption("tempDir"));

    if (!hasOption("quiet")) {
        log.info("Command line arguments: {}", argMap);
    }
    return argMap;
}

From source file:org.apache.mahout.avro.text.AvroDocumentsFromDirectory.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser p = new GenericOptionsParser(conf, args);
    args = p.getRemainingArgs();/* ww  w . ja  v  a 2 s .c o m*/

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option parentOpt = obuilder.withLongName("parent").withRequired(true)
            .withArgument(abuilder.withName("parent").withMinimum(1).withMaximum(1).create())
            .withDescription("Parent dir containing the documents").withShortName("p").create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in documents. Defaults to " + DEFAULT_CHUNK_SIZE)
            .withShortName("chunk").create();

    Option keyPrefixOpt = obuilder.withLongName("keyPrefix")
            .withArgument(abuilder.withName("keyPrefix").withMinimum(1).withMaximum(1).create())
            .withDescription("The prefix to be prepended to the key").withShortName("prefix").create();

    Option charsetOpt = obuilder.withLongName("charset").withRequired(true)
            .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
            .withDescription("The name of the character encoding of the input files").withShortName("c")
            .create();

    Group group = gbuilder.withName("Options").withOption(keyPrefixOpt).withOption(chunkSizeOpt)
            .withOption(charsetOpt).withOption(outputDirOpt).withOption(parentOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    File parentDir = new File((String) cmdLine.getValue(parentOpt));
    String outputDir = (String) cmdLine.getValue(outputDirOpt);

    int documentsPerChunk = DEFAULT_CHUNK_SIZE;
    if (cmdLine.hasOption(chunkSizeOpt)) {
        documentsPerChunk = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
    }

    String prefix = "";
    if (cmdLine.hasOption(keyPrefixOpt)) {
        prefix = (String) cmdLine.getValue(keyPrefixOpt);
    }
    Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
    AvroDocumentsFromDirectory dir = new AvroDocumentsFromDirectory();
    dir.createAvroDocuments(conf, parentDir, outputDir, prefix, documentsPerChunk, charset);
}

From source file:org.apache.mahout.avro.text.mapred.WikipediaToAvroDocuments.java

/**
 * Takes in two arguments://from   w  w w  .j  a  v a 2s . c  o m
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents
 * live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the
 * classifier as a {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 * 
 * @param args
 *          The args
 */
public int run(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory path").withShortName("i").create();

    Option dirOutputPathOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory Path").withShortName("o").create();

    Option categoriesOpt = obuilder.withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c").create();

    Option exactMatchOpt = obuilder.withLongName("exactMatch")
            .withDescription("If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e").create();

    Option allOpt = obuilder.withLongName("all").withDescription("If set, Select all files. Default is false")
            .withShortName("all").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    try {
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return 0;
        }

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        String catFile = "";
        if (cmdLine.hasOption(categoriesOpt)) {
            catFile = (String) cmdLine.getValue(categoriesOpt);
        }

        boolean all = false;
        if (cmdLine.hasOption(allOpt)) {
            all = true;
        }
        runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return 0;
    }

    return 1;
}