Example usage for org.apache.commons.cli2 CommandLine hasOption

List of usage examples for org.apache.commons.cli2 CommandLine hasOption

Introduction

On this page you can find example usage for org.apache.commons.cli2 CommandLine hasOption. All of the examples below are drawn from the Apache Mahout source tree.

Prototype

boolean hasOption(final Option option);

Document

Detects the presence of an option in this CommandLine.
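
Before the project examples, here is a minimal, self-contained sketch of the call. The class and option names (HasOptionExample, verbose) are made up for illustration, and the imports assume the standard commons-cli2 package layout; only builder and parser methods that also appear in the examples below are used.

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;

public class HasOptionExample {
    public static void main(String[] args) throws OptionException {
        // A flag option: it takes no argument, so presence is its only state.
        Option verbose = new DefaultOptionBuilder().withLongName("verbose").withShortName("v")
                .withDescription("enable verbose output").create();
        Group group = new GroupBuilder().withOption(verbose).create();

        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args); // throws OptionException on bad input

        // hasOption detects whether the option occurred on the command line.
        System.out.println("verbose: " + cmdLine.hasOption(verbose));
    }
}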

Usage

From source file: org.apache.mahout.regression.penalizedlinear.LinearRegularizePath.java

private boolean processLambda(LinearRegularizePathParameter parameter, CommandLine cmdLine, Option lambda) {
    String lambdaSeq = "";
    double previous = Double.NEGATIVE_INFINITY;
    if (cmdLine.hasOption(lambda)) {
        for (Object x : cmdLine.getValues(lambda)) {
            double number = Double.parseDouble(x.toString());
            if (previous >= number || number < 0) {
                return false;
            }
            lambdaSeq += x.toString() + ",";
            previous = number;
        }
        parameter.lambda = lambdaSeq.substring(0, lambdaSeq.length() - 1);
        return true;
    } else {
        parameter.lambda = "";
        return true;
    }
}
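
This method reads a multi-valued option with getValues(Option), checking hasOption first so a missing option can fall through to a default. A sketch of how such an option might be declared and consumed; the withMaximum bound and the sample arguments are illustrative, and the parser is assumed to be configured as in the drivers below:

ArgumentBuilder argumentBuilder = new ArgumentBuilder();
Option lambda = new DefaultOptionBuilder().withLongName("lambda")
        .withArgument(argumentBuilder.withName("lambda").withMinimum(1).withMaximum(10).create())
        .withDescription("increasing positive penalty coefficients").create();

// e.g. args = { "--lambda", "0.1", "0.5", "1.0" }
CommandLine cmdLine = parser.parse(args);
if (cmdLine.hasOption(lambda)) {
    for (Object value : cmdLine.getValues(lambda)) { // one entry per supplied value
        System.out.println(Double.parseDouble(value.toString()));
    }
}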

From source file: org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java

private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription(
                    "where to get training data (Mahout sequence file of VectorWritable); in each line, the first element is response; rest are predictors.")
            .create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option numOfCV = builder.withLongName("numOfCV")
            .withArgument(
                    argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(lambda).withOption(alpha).withOption(bias).withOption(numOfCV).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new PenalizedLinearParameter();
    parameter.setNumOfCV(Integer.parseInt((String) cmdLine.getValue(numOfCV)));
    parameter.setAlpha(Float.parseFloat((String) cmdLine.getValue(alpha)));
    parameter.setIntercept(cmdLine.hasOption(bias));

    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numOfCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);

    return true;
}
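
Two parsing styles appear on this page. This driver uses parser.parseAndHelp(args), which prints the help screen and returns null when parsing fails or the help trigger is seen, so a simple null check suffices; most of the other examples call parser.parse(args), which throws a checked OptionException, inside a try/catch. A sketch of the contrast, assuming the cli2 Parser API and the CommandLineUtil helper used throughout these examples:

// Style 1: exception-based
try {
    CommandLine cmdLine = parser.parse(args); // throws OptionException on bad input
    // ... read options via hasOption/getValue ...
} catch (OptionException e) {
    CommandLineUtil.printHelp(group);
}

// Style 2: null-based, as in parseArgs(...) above
CommandLine cmdLine = parser.parseAndHelp(args); // prints help itself on failure
if (cmdLine == null) {
    return;
}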

From source file: org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java

boolean processLambda(PenalizedLinearParameter parameter, CommandLine cmdLine, Option lambda) {
    StringBuilder lambdaSeq = new StringBuilder();
    double previous = Double.NEGATIVE_INFINITY;
    if (cmdLine.hasOption(lambda)) {
        for (Object x : cmdLine.getValues(lambda)) {
            double number = Double.parseDouble(x.toString());
            if (previous >= number || number < 0) {
                return false;
            }
            lambdaSeq.append(x.toString()).append(",");
            previous = number;
        }
        parameter.setLambda(lambdaSeq.substring(0, lambdaSeq.length() - 1));
        return true;
    } else {
        parameter.setLambda("");
        return true;
    }
}
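
This is the same validation loop as in the LinearRegularizePath example above, but accumulating into a StringBuilder rather than concatenating strings inside the loop. On Java 8 and later, the comma-joining and the trailing-separator trim could also be handled by java.util.StringJoiner; a sketch, not what the Mahout source actually does:

StringJoiner lambdaSeq = new StringJoiner(",");
double previous = Double.NEGATIVE_INFINITY;
for (Object x : cmdLine.getValues(lambda)) {
    double number = Double.parseDouble(x.toString());
    if (previous >= number || number < 0) {
        return false; // must be a positive, strictly increasing sequence
    }
    lambdaSeq.add(x.toString());
    previous = number;
}
parameter.setLambda(lambdaSeq.toString()); // no substring() trimming needed
return true;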

From source file: org.apache.mahout.text.SparseVectorsFromSequenceFiles.java

public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("input dir containing the documents in sequence file format").withShortName("i")
            .create();

    Option outputDirOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();
    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.")
            .withShortName("x").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();
    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();
    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
            .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
            .withOption(helpOpt).withOption(sequentialAccessVectorOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.overwriteOutput(outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            analyzerClass.newInstance();
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if (wString.equalsIgnoreCase("tf")) {
                processIdf = false;
            } else if (wString.equalsIgnoreCase("tfidf")) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if (power.equals("INF")) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }
        HadoopUtil.overwriteOutput(outputDir);
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize,
                minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent,
                    norm, sequentialAccessOutput, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}
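
A recurring pattern in this main method is hasOption followed by getValue, with a hard-coded fallback when the option is absent (chunkSize, minSupport, minDF, and so on). One way to factor that out is a small helper such as the hypothetical intValue below; the helper name and signature are my own, not part of cli2 or Mahout:

// Hypothetical helper: parse an int option value, falling back to a default.
private static int intValue(CommandLine cmdLine, Option option, int defaultValue) {
    return cmdLine.hasOption(option)
            ? Integer.parseInt(cmdLine.getValue(option).toString())
            : defaultValue;
}

// Usage, replacing the chunkSize and minSupport blocks above:
int chunkSize = intValue(cmdLine, chunkSizeOpt, 100);
int minSupport = intValue(cmdLine, minSupportOpt, 2);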

From source file: org.apache.mahout.text.wikipedia.WikipediaXmlSplitter.java

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")

            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n" + "    <case>first-letter</case>\n"
            + "    <namespaces>\n" + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n" + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n" + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n" + "    </namespaces>\n"
            + "  </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);

    // If the specified path for the input file is incorrect, return immediately
    if (!dumpFile.exists()) {
        log.error("Input file path {} doesn't exist", dumpFilePath);
        return;
    }

    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    int fileNumber = 0;
    while (it.hasNext()) {
        String thisLine = it.next();
        if (thisLine.trim().startsWith("<page>")) {
            boolean end = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (it.hasNext()) {
                    thisLine = it.next();
                } else {
                    end = true;
                    break;
                }
            }
            content.append(thisLine).append('\n');

            if (content.length() > chunkSize || end) {
                content.append("</mediawiki>");
                fileNumber++;
                String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
                BufferedWriter chunkWriter = new BufferedWriter(
                        new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
                try {
                    chunkWriter.write(content.toString(), 0, content.length());
                } finally {
                    Closeables.close(chunkWriter, false);
                }
                if (fileNumber >= numChunks) {
                    break;
                }
                content = new StringBuilder();
                content.append(header);
            }
        }
    }
}

From source file: org.apache.mahout.text.WikipediaToSequenceFile.java

/**
 * Takes in two arguments:
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a
 * {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();

    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();

    Option categoriesOpt = obuilder.withLongName("categories")
            .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c").create();

    Option exactMatchOpt = obuilder.withLongName("exactMatch")
            .withDescription("If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e").create();

    Option allOpt = obuilder.withLongName("all").withDescription("If set, Select all files. Default is false")
            .withShortName("all").create();

    Option removeLabelOpt = obuilder.withLongName("removeLabels")
            .withDescription("If set, remove [[Category:labels]] from document text after extracting label."
                    + "Default is false")
            .withShortName("rl").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
            .withOption(removeLabelOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    try {
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);

        String catFile = "";
        if (cmdLine.hasOption(categoriesOpt)) {
            catFile = (String) cmdLine.getValue(categoriesOpt);
        }

        boolean all = false;
        if (cmdLine.hasOption(allOpt)) {
            all = true;
        }

        boolean removeLabels = false;
        if (cmdLine.hasOption(removeLabelOpt)) {
            removeLabels = true;
        }

        runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (InterruptedException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}
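
The all and removeLabels options here are flags: they are built without withArgument(...), so the only information they carry is presence, which is exactly what hasOption reports. The if/assignment blocks above can therefore collapse to direct boolean assignments, the style already used for exactMatch in the runJob call and for the unigram flag in the CollocDriver example below. A sketch:

boolean all = cmdLine.hasOption(allOpt);
boolean removeLabels = cmdLine.hasOption(removeLabelOpt);
runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels);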

From source file: org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The Path for input files.").withShortName("i").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The Path write output to").withShortName("o").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
            .withShortName("ng").create();

    Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false)
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT)
            .withShortName("s").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers").withRequired(false)
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "(Optional) Number of reduce tasks. Default Value: " + DEFAULT_PASS1_NUM_REDUCE_TASKS)
            .withShortName("nr").create();

    Option preprocessOpt = obuilder.withLongName("preprocess").withRequired(false)
            .withDescription("If set, input is SequenceFile<Text,Text> where the value is the document, "
                    + " which will be tokenized using the specified analyzer.")
            .withShortName("p").create();

    Option unigramOpt = obuilder.withLongName("unigram").withRequired(false)
            .withDescription("If set, unigrams will be emitted in the final output alongside collocations")
            .withShortName("u").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("w").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt)
            .withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(minSupportOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(analyzerNameOpt)
            .withOption(preprocessOpt).withOption(unigramOpt).withOption(helpOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return 1;
        }

        Path input = new Path(cmdLine.getValue(inputOpt).toString());
        Path output = new Path(cmdLine.getValue(outputOpt).toString());

        int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.overwriteOutput(output);
        }

        int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
        if (cmdLine.hasOption(minSupportOpt)) {
            minSupport = Integer.parseInt(cmdLine.getValue(minSupportOpt).toString());
        }
        log.info("Minimum Support value: {}", minSupport);

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of pass1 reduce tasks: {}", reduceTasks);

        boolean emitUnigrams = cmdLine.hasOption(unigramOpt);

        if (cmdLine.hasOption(preprocessOpt)) {
            log.info("Input will be preprocessed");

            Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
            if (cmdLine.hasOption(analyzerNameOpt)) {
                String className = cmdLine.getValue(analyzerNameOpt).toString();
                analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
                // try instantiating it, b/c there isn't any point in setting it if
                // you can't instantiate it
                analyzerClass.newInstance();
            }

            Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

            DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath);
            input = tokenizedPath;
        } else {
            log.info("Input will NOT be preprocessed");
        }

        // parse input and extract collocations
        long ngramCount = generateCollocations(input, output, emitUnigrams, maxNGramSize, reduceTasks,
                minSupport);

        // tally collocations and perform LLR calculation
        computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue, reduceTasks);

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return 1;
    }

    return 0;
}

From source file: org.apache.mahout.utils.vectors.arff.Driver.java

public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The file or directory containing the ARFF files.  If it is a directory, all .arff files will be converted")
            .withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The output directory.  Files will have the same name as the input, but with the extension .mvc")
            .withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true)
            .withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create())
            .withDescription("The file to output the label bindings").withShortName("t").create();

    Option jsonDictonaryOpt = obuilder.withLongName("json-dictonary").withRequired(false)
            .withDescription("Write dictonary in JSON format").withShortName("j").create();

    Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false)
            .withArgument(abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create())
            .withDescription("The delimiter for outputing the dictionary").withShortName("l").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(helpOpt).withOption(dictOutOpt).withOption(jsonDictonaryOpt).withOption(delimiterOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {

            CommandLineUtil.printHelp(group);
            return;
        }
        if (cmdLine.hasOption(inputOpt)) { // Lucene case
            File input = new File(cmdLine.getValue(inputOpt).toString());
            long maxDocs = Long.MAX_VALUE;
            if (cmdLine.hasOption(maxOpt)) {
                maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
            }
            if (maxDocs < 0) {
                throw new IllegalArgumentException("maxDocs must be >= 0");
            }
            String outDir = cmdLine.getValue(outputOpt).toString();
            log.info("Output Dir: {}", outDir);

            String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString()
                    : "\t";
            File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
            boolean jsonDictonary = cmdLine.hasOption(jsonDictonaryOpt);
            ARFFModel model = new MapBackedARFFModel();
            if (input.exists() && input.isDirectory()) {
                File[] files = input.listFiles(new FilenameFilter() {
                    @Override
                    public boolean accept(File file, String name) {
                        return name.endsWith(".arff");
                    }
                });

                for (File file : files) {
                    writeFile(outDir, file, maxDocs, model, dictOut, delimiter, jsonDictonary);
                }
            } else {
                writeFile(outDir, input, maxDocs, model, dictOut, delimiter, jsonDictonary);
            }
        }

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file: org.apache.mahout.utils.vectors.libsvm.Driver.java

/**
 * The main method.
 * 
 * @param args the arguments
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The file or directory containing the ARFF files.  If it is a directory, all .arff files will be converted")
            .withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The output directory.  Files will have the same name as the input, but with the extension .mvc")
            .withShortName("o").create();

    Option maxOpt = obuilder.withLongName("max").withRequired(false)
            .withArgument(abuilder.withName("max").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of vectors to output.  If not specified, then it will loop over all docs")
            .withShortName("m").create();

    Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true)
            .withArgument(abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create())
            .withDescription("The file to output the label bindings").withShortName("t").create();

    Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false)
            .withArgument(abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The VectorWriter to use, either seq (SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)")
            .withShortName("e").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(maxOpt)
            .withOption(helpOpt).withOption(dictOutOpt).withOption(outWriterOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {

            CommandLineUtil.printHelp(group);
            return;
        }

        if (cmdLine.hasOption(inputOpt)) {// Lucene case
            File input = new File(cmdLine.getValue(inputOpt).toString());
            long maxDocs = Long.MAX_VALUE;
            if (cmdLine.hasOption(maxOpt)) {
                maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
            }
            if (maxDocs < 0) {
                throw new IllegalArgumentException("maxDocs must be >= 0");
            }
            String outDir = cmdLine.getValue(outputOpt).toString();
            Driver.log.info("Output Dir: {}", outDir);
            String outWriter = null;
            if (cmdLine.hasOption(outWriterOpt)) {
                outWriter = cmdLine.getValue(outWriterOpt).toString();
            }
            File dictOut = new File(cmdLine.getValue(dictOutOpt).toString());
            List<Double> labels = new ArrayList<Double>();
            if (input.exists() && input.isDirectory()) {
                File[] files = input.listFiles();

                for (File file : files) {
                    //            Driver.writeFile(outWriter, outDir, file, maxDocs, labels);
                }
            } else {
                //          Driver.writeFile(outWriter, outDir, input, maxDocs, labels);
            }
            Driver.log.info("Dictionary Output file: {}", dictOut);
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(dictOut), Charset.forName("UTF8")));
            for (Double label : labels) {
                writer.append(label.toString()).append('\n');
            }
            writer.close();

        }

    } catch (OptionException e) {
        Driver.log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file: org.apache.mahout.utils.vectors.lucene.ClusterLabels.java

public static void main(String[] args) {

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option indexOpt = obuilder.withLongName("dir").withRequired(true)
            .withArgument(abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
            .withDescription("The Lucene index directory").withShortName("d").create();

    Option outputOpt = obuilder.withLongName("output").withRequired(false)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output file. If not specified, the result is printed on console.")
            .withShortName("o").create();

    Option fieldOpt = obuilder.withLongName("field").withRequired(true)
            .withArgument(abuilder.withName("field").withMinimum(1).withMaximum(1).create())
            .withDescription("The content field in the index").withShortName("f").create();

    Option idFieldOpt = obuilder.withLongName("idField").withRequired(false)
            .withArgument(abuilder.withName("idField").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The field for the document ID in the index.  If null, then the Lucene internal doc "
                            + "id is used which is prone to error if the underlying index changes")
            .withShortName("i").create();

    Option seqOpt = obuilder.withLongName("seqFileDir").withRequired(true)
            .withArgument(abuilder.withName("seqFileDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory containing Sequence Files for the Clusters").withShortName("s")
            .create();

    Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(true)
            .withArgument(abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The directory containing points sequence files mapping input vectors to their cluster.  ")
            .withShortName("p").create();
    Option minClusterSizeOpt = obuilder.withLongName("minClusterSize").withRequired(false)
            .withArgument(abuilder.withName("minClusterSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum number of points required in a cluster to print the labels for")
            .withShortName("m").create();
    Option maxLabelsOpt = obuilder.withLongName("maxLabels").withRequired(false)
            .withArgument(abuilder.withName("maxLabels").withMinimum(1).withMaximum(1).create())
            .withDescription("The maximum number of labels to print per cluster").withShortName("x").create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(indexOpt).withOption(idFieldOpt).withOption(outputOpt)
            .withOption(fieldOpt).withOption(seqOpt).withOption(pointsOpt).withOption(helpOpt)
            .withOption(maxLabelsOpt).withOption(minClusterSizeOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path seqFileDir = new Path(cmdLine.getValue(seqOpt).toString());
        Path pointsDir = new Path(cmdLine.getValue(pointsOpt).toString());
        String indexDir = cmdLine.getValue(indexOpt).toString();
        String contentField = cmdLine.getValue(fieldOpt).toString();

        String idField = null;

        if (cmdLine.hasOption(idFieldOpt)) {
            idField = cmdLine.getValue(idFieldOpt).toString();
        }
        String output = null;
        if (cmdLine.hasOption(outputOpt)) {
            output = cmdLine.getValue(outputOpt).toString();
        }
        int maxLabels = DEFAULT_MAX_LABELS;
        if (cmdLine.hasOption(maxLabelsOpt)) {
            maxLabels = Integer.parseInt(cmdLine.getValue(maxLabelsOpt).toString());
        }
        int minSize = DEFAULT_MIN_IDS;
        if (cmdLine.hasOption(minClusterSizeOpt)) {
            minSize = Integer.parseInt(cmdLine.getValue(minClusterSizeOpt).toString());
        }
        ClusterLabels clusterLabel = new ClusterLabels(seqFileDir, pointsDir, indexDir, contentField, minSize,
                maxLabels);

        if (idField != null) {
            clusterLabel.setIdField(idField);
        }
        if (output != null) {
            clusterLabel.setOutput(output);
        }

        clusterLabel.getLabels();

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("Exception", e);
    }
}