Example usage of the org.apache.commons.cli2.commandline.Parser constructor Parser()

Introduction

On this page you can find example usages of the org.apache.commons.cli2.commandline.Parser constructor Parser().

Prototype

Parser

Source Link

Usage

From source file:org.apache.mahout.ga.watchmaker.cd.CDGA.java

/**
 * Command-line entry point: parses the options, runs the job through {@code runJob} and reports
 * the elapsed time through {@code printElapsedTime}.
 *
 * <p>Besides the input and help options supplied by {@code DefaultOptionCreator}, the following
 * options are declared: {@code --label}, {@code --mutrate}, {@code --popsize} and {@code --gencnt}
 * (required), and {@code --threshold} (default 0.5), {@code --crosspnts} (default 1),
 * {@code --mutrange} (default 0.1) and {@code --mutprec} (default 2).
 *
 * <p>When help is requested, when the options cannot be parsed, or when a numeric option value is
 * malformed, the usage help is printed and the method returns without running the job.
 *
 * @param args raw command-line arguments
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = DefaultOptionCreator.inputOption().create();

    Option labelOpt = obuilder.withLongName("label").withRequired(true).withShortName("l")
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("label's index.").create();

    Option thresholdOpt = obuilder.withLongName("threshold").withRequired(false).withShortName("t")
            .withArgument(abuilder.withName("threshold").withMinimum(1).withMaximum(1).create())
            .withDescription("Condition activation threshold, default = 0.5.").create();

    Option crosspntsOpt = obuilder.withLongName("crosspnts").withRequired(false).withShortName("cp")
            .withArgument(abuilder.withName("points").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of crossover points to use, default = 1.").create();

    // The argument label used to be "true" (a copy/paste slip); "rate" matches the option's meaning.
    Option mutrateOpt = obuilder.withLongName("mutrate").withRequired(true).withShortName("m")
            .withArgument(abuilder.withName("rate").withMinimum(1).withMaximum(1).create())
            .withDescription("Mutation rate (float).").create();

    Option mutrangeOpt = obuilder.withLongName("mutrange").withRequired(false).withShortName("mr")
            .withArgument(abuilder.withName("range").withMinimum(1).withMaximum(1).create())
            .withDescription("Mutation range, default = 0.1 (10%).").create();

    Option mutprecOpt = obuilder.withLongName("mutprec").withRequired(false).withShortName("mp")
            .withArgument(abuilder.withName("precision").withMinimum(1).withMaximum(1).create())
            .withDescription("Mutation precision, default = 2.").create();

    Option popsizeOpt = obuilder.withLongName("popsize").withRequired(true).withShortName("p")
            .withArgument(abuilder.withName("size").withMinimum(1).withMaximum(1).create())
            .withDescription("Population size.").create();

    Option gencntOpt = obuilder.withLongName("gencnt").withRequired(true).withShortName("g")
            .withArgument(abuilder.withName("count").withMinimum(1).withMaximum(1).create())
            .withDescription("Generations count.").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).withOption(labelOpt)
            .withOption(thresholdOpt).withOption(crosspntsOpt).withOption(mutrateOpt).withOption(mutrangeOpt)
            .withOption(mutprecOpt).withOption(popsizeOpt).withOption(gencntOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);

    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return;
    }

    String dataset = cmdLine.getValue(inputOpt).toString();

    int target;
    double threshold;
    int crosspnts;
    double mutrate;
    double mutrange;
    int mutprec;
    int popSize;
    int genCount;
    try {
        // Only the value conversions are guarded, so a NumberFormatException raised while the
        // job itself is running is not mistaken for a bad command line.
        target = Integer.parseInt(cmdLine.getValue(labelOpt).toString());
        threshold = cmdLine.hasOption(thresholdOpt)
                ? Double.parseDouble(cmdLine.getValue(thresholdOpt).toString())
                : 0.5;
        crosspnts = cmdLine.hasOption(crosspntsOpt)
                ? Integer.parseInt(cmdLine.getValue(crosspntsOpt).toString())
                : 1;
        mutrate = Double.parseDouble(cmdLine.getValue(mutrateOpt).toString());
        mutrange = cmdLine.hasOption(mutrangeOpt)
                ? Double.parseDouble(cmdLine.getValue(mutrangeOpt).toString())
                : 0.1;
        mutprec = cmdLine.hasOption(mutprecOpt)
                ? Integer.parseInt(cmdLine.getValue(mutprecOpt).toString())
                : 2;
        popSize = Integer.parseInt(cmdLine.getValue(popsizeOpt).toString());
        genCount = Integer.parseInt(cmdLine.getValue(gencntOpt).toString());
    } catch (NumberFormatException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    long start = System.currentTimeMillis();

    runJob(dataset, target, threshold, crosspnts, mutrate, mutrange, mutprec, popSize, genCount);

    long end = System.currentTimeMillis();

    printElapsedTime(end - start);
}

From source file:org.apache.mahout.ga.watchmaker.cd.tool.CDInfosTool.java

/**
 * Command-line entry point: loads the descriptors of the input, gathers information about it and
 * stores the resulting descriptions.
 *
 * <p>Prints the usage help instead when help is requested or when the options cannot be parsed.
 *
 * @param args raw command-line arguments
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Option inputOption = DefaultOptionCreator.inputOption().create();
    Option helpOption = DefaultOptionCreator.helpOption();

    GroupBuilder groupBuilder = new GroupBuilder();
    Group options = groupBuilder.withName("Options")
            .withOption(inputOption)
            .withOption(helpOption)
            .create();

    Parser optionParser = new Parser();
    optionParser.setGroup(options);

    try {
        CommandLine parsed = optionParser.parse(args);

        if (parsed.hasOption(helpOption)) {
            CommandLineUtil.printHelp(options);
        } else {
            Path input = new Path(parsed.getValue(inputOption).toString());
            Path output = new Path("output"); // TODO surely this should be configurable?

            // The file system is resolved from the input location.
            FileSystem fileSystem = FileSystem.get(input.toUri(), new Configuration());

            log.info("Loading Descriptors...");
            Descriptors descriptors = loadDescriptors(fileSystem, input);

            log.info("Gathering informations...");
            List<String> gathered = Lists.newArrayList();
            gatherInfos(descriptors, input, output, gathered);

            log.info("Storing Descriptions...");
            storeDescriptions(fileSystem, input, descriptors, gathered);
        }
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(options);
    }
}

From source file:org.apache.mahout.knn.tools.TestNewsGroupsKMeanLogisticRegression.java

/**
 * Parses the command line and stores the four required file locations in the fields
 * {@code inputFile}, {@code modelFile}, {@code centroidsFile} and {@code labelFile}.
 *
 * @param args raw command-line arguments
 * @return {@code true} when the fields were populated; {@code false} when
 *         {@code parseAndHelp} produced no command line
 */
boolean parseArgs(String[] args) {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder argBuilder = new ArgumentBuilder();

    Option helpOption = optionBuilder.withLongName("help").withDescription("print this list").create();

    Option inputOption = optionBuilder.withLongName("input").withShortName("i").withRequired(true)
            .withArgument(argBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get test data (encoded with tf-idf)")
            .create();

    Option modelOption = optionBuilder.withLongName("model").withShortName("m").withRequired(true)
            .withArgument(argBuilder.withName("model").withMaximum(1).create())
            .withDescription("where to get a model")
            .create();

    Option centroidsOption = optionBuilder.withLongName("centroids").withShortName("c").withRequired(true)
            .withArgument(argBuilder.withName("centroids").withMaximum(1).create())
            .withDescription("where to get the centroids seqfile")
            .create();

    Option labelsOption = optionBuilder.withLongName("labels").withShortName("l").withRequired(true)
            .withArgument(argBuilder.withName("labels").withMaximum(1).create())
            .withDescription("CSV file containing the cluster labels")
            .create();

    Group allOptions = new GroupBuilder()
            .withOption(helpOption)
            .withOption(inputOption)
            .withOption(modelOption)
            .withOption(centroidsOption)
            .withOption(labelsOption)
            .create();

    Parser cliParser = new Parser();
    cliParser.setGroup(allOptions);
    cliParser.setHelpOption(helpOption);
    cliParser.setHelpTrigger("--help");
    cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

    CommandLine parsed = cliParser.parseAndHelp(args);
    if (parsed != null) {
        inputFile = (String) parsed.getValue(inputOption);
        modelFile = (String) parsed.getValue(modelOption);
        centroidsFile = (String) parsed.getValue(centroidsOption);
        labelFile = (String) parsed.getValue(labelsOption);
        return true;
    }
    return false;
}

From source file:org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob.java

/**
 * Parses the command line into a map of option values.
 *
 * @param args raw command-line arguments
 * @return {@code null} when the arguments cannot be parsed (the error is logged and the help is
 *         printed); an empty map when help was requested; otherwise the map filled by
 *         {@code maybePut}
 */
public static Map<String, String> handleArgs(String[] args) {
    Option eigenInput = buildOption("eigenInput", "ei",
            "The Path for purported eigenVector input files (SequenceFile<WritableComparable,VectorWritable>.",
            null);
    Option corpusInput = buildOption("corpusInput", "ci",
            "The Path for corpus input files (SequenceFile<WritableComparable,VectorWritable>.");
    Option output = DefaultOptionCreator.outputOption().create();
    Option help = DefaultOptionCreator.helpOption();
    Option inMemory = buildOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have enough!)",
            "false");
    Option maxError = buildOption("maxError", "err", "Maximum acceptable error", "0.05");
    Option minEigenValue = buildOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector for",
            "0.0");

    Group options = new GroupBuilder().withName("Options")
            .withOption(eigenInput)
            .withOption(corpusInput)
            .withOption(help)
            .withOption(output)
            .withOption(inMemory)
            .withOption(maxError)
            .withOption(minEigenValue)
            .create();

    Parser optionParser = new Parser();
    optionParser.setGroup(options);

    CommandLine parsed;
    try {
        parsed = optionParser.parse(args);
    } catch (OptionException e) {
        log.error(e.getMessage());
        CommandLineUtil.printHelp(options);
        return null;
    }

    Map<String, String> result = new HashMap<String, String>();
    if (parsed.hasOption(help)) {
        // A help request prints the usage and leaves the map empty.
        CommandLineUtil.printHelp(options);
    } else {
        maybePut(result, parsed, eigenInput, corpusInput, help, output, inMemory, maxError,
                minEigenValue);
    }
    return result;
}

From source file:org.apache.mahout.regression.penalizedlinear.Job.java

/**
 * Parses the job's command line and prepares the static {@code jobArgs} array.
 *
 * <p>Without {@code --convert}, {@code jobArgs} is simply {@code args}. With {@code --convert},
 * {@code jobArgs} is a copy of {@code args} in which every literal {@code --convert} token is
 * dropped, the value following {@code --input} is replaced by the converted-input directory and the
 * value following {@code --output} is replaced by the output directory; {@code InputDriver.runJob}
 * is invoked when the {@code --input} token is encountered.
 *
 * <p>NOTE(review): in the {@code --convert} case the rewritten values are also written back into
 * {@code args}, as in the original implementation; confirm whether callers rely on that.
 *
 * @param args raw command-line arguments
 * @return {@code false} when {@code parseAndHelp} produced no command line, {@code true} otherwise
 */
private static boolean parseJobArgs(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription(
                    "where to get training data (Mahout sequence file of VectorWritable or white-spaced TEXT file); in each line, the first element is response; rest are predictors.")
            .create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option numOfCV = builder.withLongName("numOfCV")
            .withArgument(
                    argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    Option convert = builder.withLongName("convert")
            .withDescription(
                    "pre-processing step if the input file is not Mahout sequence files of VectorWritable: "
                            + "converting space-delimited TEXT file containing floating point numbers into "
                            + "Mahout sequence files of VectorWritable suitable for input of Map-Reduce job.")
            .create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(lambda).withOption(alpha).withOption(bias).withOption(numOfCV).withOption(convert)
            .create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    Path input = new Path((String) cmdLine.getValue(inputFile));
    Path output = new Path((String) cmdLine.getValue(outputFile), DIRECTORY_CONTAINING_CONVERTED_INPUT);
    if (cmdLine.hasOption(convert)) {
        // Size the copy by the number of tokens actually dropped. Assuming exactly one
        // "--convert" token left trailing nulls (flag repeated) or overflowed the array
        // (flag not present as a literal token).
        int convertTokens = 0;
        for (String arg : args) {
            if (arg.equals("--convert")) {
                convertTokens++;
            }
        }
        jobArgs = new String[args.length - convertTokens];
        int index = 0;
        for (int i = 0; i < args.length; ++i) {
            if (args[i].equals("--convert")) {
                continue;
            }
            jobArgs[index++] = args[i];
            // The value slots are overwritten in args itself so that the next iteration copies
            // the replacement value into jobArgs.
            if (args[i].equals("--input")) {
                args[i + 1] = output.toString();
                InputDriver.runJob(input, output, "org.apache.mahout.math.RandomAccessSparseVector");
            }
            if (args[i].equals("--output")) {
                args[i + 1] = (new Path((String) cmdLine.getValue(outputFile), DIRECTORY_CONTAINING_OUTPUT))
                        .toString();
            }
        }
    } else {
        jobArgs = args;
    }
    return true;
}

From source file:org.apache.mahout.regression.penalizedlinear.LinearCrossValidation.java

/**
 * Parses the command line into a new {@code LinearCrossValidationParameter} stored in
 * {@code parameter}, and records the {@code input} and {@code output} locations.
 *
 * <p>The values of {@code --independent} and {@code --interaction} are each joined with commas.
 *
 * @param args raw command-line arguments
 * @return {@code false} when {@code parseAndHelp} produced no command line, when
 *         {@code --numOfCV} or {@code --alpha} is not a valid number, when {@code processLambda}
 *         fails, or when alpha is outside [0, 1] or numOfCV is outside [1, 20]; {@code true}
 *         otherwise
 */
private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get training data (CSV or white-spaced TEXT file)").create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option dependent = builder.withLongName("dependent").withRequired(true)
            .withArgument(argumentBuilder.withName("dependent").withMinimum(1).withMaximum(1).create())
            .withDescription("the dependent features").create();

    Option independent = builder.withLongName("independent").withRequired(true)
            .withArgument(argumentBuilder.withName("independent").create())
            .withDescription("the independent features").create();

    Option interaction = builder.withLongName("interaction").withRequired(true)
            .withArgument(argumentBuilder.withName("interaction").withMinimum(0).create())
            .withDescription(
                    "the interactions of features, the format is: feature1:feature2 (identical features are OK)")
            .create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Option numOfCV = builder.withLongName("numOfCV")
            .withArgument(
                    argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(dependent).withOption(independent).withOption(interaction).withOption(bias)
            .withOption(lambda).withOption(alpha).withOption(numOfCV).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new LinearCrossValidationParameter();
    try {
        parameter.numOfCV = Integer.parseInt((String) cmdLine.getValue(numOfCV));
        parameter.alpha = Float.parseFloat((String) cmdLine.getValue(alpha));
    } catch (NumberFormatException e) {
        // A non-numeric value is reported like any other invalid parameter instead of
        // escaping as an unchecked exception.
        log.error("numOfCV must be an integer and alpha must be a number", e);
        return false;
    }
    parameter.intercept = cmdLine.hasOption(bias);
    parameter.dependent = (String) cmdLine.getValue(dependent);
    parameter.independent = joinWithCommas(cmdLine.getValues(independent));
    parameter.interaction = joinWithCommas(cmdLine.getValues(interaction));

    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numofCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);
    return true;
}

/** Concatenates the string forms of {@code values}, separated by single commas. */
private static String joinWithCommas(Iterable<?> values) {
    StringBuilder joined = new StringBuilder();
    for (Object value : values) {
        joined.append(value.toString()).append(',');
    }
    // Drop the trailing separator; an empty input yields an empty string.
    if (joined.length() > 0) {
        joined.setLength(joined.length() - 1);
    }
    return joined.toString();
}

From source file:org.apache.mahout.regression.penalizedlinear.LinearRegularizePath.java

/**
 * Parses the command line into a new {@code LinearRegularizePathParameter} stored in
 * {@code parameter}, and records the {@code input} and {@code output} locations.
 *
 * <p>{@code numOfCV} is fixed to 1 here. The values of {@code --independent} and
 * {@code --interaction} are each joined with commas.
 *
 * @param args raw command-line arguments
 * @return {@code false} when {@code parseAndHelp} produced no command line, when {@code --alpha}
 *         is not a valid number, when {@code processLambda} fails, or when alpha is outside
 *         [0, 1]; {@code true} otherwise
 */
private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get training data (CSV or white-spaced TEXT file)").create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option dependent = builder.withLongName("dependent").withRequired(true)
            .withArgument(argumentBuilder.withName("dependent").withMinimum(1).withMaximum(1).create())
            .withDescription("the dependent features").create();

    Option independent = builder.withLongName("independent").withRequired(true)
            .withArgument(argumentBuilder.withName("independent").create())
            .withDescription("the independent features").create();

    Option interaction = builder.withLongName("interaction").withRequired(true)
            .withArgument(argumentBuilder.withName("interaction").withMinimum(0).create())
            .withDescription(
                    "the interactions of features, the format is: feature1:feature2 (identical features are OK)")
            .create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(dependent).withOption(independent).withOption(interaction).withOption(bias)
            .withOption(lambda).withOption(alpha).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new LinearRegularizePathParameter();
    parameter.numOfCV = 1;
    try {
        parameter.alpha = Float.parseFloat((String) cmdLine.getValue(alpha));
    } catch (NumberFormatException e) {
        // A non-numeric value is reported like any other invalid parameter instead of
        // escaping as an unchecked exception.
        log.error("alpha must be a number", e);
        return false;
    }
    parameter.intercept = cmdLine.hasOption(bias);
    parameter.dependent = (String) cmdLine.getValue(dependent);
    parameter.independent = joinWithCommas(cmdLine.getValues(independent));
    parameter.interaction = joinWithCommas(cmdLine.getValues(interaction));

    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numofCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);
    return true;
}

/** Concatenates the string forms of {@code values}, separated by single commas. */
private static String joinWithCommas(Iterable<?> values) {
    StringBuilder joined = new StringBuilder();
    for (Object value : values) {
        joined.append(value.toString()).append(',');
    }
    // Drop the trailing separator; an empty input yields an empty string.
    if (joined.length() > 0) {
        joined.setLength(joined.length() - 1);
    }
    return joined.toString();
}

From source file:org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java

/**
 * Parses the command line into a new {@code PenalizedLinearParameter} stored in
 * {@code parameter}, and records the {@code input} and {@code output} locations.
 *
 * @param args raw command-line arguments
 * @return {@code false} when {@code parseAndHelp} produced no command line, when
 *         {@code --numOfCV} or {@code --alpha} is not a valid number, when {@code processLambda}
 *         fails, or when alpha is outside [0, 1] or numOfCV is outside [1, 20]; {@code true}
 *         otherwise
 */
private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription(
                    "where to get training data (Mahout sequence file of VectorWritable); in each line, the first element is response; rest are predictors.")
            .create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option numOfCV = builder.withLongName("numOfCV")
            .withArgument(
                    argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(lambda).withOption(alpha).withOption(bias).withOption(numOfCV).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new PenalizedLinearParameter();
    try {
        parameter.setNumOfCV(Integer.parseInt((String) cmdLine.getValue(numOfCV)));
        parameter.setAlpha(Float.parseFloat((String) cmdLine.getValue(alpha)));
    } catch (NumberFormatException e) {
        // A non-numeric value is reported like any other invalid parameter instead of
        // escaping as an unchecked exception.
        log.error("numOfCV must be an integer and alpha must be a number", e);
        return false;
    }
    parameter.setIntercept(cmdLine.hasOption(bias));

    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numOfCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);

    return true;
}

From source file:org.apache.mahout.text.SparseVectorsFromSequenceFiles.java

/**
 * Command-line entry point: tokenizes the documents found in the input sequence-file directory,
 * builds term-frequency vectors from them and, unless the weight is "tf", runs the TF-IDF
 * conversion on the result.
 *
 * <p>All options are parsed and validated before the output directory is touched. The output
 * directory is only overwritten when the {@code --overwrite} flag is given.
 *
 * @param args command-line arguments; see the option definitions below ({@code --help} prints them)
 * @throws Exception if a numeric option cannot be parsed, the analyzer class cannot be loaded or
 *         instantiated, or one of the processing steps fails
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("input dir containing the documents in sequence file format").withShortName("i")
            .create();

    Option outputDirOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();
    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.")
            .withShortName("x").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();
    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();
    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
            .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
            .withOption(helpOpt).withOption(sequentialAccessVectorOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                // Deliberately lenient: an unparsable n-gram size falls back to the default of 1.
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            // asSubclass fails right here with a ClassCastException when the named class is not
            // an Analyzer, instead of deferring the failure via an unchecked cast.
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            analyzerClass.getDeclaredConstructor().newInstance();
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if (wString.equalsIgnoreCase("tf")) {
                processIdf = false;
            } else if (wString.equalsIgnoreCase("tfidf")) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if (power.equals("INF")) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean sequentialAccessOutput = cmdLine.hasOption(sequentialAccessVectorOpt);

        // Only now, with every option validated, touch the output directory - and only when the
        // user explicitly asked for it. Previously the directory was overwritten unconditionally,
        // which made the --overwrite flag meaningless.
        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.overwriteOutput(outputDir);
        }

        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);

        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize,
                minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent,
                    norm, sequentialAccessOutput, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.text.wikipedia.WikipediaXmlSplitter.java

/**
 * Command-line entry point: splits a Wikipedia XML dump (plain or .bz2) into chunk files named
 * {@code chunk-NNNN.xml} under the output directory. Every chunk starts with a fixed
 * {@code <mediawiki>} header and ends with a closing {@code </mediawiki>} tag.
 *
 * <p>A chunk is written once its accumulated text exceeds the requested chunk size. Pages left
 * over when the dump ends are written as a final, smaller chunk. At most {@code numChunks}
 * chunks are produced when that option is given.
 *
 * @param args command-line arguments; see the option definitions below
 * @throws IOException if the dump cannot be read or a chunk cannot be written
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")

            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    // Computed as a long: 1024 * 1024 * n overflows int for n >= 2048 and would yield a
    // negative or wrapped threshold.
    long chunkSize = 1024L * 1024L * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n" + "    <case>first-letter</case>\n"
            + "    <namespaces>\n" + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n" + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n" + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n" + "    </namespaces>\n"
            + "  </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);

    // If the specified path for the input file is incorrect, return immediately
    if (!dumpFile.exists()) {
        log.error("Input file path {} doesn't exist", dumpFilePath);
        return;
    }

    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }

    int fileNumber = 0;
    // True while 'content' holds at least one page that has not been written to a chunk yet.
    boolean pagesPending = false;
    try {
        while (it.hasNext()) {
            String thisLine = it.next();
            if (thisLine.trim().startsWith("<page>")) {
                boolean end = false;
                while (!thisLine.trim().startsWith("</page>")) {
                    content.append(thisLine).append('\n');
                    if (it.hasNext()) {
                        thisLine = it.next();
                    } else {
                        end = true;
                        break;
                    }
                }
                // When the dump ended inside a page, thisLine was already appended in the loop
                // above; appending it again would duplicate the last line.
                if (!end) {
                    content.append(thisLine).append('\n');
                }
                pagesPending = true;

                if (content.length() > chunkSize || end) {
                    fileNumber++;
                    writeChunk(fs, outputDirPath, decimalFormatter.format(fileNumber), content);
                    pagesPending = false;
                    if (fileNumber >= numChunks) {
                        break;
                    }
                    content = new StringBuilder();
                    content.append(header);
                }
            }
        }

        // A well-formed dump ends right after a </page>, so the pages collected since the last
        // full chunk never trip the size check above; flush them here instead of dropping them.
        if (pagesPending && fileNumber < numChunks) {
            fileNumber++;
            writeChunk(fs, outputDirPath, decimalFormatter.format(fileNumber), content);
        }
    } finally {
        // The iterator is otherwise left open when the loop stops early on numChunks.
        Closeables.close(it, true);
    }
}

/**
 * Terminates the accumulated chunk with the closing mediawiki tag and writes it, UTF-8 encoded,
 * to {@code outputDirPath/chunk-<chunkId>.xml}.
 */
private static void writeChunk(FileSystem fs, String outputDirPath, String chunkId, StringBuilder content)
        throws IOException {
    content.append("</mediawiki>");
    String filename = outputDirPath + "/chunk-" + chunkId + ".xml";
    BufferedWriter chunkWriter = new BufferedWriter(
            new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
    try {
        chunkWriter.write(content.toString(), 0, content.length());
    } finally {
        Closeables.close(chunkWriter, false);
    }
}