Example usage for org.apache.commons.cli2.commandline Parser Parser

List of usage examples for org.apache.commons.cli2.commandline Parser Parser

Introduction

On this page you can find example usage of org.apache.commons.cli2.commandline Parser Parser.

Prototype

Parser

Source Link

Usage

From source file:org.apache.mahout.ga.watchmaker.cd.CDGA.java

/**
 * Command-line entry point for the classifier-discovery genetic algorithm.
 * Parses the options, runs the GA job, and prints the elapsed wall-clock time.
 *
 * Required options: input, label, mutrate, popsize, gencnt.
 * Optional options (defaults applied below): threshold (0.5), crosspnts (1),
 * mutrange (0.1), mutprec (2).
 *
 * On a parsing error the usage/help text is printed and the method returns.
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = DefaultOptionCreator.inputOption().create();

    Option labelOpt = obuilder.withLongName("label").withRequired(true).withShortName("l")
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("label's index.").create();

    Option thresholdOpt = obuilder.withLongName("threshold").withRequired(false).withShortName("t")
            .withArgument(abuilder.withName("threshold").withMinimum(1).withMaximum(1).create())
            .withDescription("Condition activation threshold, default = 0.5.").create();

    Option crosspntsOpt = obuilder.withLongName("crosspnts").withRequired(false).withShortName("cp")
            .withArgument(abuilder.withName("points").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of crossover points to use, default = 1.").create();

    // BUGFIX: the argument placeholder was previously named "true", which rendered
    // as "--mutrate <true>" in the generated help text; "rate" matches the option.
    Option mutrateOpt = obuilder.withLongName("mutrate").withRequired(true).withShortName("m")
            .withArgument(abuilder.withName("rate").withMinimum(1).withMaximum(1).create())
            .withDescription("Mutation rate (float).").create();

    Option mutrangeOpt = obuilder.withLongName("mutrange").withRequired(false).withShortName("mr")
            .withArgument(abuilder.withName("range").withMinimum(1).withMaximum(1).create())
            .withDescription("Mutation range, default = 0.1 (10%).").create();

    Option mutprecOpt = obuilder.withLongName("mutprec").withRequired(false).withShortName("mp")
            .withArgument(abuilder.withName("precision").withMinimum(1).withMaximum(1).create())
            .withDescription("Mutation precision, default = 2.").create();

    Option popsizeOpt = obuilder.withLongName("popsize").withRequired(true).withShortName("p")
            .withArgument(abuilder.withName("size").withMinimum(1).withMaximum(1).create())
            .withDescription("Population size.").create();

    Option gencntOpt = obuilder.withLongName("gencnt").withRequired(true).withShortName("g")
            .withArgument(abuilder.withName("count").withMinimum(1).withMaximum(1).create())
            .withDescription("Generations count.").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).withOption(labelOpt)
            .withOption(thresholdOpt).withOption(crosspntsOpt).withOption(mutrateOpt).withOption(mutrangeOpt)
            .withOption(mutprecOpt).withOption(popsizeOpt).withOption(gencntOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);

    try {
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        // Extract values; optional options fall back to their documented defaults.
        String dataset = cmdLine.getValue(inputOpt).toString();
        int target = Integer.parseInt(cmdLine.getValue(labelOpt).toString());
        double threshold = cmdLine.hasOption(thresholdOpt)
                ? Double.parseDouble(cmdLine.getValue(thresholdOpt).toString())
                : 0.5;
        int crosspnts = cmdLine.hasOption(crosspntsOpt)
                ? Integer.parseInt(cmdLine.getValue(crosspntsOpt).toString())
                : 1;
        double mutrate = Double.parseDouble(cmdLine.getValue(mutrateOpt).toString());
        double mutrange = cmdLine.hasOption(mutrangeOpt)
                ? Double.parseDouble(cmdLine.getValue(mutrangeOpt).toString())
                : 0.1;
        int mutprec = cmdLine.hasOption(mutprecOpt) ? Integer.parseInt(cmdLine.getValue(mutprecOpt).toString())
                : 2;
        int popSize = Integer.parseInt(cmdLine.getValue(popsizeOpt).toString());
        int genCount = Integer.parseInt(cmdLine.getValue(gencntOpt).toString());

        // Time the whole GA run.
        long start = System.currentTimeMillis();

        runJob(dataset, target, threshold, crosspnts, mutrate, mutrange, mutprec, popSize, genCount);

        long end = System.currentTimeMillis();

        printElapsedTime(end - start);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.ga.watchmaker.cd.tool.CDInfosTool.java

/**
 * CLI entry point: loads the descriptors from the given input path, gathers
 * attribute information, and stores the generated descriptions alongside it.
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Option inputOption = DefaultOptionCreator.inputOption().create();
    Option helpOption = DefaultOptionCreator.helpOption();

    Group options = new GroupBuilder().withName("Options").withOption(inputOption).withOption(helpOption)
            .create();

    Parser cliParser = new Parser();
    cliParser.setGroup(options);
    try {
        CommandLine commandLine = cliParser.parse(args);

        if (commandLine.hasOption(helpOption)) {
            CommandLineUtil.printHelp(options);
            return;
        }

        Path inputPath = new Path(commandLine.getValue(inputOption).toString());
        Path outputPath = new Path("output"); // TODO surely this should be configurable?

        FileSystem fs = FileSystem.get(inputPath.toUri(), new Configuration());

        log.info("Loading Descriptors...");
        Descriptors descriptors = loadDescriptors(fs, inputPath);

        log.info("Gathering informations...");
        List<String> descriptions = Lists.newArrayList();
        gatherInfos(descriptors, inputPath, outputPath, descriptions);

        log.info("Storing Descriptions...");
        storeDescriptions(fs, inputPath, descriptors, descriptions);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(options);
    }
}

From source file:org.apache.mahout.knn.tools.TestNewsGroupsKMeanLogisticRegression.java

/**
 * Parses the command-line options and stores the input/model/centroids/labels
 * file paths into the corresponding instance fields.
 *
 * @param args raw command-line arguments
 * @return true when parsing succeeded; false when it failed or help was requested
 */
boolean parseArgs(String[] args) {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder argBuilder = new ArgumentBuilder();

    Option helpOpt = optionBuilder.withLongName("help").withDescription("print this list").create();

    Option inputOpt = optionBuilder.withLongName("input").withShortName("i").withRequired(true)
            .withArgument(argBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get test data (encoded with tf-idf)").create();

    Option modelOpt = optionBuilder.withLongName("model").withShortName("m").withRequired(true)
            .withArgument(argBuilder.withName("model").withMaximum(1).create())
            .withDescription("where to get a model").create();

    Option centroidsOpt = optionBuilder.withLongName("centroids").withShortName("c").withRequired(true)
            .withArgument(argBuilder.withName("centroids").withMaximum(1).create())
            .withDescription("where to get the centroids seqfile").create();

    Option labelsOpt = optionBuilder.withLongName("labels").withShortName("l").withRequired(true)
            .withArgument(argBuilder.withName("labels").withMaximum(1).create())
            .withDescription("CSV file containing the cluster labels").create();

    Group options = new GroupBuilder().withOption(helpOpt).withOption(inputOpt).withOption(modelOpt)
            .withOption(centroidsOpt).withOption(labelsOpt).create();

    Parser cliParser = new Parser();
    cliParser.setHelpOption(helpOpt);
    cliParser.setHelpTrigger("--help");
    cliParser.setGroup(options);
    cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

    CommandLine parsed = cliParser.parseAndHelp(args);
    if (parsed == null) {
        // Parsing failed or help was requested; usage has already been printed.
        return false;
    }

    inputFile = (String) parsed.getValue(inputOpt);
    modelFile = (String) parsed.getValue(modelOpt);
    centroidsFile = (String) parsed.getValue(centroidsOpt);
    labelFile = (String) parsed.getValue(labelsOpt);
    return true;
}

From source file:org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob.java

/**
 * Parses the eigen-verification command-line arguments into a map keyed by
 * option name.
 *
 * @param args raw command-line arguments
 * @return the populated argument map; an empty map when help was requested;
 *         {@code null} when parsing failed (help text is printed in that case)
 */
public static Map<String, String> handleArgs(String[] args) {
    Option eigenInputOpt = buildOption("eigenInput", "ei",
            "The Path for purported eigenVector input files (SequenceFile<WritableComparable,VectorWritable>.",
            null);
    Option corpusInputOpt = buildOption("corpusInput", "ci",
            "The Path for corpus input files (SequenceFile<WritableComparable,VectorWritable>.");
    Option outOpt = DefaultOptionCreator.outputOption().create();
    Option helpOpt = DefaultOptionCreator.helpOption();
    Option inMemOpt = buildOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have enough!)",
            "false");
    Option errorOpt = buildOption("maxError", "err", "Maximum acceptable error", "0.05");
    Option minEigenValOpt = buildOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector for",
            "0.0");

    GroupBuilder gBuilder = new GroupBuilder().withName("Options").withOption(eigenInputOpt)
            .withOption(corpusInputOpt).withOption(helpOpt).withOption(outOpt).withOption(inMemOpt)
            .withOption(errorOpt).withOption(minEigenValOpt);
    Group group = gBuilder.create();

    Map<String, String> argMap = new HashMap<String, String>();

    CommandLine cmdLine;
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        // BUGFIX: log the exception itself so the stack trace is preserved;
        // previously only e.getMessage() was logged, dropping the cause.
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return null;
    }
    if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return argMap;
    }
    maybePut(argMap, cmdLine, eigenInputOpt, corpusInputOpt, helpOpt, outOpt, inMemOpt, errorOpt,
            minEigenValOpt);
    return argMap;
}

From source file:org.apache.mahout.regression.penalizedlinear.Job.java

/**
 * Parses the penalized-linear job arguments and prepares {@code jobArgs} for the
 * downstream driver.  When {@code --convert} is present, the raw text input is
 * first converted to Mahout sequence files of VectorWritable, the
 * {@code --convert} flag is stripped from the forwarded arguments, and the
 * input/output paths are rewritten to point at the converted data.
 *
 * @param args raw command-line arguments; NOTE: mutated in place when
 *             {@code --convert} is given (the values following --input/--output
 *             are overwritten with the rewritten paths)
 * @return true when parsing succeeded; false when parsing failed or help was requested
 * @throws IOException if the conversion job fails
 * @throws InterruptedException if the conversion job is interrupted
 * @throws ClassNotFoundException if the conversion job classes cannot be loaded
 */
private static boolean parseJobArgs(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription(
                    "where to get training data (Mahout sequence file of VectorWritable or white-spaced TEXT file); in each line, the first element is response; rest are predictors.")
            .create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    // Flag option: presence alone means "include a bias term".
    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option numOfCV = builder.withLongName("numOfCV")
            .withArgument(
                    argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    // Flag option: triggers the TEXT -> sequence-file pre-processing step below.
    Option convert = builder.withLongName("convert")
            .withDescription(
                    "pre-processing step if the input file is not Mahout sequence files of VectorWritable: "
                            + "converting space-delimited TEXT file containing floating point numbers into "
                            + "Mahout sequence files of VectorWritable suitable for input of Map-Reduce job.")
            .create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(lambda).withOption(alpha).withOption(bias).withOption(numOfCV).withOption(convert)
            .create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        // Parsing failed or help was requested; usage has already been printed.
        return false;
    }

    Path input = new Path((String) cmdLine.getValue(inputFile));
    Path output = new Path((String) cmdLine.getValue(outputFile), DIRECTORY_CONTAINING_CONVERTED_INPUT);
    if (cmdLine.hasOption(convert)) {
        // Forward every argument except the --convert flag itself
        // (hence length - 1; assumes --convert appears exactly once).
        jobArgs = new String[args.length - 1];
        int index = 0;
        for (int i = 0; i < args.length; ++i) {
            if (args[i].equals("--convert")) {
                continue;
            }
            jobArgs[index++] = args[i];
            // NOTE(review): assumes flags are given in their long form
            // ("--input"/"--output"); short forms would not be rewritten — confirm.
            if (args[i].equals("--input")) {
                // Point the job at the converted data and run the conversion now.
                args[i + 1] = output.toString();
                InputDriver.runJob(input, output, "org.apache.mahout.math.RandomAccessSparseVector");
            }
            if (args[i].equals("--output")) {
                // Redirect the job output under a dedicated subdirectory.
                args[i + 1] = (new Path((String) cmdLine.getValue(outputFile), DIRECTORY_CONTAINING_OUTPUT))
                        .toString();
            }
        }
    } else {
        // No conversion requested: forward the arguments unchanged.
        jobArgs = args;
    }
    return true;
}

From source file:org.apache.mahout.regression.penalizedlinear.LinearCrossValidation.java

/**
 * Parses the cross-validation command-line options into {@code parameter} and
 * the {@code input}/{@code output} fields.
 *
 * @param args raw command-line arguments
 * @return true when parsing succeeded and the values are valid; false otherwise
 */
private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get training data (CSV or white-spaced TEXT file)").create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option dependent = builder.withLongName("dependent").withRequired(true)
            .withArgument(argumentBuilder.withName("dependent").withMinimum(1).withMaximum(1).create())
            .withDescription("the dependent features").create();

    Option independent = builder.withLongName("independent").withRequired(true)
            .withArgument(argumentBuilder.withName("independent").create())
            .withDescription("the independent features").create();

    Option interaction = builder.withLongName("interaction").withRequired(true)
            .withArgument(argumentBuilder.withName("interaction").withMinimum(0).create())
            .withDescription(
                    "the interactions of features, the format is: feature1:feature2 (identical features are OK)")
            .create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Option numOfCV = builder.withLongName("numOfCV")
            .withArgument(
                    argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(dependent).withOption(independent).withOption(interaction).withOption(bias)
            .withOption(lambda).withOption(alpha).withOption(numOfCV).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new LinearCrossValidationParameter();
    parameter.numOfCV = Integer.parseInt((String) cmdLine.getValue(numOfCV));
    parameter.alpha = Float.parseFloat((String) cmdLine.getValue(alpha));
    parameter.intercept = cmdLine.hasOption(bias);
    parameter.dependent = (String) cmdLine.getValue(dependent);
    // Join the multi-valued options with commas.  StringBuilder replaces the
    // previous O(n^2) repeated String concatenation and trailing-comma strip;
    // the resulting strings are identical (empty when no values are given).
    StringBuilder independentJoined = new StringBuilder();
    for (Object x : cmdLine.getValues(independent)) {
        if (independentJoined.length() > 0) {
            independentJoined.append(',');
        }
        independentJoined.append(x.toString());
    }
    parameter.independent = independentJoined.toString();
    StringBuilder interactionJoined = new StringBuilder();
    for (Object x : cmdLine.getValues(interaction)) {
        if (interactionJoined.length() > 0) {
            interactionJoined.append(',');
        }
        interactionJoined.append(x.toString());
    }
    parameter.interaction = interactionJoined.toString();

    // Validate: lambda positive & increasing, alpha within [0, 1], numOfCV within [1, 20].
    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numofCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);
    return true;
}

From source file:org.apache.mahout.regression.penalizedlinear.LinearRegularizePath.java

/**
 * Parses the regularize-path command-line options into {@code parameter} and
 * the {@code input}/{@code output} fields.  Cross-validation is fixed at 1
 * (no CV for the regularization-path computation).
 *
 * @param args raw command-line arguments
 * @return true when parsing succeeded and the values are valid; false otherwise
 */
private boolean parseArgs(String[] args) {
    DefaultOptionBuilder builder = new DefaultOptionBuilder();

    Option help = builder.withLongName("help").withDescription("print this list").create();

    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    Option inputFile = builder.withLongName("input").withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get training data (CSV or white-spaced TEXT file)").create();

    Option outputFile = builder.withLongName("output").withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option dependent = builder.withLongName("dependent").withRequired(true)
            .withArgument(argumentBuilder.withName("dependent").withMinimum(1).withMaximum(1).create())
            .withDescription("the dependent features").create();

    Option independent = builder.withLongName("independent").withRequired(true)
            .withArgument(argumentBuilder.withName("independent").create())
            .withDescription("the independent features").create();

    Option interaction = builder.withLongName("interaction").withRequired(true)
            .withArgument(argumentBuilder.withName("interaction").withMinimum(0).create())
            .withDescription(
                    "the interactions of features, the format is: feature1:feature2 (identical features are OK)")
            .create();

    Option bias = builder.withLongName("bias").withDescription("include a bias term").create();

    Option lambda = builder.withLongName("lambda")
            .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alpha = builder.withLongName("alpha")
            .withArgument(
                    argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile)
            .withOption(dependent).withOption(independent).withOption(interaction).withOption(bias)
            .withOption(lambda).withOption(alpha).create();

    Parser parser = new Parser();
    parser.setHelpOption(help);
    parser.setHelpTrigger("--help");
    parser.setGroup(normalArgs);
    parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
    CommandLine cmdLine = parser.parseAndHelp(args);
    if (cmdLine == null) {
        return false;
    }

    parameter = new LinearRegularizePathParameter();
    parameter.numOfCV = 1;
    parameter.alpha = Float.parseFloat((String) cmdLine.getValue(alpha));
    parameter.intercept = cmdLine.hasOption(bias);
    parameter.dependent = (String) cmdLine.getValue(dependent);
    // Join the multi-valued options with commas.  StringBuilder replaces the
    // previous O(n^2) repeated String concatenation and trailing-comma strip;
    // the resulting strings are identical (empty when no values are given).
    StringBuilder independentJoined = new StringBuilder();
    for (Object x : cmdLine.getValues(independent)) {
        if (independentJoined.length() > 0) {
            independentJoined.append(',');
        }
        independentJoined.append(x.toString());
    }
    parameter.independent = independentJoined.toString();
    StringBuilder interactionJoined = new StringBuilder();
    for (Object x : cmdLine.getValues(interaction)) {
        if (interactionJoined.length() > 0) {
            interactionJoined.append(',');
        }
        interactionJoined.append(x.toString());
    }
    parameter.interaction = interactionJoined.toString();

    // Validate: lambda positive & increasing, alpha within [0, 1], numOfCV within [1, 20].
    if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numofCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputFile);
    output = (String) cmdLine.getValue(outputFile);
    return true;
}

From source file:org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java

/**
 * Parses the penalized-linear driver options into {@code parameter} and the
 * {@code input}/{@code output} fields.
 *
 * @param args raw command-line arguments
 * @return true when parsing succeeded and the values are valid; false otherwise
 */
private boolean parseArgs(String[] args) {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder argBuilder = new ArgumentBuilder();

    Option helpOpt = optionBuilder.withLongName("help").withDescription("print this list").create();

    Option inputOpt = optionBuilder.withLongName("input").withRequired(true)
            .withArgument(argBuilder.withName("input").withMaximum(1).create())
            .withDescription(
                    "where to get training data (Mahout sequence file of VectorWritable); in each line, the first element is response; rest are predictors.")
            .create();

    Option outputOpt = optionBuilder.withLongName("output").withRequired(true)
            .withArgument(argBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to get results").create();

    Option lambdaOpt = optionBuilder.withLongName("lambda")
            .withArgument(argBuilder.withName("lambda").withDefault("0").withMinimum(1).create())
            .withDescription("an increasing positive sequence of penalty coefficient, "
                    + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.")
            .create();

    Option alphaOpt = optionBuilder.withLongName("alpha")
            .withArgument(
                    argBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create())
            .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create();

    Option biasOpt = optionBuilder.withLongName("bias").withDescription("include a bias term").create();

    Option numOfCVOpt = optionBuilder.withLongName("numOfCV")
            .withArgument(
                    argBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create())
            .withDescription("number of cross validation, the rule of thumb is 5 or 10").create();

    Group options = new GroupBuilder().withOption(helpOpt).withOption(inputOpt).withOption(outputOpt)
            .withOption(lambdaOpt).withOption(alphaOpt).withOption(biasOpt).withOption(numOfCVOpt).create();

    Parser cliParser = new Parser();
    cliParser.setHelpOption(helpOpt);
    cliParser.setHelpTrigger("--help");
    cliParser.setGroup(options);
    cliParser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));

    CommandLine cmdLine = cliParser.parseAndHelp(args);
    if (cmdLine == null) {
        // Parsing failed or help was requested; usage has already been printed.
        return false;
    }

    parameter = new PenalizedLinearParameter();
    parameter.setNumOfCV(Integer.parseInt((String) cmdLine.getValue(numOfCVOpt)));
    parameter.setAlpha(Float.parseFloat((String) cmdLine.getValue(alphaOpt)));
    parameter.setIntercept(cmdLine.hasOption(biasOpt));

    // Reject invalid settings: lambda must be a positive increasing sequence,
    // alpha within [0, 1], and numOfCV within [1, 20].
    if (!processLambda(parameter, cmdLine, lambdaOpt) || parameter.alpha < 0.0 || parameter.alpha > 1.0
            || parameter.numOfCV < 1 || parameter.numOfCV > 20) {
        log.error(
                "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numOfCV <= 20");
        return false;
    }

    input = (String) cmdLine.getValue(inputOpt);
    output = (String) cmdLine.getValue(outputOpt);

    return true;
}

From source file:org.apache.mahout.text.SparseVectorsFromSequenceFiles.java

/**
 * Command-line entry point that converts a directory of documents stored as
 * SequenceFiles into sparse term-frequency (TF) or TF-IDF vectors.
 * <p>
 * Pipeline: tokenize documents with the configured Lucene {@code Analyzer},
 * build term-frequency vectors (optionally with n-grams filtered by a
 * log-likelihood-ratio threshold), then optionally convert to TF-IDF with
 * document-frequency pruning.
 *
 * @param args command-line arguments; see the option descriptions below
 * @throws Exception on reflection failures instantiating the analyzer or on
 *         Hadoop I/O errors; {@link OptionException}s from bad arguments are
 *         caught and turned into a printed help message
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("input dir containing the documents in sequence file format").withShortName("i")
            .create();

    Option outputDirOpt = obuilder.withLongName("output").withRequired(true)
            .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();
    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.")
            .withShortName("x").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();
    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();
    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
            .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
            .withOption(helpOpt).withOption(sequentialAccessVectorOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        // Dictionary chunk size in MB; bounds are advisory only (not validated here).
        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                // Fall back to unigrams rather than aborting the whole job.
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        // BUGFIX: the output directory used to be overwritten unconditionally
        // further below, which made this --overwrite (-ow) flag a no-op and
        // silently destroyed existing output. Overwriting now happens only here,
        // when the user explicitly asks for it.
        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.overwriteOutput(outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            // Unchecked cast: Class.forName cannot be statically proven to yield
            // an Analyzer subclass; the newInstance() probe below fails fast if
            // the class is unusable.
            analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            analyzerClass.newInstance();
        }

        // TF weighting skips the IDF pass entirely; TFIDF (the default) runs it.
        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if (wString.equalsIgnoreCase("tf")) {
                processIdf = false;
            } else if (wString.equalsIgnoreCase("tfidf")) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }

        // "INF" selects the infinity norm; any other value is parsed as a float.
        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if (power.equals("INF")) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize,
                minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent,
                    norm, sequentialAccessOutput, reduceTasks);
        }
    } catch (OptionException e) {
        // Bad arguments: report the problem and show usage instead of crashing.
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.text.wikipedia.WikipediaXmlSplitter.java

/**
 * Command-line entry point that splits a (possibly bzip2-compressed) Wikipedia
 * XML dump into numbered chunk files, each wrapped in a valid
 * {@code <mediawiki>} envelope so every chunk is independently parseable.
 * <p>
 * Chunks are written to the given output directory (local FS, HDFS, or S3)
 * as {@code chunk-NNNN.xml}, each roughly {@code chunkSize} MB of page
 * content, until the dump is exhausted or {@code numChunks} files exist.
 *
 * @param args command-line arguments; see the option descriptions below
 * @throws IOException on failures reading the dump or writing chunks
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")

            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    // Both s3 (block) and s3n (native) schemes share the same credentials.
    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    // NOTE: int arithmetic overflows for chunk sizes >= 2048 MB; values in the
    // documented 100-10000 MB range above ~2 GB would go negative.
    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    // Envelope prepended to every chunk so each file is a valid standalone
    // MediaWiki export document.
    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n" + "    <case>first-letter</case>\n"
            + "    <namespaces>\n" + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n" + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n" + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n" + "    </namespaces>\n"
            + "  </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);

    // If the specified path for the input file is incorrect, return immediately
    if (!dumpFile.exists()) {
        log.error("Input file path {} doesn't exist", dumpFilePath);
        return;
    }

    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    // BUGFIX: the line iterator (and its underlying, possibly-compressed
    // stream) was never closed before -- leaking a file handle, notably on the
    // early break once numChunks files have been written.
    try {
        int fileNumber = 0;
        while (it.hasNext()) {
            String thisLine = it.next();
            if (thisLine.trim().startsWith("<page>")) {
                // Copy one <page>...</page> element verbatim into the buffer;
                // 'end' marks a dump that terminates mid-page.
                boolean end = false;
                while (!thisLine.trim().startsWith("</page>")) {
                    content.append(thisLine).append('\n');
                    if (it.hasNext()) {
                        thisLine = it.next();
                    } else {
                        end = true;
                        break;
                    }
                }
                content.append(thisLine).append('\n');

                // Flush a chunk once the buffer exceeds the target size (char
                // count approximates bytes) or the dump ended.
                if (content.length() > chunkSize || end) {
                    content.append("</mediawiki>");
                    fileNumber++;
                    String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml";
                    BufferedWriter chunkWriter = new BufferedWriter(
                            new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
                    try {
                        chunkWriter.write(content.toString(), 0, content.length());
                    } finally {
                        Closeables.close(chunkWriter, false);
                    }
                    if (fileNumber >= numChunks) {
                        break;
                    }
                    content = new StringBuilder();
                    content.append(header);
                }
            }
        }
    } finally {
        // Swallow close() failures: the chunks are already safely written.
        Closeables.close(it, true);
    }
}