List of usage examples for org.apache.commons.cli2.commandline Parser Parser
Parser
From source file:org.apache.mahout.ga.watchmaker.cd.CDGA.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = DefaultOptionCreator.inputOption().create(); Option labelOpt = obuilder.withLongName("label").withRequired(true).withShortName("l") .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create()) .withDescription("label's index.").create(); Option thresholdOpt = obuilder.withLongName("threshold").withRequired(false).withShortName("t") .withArgument(abuilder.withName("threshold").withMinimum(1).withMaximum(1).create()) .withDescription("Condition activation threshold, default = 0.5.").create(); Option crosspntsOpt = obuilder.withLongName("crosspnts").withRequired(false).withShortName("cp") .withArgument(abuilder.withName("points").withMinimum(1).withMaximum(1).create()) .withDescription("Number of crossover points to use, default = 1.").create(); Option mutrateOpt = obuilder.withLongName("mutrate").withRequired(true).withShortName("m") .withArgument(abuilder.withName("true").withMinimum(1).withMaximum(1).create()) .withDescription("Mutation rate (float).").create(); Option mutrangeOpt = obuilder.withLongName("mutrange").withRequired(false).withShortName("mr") .withArgument(abuilder.withName("range").withMinimum(1).withMaximum(1).create()) .withDescription("Mutation range, default = 0.1 (10%).").create(); Option mutprecOpt = obuilder.withLongName("mutprec").withRequired(false).withShortName("mp") .withArgument(abuilder.withName("precision").withMinimum(1).withMaximum(1).create()) .withDescription("Mutation precision, default = 2.").create(); Option popsizeOpt = obuilder.withLongName("popsize").withRequired(true).withShortName("p") .withArgument(abuilder.withName("size").withMinimum(1).withMaximum(1).create()) .withDescription("Population size.").create(); Option gencntOpt = obuilder.withLongName("gencnt").withRequired(true).withShortName("g") .withArgument(abuilder.withName("count").withMinimum(1).withMaximum(1).create()) .withDescription("Generations count.").create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).withOption(labelOpt) .withOption(thresholdOpt).withOption(crosspntsOpt).withOption(mutrateOpt).withOption(mutrangeOpt) .withOption(mutprecOpt).withOption(popsizeOpt).withOption(gencntOpt).create(); Parser parser = new Parser(); parser.setGroup(group);//from www. ja v a 2 s. c om try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String dataset = cmdLine.getValue(inputOpt).toString(); int target = Integer.parseInt(cmdLine.getValue(labelOpt).toString()); double threshold = cmdLine.hasOption(thresholdOpt) ? Double.parseDouble(cmdLine.getValue(thresholdOpt).toString()) : 0.5; int crosspnts = cmdLine.hasOption(crosspntsOpt) ? Integer.parseInt(cmdLine.getValue(crosspntsOpt).toString()) : 1; double mutrate = Double.parseDouble(cmdLine.getValue(mutrateOpt).toString()); double mutrange = cmdLine.hasOption(mutrangeOpt) ? Double.parseDouble(cmdLine.getValue(mutrangeOpt).toString()) : 0.1; int mutprec = cmdLine.hasOption(mutprecOpt) ? Integer.parseInt(cmdLine.getValue(mutprecOpt).toString()) : 2; int popSize = Integer.parseInt(cmdLine.getValue(popsizeOpt).toString()); int genCount = Integer.parseInt(cmdLine.getValue(gencntOpt).toString()); long start = System.currentTimeMillis(); runJob(dataset, target, threshold, crosspnts, mutrate, mutrange, mutprec, popSize, genCount); long end = System.currentTimeMillis(); printElapsedTime(end - start); } catch (OptionException e) { log.error("Error while parsing options", e); CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.ga.watchmaker.cd.tool.CDInfosTool.java
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = DefaultOptionCreator.inputOption().create(); Option helpOpt = DefaultOptionCreator.helpOption(); Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create(); Parser parser = new Parser(); parser.setGroup(group);/*from w ww.ja v a 2 s. c o m*/ try { CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } Path input = new Path(cmdLine.getValue(inputOpt).toString()); Path output = new Path("output"); // TODO surely this should be configurable? FileSystem fs = FileSystem.get(input.toUri(), new Configuration()); log.info("Loading Descriptors..."); Descriptors descriptors = loadDescriptors(fs, input); log.info("Gathering informations..."); List<String> descriptions = Lists.newArrayList(); gatherInfos(descriptors, input, output, descriptions); log.info("Storing Descriptions..."); storeDescriptions(fs, input, descriptors, descriptions); } catch (OptionException e) { log.error("Error while parsing options", e); CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.knn.tools.TestNewsGroupsKMeanLogisticRegression.java
boolean parseArgs(String[] args) { DefaultOptionBuilder builder = new DefaultOptionBuilder(); Option help = builder.withLongName("help").withDescription("print this list").create(); ArgumentBuilder argumentBuilder = new ArgumentBuilder(); Option inputFileOption = builder.withLongName("input").withShortName("i").withRequired(true) .withArgument(argumentBuilder.withName("input").withMaximum(1).create()) .withDescription("where to get test data (encoded with tf-idf)").create(); Option modelFileOption = builder.withLongName("model").withShortName("m").withRequired(true) .withArgument(argumentBuilder.withName("model").withMaximum(1).create()) .withDescription("where to get a model").create(); Option centroidsFileOption = builder.withLongName("centroids").withShortName("c").withRequired(true) .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create()) .withDescription("where to get the centroids seqfile").create(); Option labelFileOption = builder.withLongName("labels").withShortName("l").withRequired(true) .withArgument(argumentBuilder.withName("labels").withMaximum(1).create()) .withDescription("CSV file containing the cluster labels").create(); Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFileOption) .withOption(modelFileOption).withOption(centroidsFileOption).withOption(labelFileOption).create(); Parser parser = new Parser(); parser.setHelpOption(help);//from w w w . j a v a 2s.c o m parser.setHelpTrigger("--help"); parser.setGroup(normalArgs); parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130)); CommandLine cmdLine = parser.parseAndHelp(args); if (cmdLine == null) { return false; } inputFile = (String) cmdLine.getValue(inputFileOption); modelFile = (String) cmdLine.getValue(modelFileOption); centroidsFile = (String) cmdLine.getValue(centroidsFileOption); labelFile = (String) cmdLine.getValue(labelFileOption); return true; }
From source file:org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob.java
public static Map<String, String> handleArgs(String[] args) { Option eigenInputOpt = buildOption("eigenInput", "ei", "The Path for purported eigenVector input files (SequenceFile<WritableComparable,VectorWritable>.", null);/*www.j av a 2 s . co m*/ Option corpusInputOpt = buildOption("corpusInput", "ci", "The Path for corpus input files (SequenceFile<WritableComparable,VectorWritable>."); Option outOpt = DefaultOptionCreator.outputOption().create(); Option helpOpt = DefaultOptionCreator.helpOption(); Option inMemOpt = buildOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have enough!)", "false"); Option errorOpt = buildOption("maxError", "err", "Maximum acceptable error", "0.05"); Option minEigenValOpt = buildOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector for", "0.0"); GroupBuilder gBuilder = new GroupBuilder().withName("Options").withOption(eigenInputOpt) .withOption(corpusInputOpt).withOption(helpOpt).withOption(outOpt).withOption(inMemOpt) .withOption(errorOpt).withOption(minEigenValOpt); Group group = gBuilder.create(); Map<String, String> argMap = new HashMap<String, String>(); CommandLine cmdLine; try { Parser parser = new Parser(); parser.setGroup(group); cmdLine = parser.parse(args); } catch (OptionException e) { log.error(e.getMessage()); CommandLineUtil.printHelp(group); return null; } if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return argMap; } maybePut(argMap, cmdLine, eigenInputOpt, corpusInputOpt, helpOpt, outOpt, inMemOpt, errorOpt, minEigenValOpt); return argMap; }
From source file:org.apache.mahout.regression.penalizedlinear.Job.java
private static boolean parseJobArgs(String[] args) throws IOException, InterruptedException, ClassNotFoundException { DefaultOptionBuilder builder = new DefaultOptionBuilder(); Option help = builder.withLongName("help").withDescription("print this list").create(); ArgumentBuilder argumentBuilder = new ArgumentBuilder(); Option inputFile = builder.withLongName("input").withRequired(true) .withArgument(argumentBuilder.withName("input").withMaximum(1).create()) .withDescription(// www .ja v a 2s . c om "where to get training data (Mahout sequence file of VectorWritable or white-spaced TEXT file); in each line, the first element is response; rest are predictors.") .create(); Option outputFile = builder.withLongName("output").withRequired(true) .withArgument(argumentBuilder.withName("output").withMaximum(1).create()) .withDescription("where to get results").create(); Option lambda = builder.withLongName("lambda") .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create()) .withDescription("an increasing positive sequence of penalty coefficient, " + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.") .create(); Option alpha = builder.withLongName("alpha") .withArgument( argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create()) .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create(); Option bias = builder.withLongName("bias").withDescription("include a bias term").create(); Option numOfCV = builder.withLongName("numOfCV") .withArgument( argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create()) .withDescription("number of cross validation, the rule of thumb is 5 or 10").create(); Option convert = builder.withLongName("convert") .withDescription( "pre-processing step if the input file is not Mahout sequence files of VectorWritable: " + "converting space-delimited TEXT file containing floating point numbers into " + "Mahout sequence files of VectorWritable suitable for input of Map-Reduce job.") .create(); Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile) .withOption(lambda).withOption(alpha).withOption(bias).withOption(numOfCV).withOption(convert) .create(); Parser parser = new Parser(); parser.setHelpOption(help); parser.setHelpTrigger("--help"); parser.setGroup(normalArgs); parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130)); CommandLine cmdLine = parser.parseAndHelp(args); if (cmdLine == null) { return false; } Path input = new Path((String) cmdLine.getValue(inputFile)); Path output = new Path((String) cmdLine.getValue(outputFile), DIRECTORY_CONTAINING_CONVERTED_INPUT); if (cmdLine.hasOption(convert)) { jobArgs = new String[args.length - 1]; int index = 0; for (int i = 0; i < args.length; ++i) { if (args[i].equals("--convert")) { continue; } jobArgs[index++] = args[i]; if (args[i].equals("--input")) { args[i + 1] = output.toString(); InputDriver.runJob(input, output, "org.apache.mahout.math.RandomAccessSparseVector"); } if (args[i].equals("--output")) { args[i + 1] = (new Path((String) cmdLine.getValue(outputFile), DIRECTORY_CONTAINING_OUTPUT)) .toString(); } } } else { jobArgs = args; } return true; }
From source file:org.apache.mahout.regression.penalizedlinear.LinearCrossValidation.java
private boolean parseArgs(String[] args) { DefaultOptionBuilder builder = new DefaultOptionBuilder(); Option help = builder.withLongName("help").withDescription("print this list").create(); ArgumentBuilder argumentBuilder = new ArgumentBuilder(); Option inputFile = builder.withLongName("input").withRequired(true) .withArgument(argumentBuilder.withName("input").withMaximum(1).create()) .withDescription("where to get training data (CSV or white-spaced TEXT file)").create(); Option outputFile = builder.withLongName("output").withRequired(true) .withArgument(argumentBuilder.withName("output").withMaximum(1).create()) .withDescription("where to get results").create(); Option dependent = builder.withLongName("dependent").withRequired(true) .withArgument(argumentBuilder.withName("dependent").withMinimum(1).withMaximum(1).create()) .withDescription("the dependent features").create(); Option independent = builder.withLongName("independent").withRequired(true) .withArgument(argumentBuilder.withName("independent").create()) .withDescription("the independent features").create(); Option interaction = builder.withLongName("interaction").withRequired(true) .withArgument(argumentBuilder.withName("interaction").withMinimum(0).create()) .withDescription(/*from w w w . j av a 2 s.co m*/ "the interactions of features, the format is: feature1:feature2 (identical features are OK)") .create(); Option bias = builder.withLongName("bias").withDescription("include a bias term").create(); Option lambda = builder.withLongName("lambda") .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create()) .withDescription("an increasing positive sequence of penalty coefficient, " + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.") .create(); Option alpha = builder.withLongName("alpha") .withArgument( argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create()) .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create(); Option numOfCV = builder.withLongName("numOfCV") .withArgument( argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create()) .withDescription("number of cross validation, the rule of thumb is 5 or 10").create(); Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile) .withOption(dependent).withOption(independent).withOption(interaction).withOption(bias) .withOption(lambda).withOption(alpha).withOption(numOfCV).create(); Parser parser = new Parser(); parser.setHelpOption(help); parser.setHelpTrigger("--help"); parser.setGroup(normalArgs); parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130)); CommandLine cmdLine = parser.parseAndHelp(args); if (cmdLine == null) { return false; } parameter = new LinearCrossValidationParameter(); parameter.numOfCV = Integer.parseInt((String) cmdLine.getValue(numOfCV)); parameter.alpha = Float.parseFloat((String) cmdLine.getValue(alpha)); parameter.intercept = cmdLine.hasOption(bias); parameter.dependent = (String) cmdLine.getValue(dependent); String independentString = ""; for (Object x : cmdLine.getValues(independent)) { independentString += x.toString() + ","; } parameter.independent = independentString.substring(0, Math.max(independentString.length() - 1, 0)); String interactionString = ""; for (Object x : cmdLine.getValues(interaction)) { interactionString += x.toString() + ","; } parameter.interaction = interactionString.substring(0, Math.max(interactionString.length() - 1, 0)); if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0 || parameter.numOfCV < 1 || parameter.numOfCV > 20) { log.error( "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numofCV <= 20"); return false; } input = (String) cmdLine.getValue(inputFile); output = (String) cmdLine.getValue(outputFile); return true; }
From source file:org.apache.mahout.regression.penalizedlinear.LinearRegularizePath.java
private boolean parseArgs(String[] args) { DefaultOptionBuilder builder = new DefaultOptionBuilder(); Option help = builder.withLongName("help").withDescription("print this list").create(); ArgumentBuilder argumentBuilder = new ArgumentBuilder(); Option inputFile = builder.withLongName("input").withRequired(true) .withArgument(argumentBuilder.withName("input").withMaximum(1).create()) .withDescription("where to get training data (CSV or white-spaced TEXT file)").create(); Option outputFile = builder.withLongName("output").withRequired(true) .withArgument(argumentBuilder.withName("output").withMaximum(1).create()) .withDescription("where to get results").create(); Option dependent = builder.withLongName("dependent").withRequired(true) .withArgument(argumentBuilder.withName("dependent").withMinimum(1).withMaximum(1).create()) .withDescription("the dependent features").create(); Option independent = builder.withLongName("independent").withRequired(true) .withArgument(argumentBuilder.withName("independent").create()) .withDescription("the independent features").create(); Option interaction = builder.withLongName("interaction").withRequired(true) .withArgument(argumentBuilder.withName("interaction").withMinimum(0).create()) .withDescription(/*ww w .j a va 2s. co m*/ "the interactions of features, the format is: feature1:feature2 (identical features are OK)") .create(); Option bias = builder.withLongName("bias").withDescription("include a bias term").create(); Option lambda = builder.withLongName("lambda") .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create()) .withDescription("an increasing positive sequence of penalty coefficient, " + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.") .create(); Option alpha = builder.withLongName("alpha") .withArgument( argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create()) .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create(); Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile) .withOption(dependent).withOption(independent).withOption(interaction).withOption(bias) .withOption(lambda).withOption(alpha).create(); Parser parser = new Parser(); parser.setHelpOption(help); parser.setHelpTrigger("--help"); parser.setGroup(normalArgs); parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130)); CommandLine cmdLine = parser.parseAndHelp(args); if (cmdLine == null) { return false; } parameter = new LinearRegularizePathParameter(); parameter.numOfCV = 1; parameter.alpha = Float.parseFloat((String) cmdLine.getValue(alpha)); parameter.intercept = cmdLine.hasOption(bias); parameter.dependent = (String) cmdLine.getValue(dependent); String independentString = ""; for (Object x : cmdLine.getValues(independent)) { independentString += x.toString() + ","; } parameter.independent = independentString.substring(0, Math.max(independentString.length() - 1, 0)); String interactionString = ""; for (Object x : cmdLine.getValues(interaction)) { interactionString += x.toString() + ","; } parameter.interaction = interactionString.substring(0, Math.max(interactionString.length() - 1, 0)); if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0 || parameter.numOfCV < 1 || parameter.numOfCV > 20) { log.error( "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numofCV <= 20"); return false; } input = (String) cmdLine.getValue(inputFile); output = (String) cmdLine.getValue(outputFile); return true; }
From source file:org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java
private boolean parseArgs(String[] args) { DefaultOptionBuilder builder = new DefaultOptionBuilder(); Option help = builder.withLongName("help").withDescription("print this list").create(); ArgumentBuilder argumentBuilder = new ArgumentBuilder(); Option inputFile = builder.withLongName("input").withRequired(true) .withArgument(argumentBuilder.withName("input").withMaximum(1).create()) .withDescription(// w w w . j a v a 2 s .co m "where to get training data (Mahout sequence file of VectorWritable); in each line, the first element is response; rest are predictors.") .create(); Option outputFile = builder.withLongName("output").withRequired(true) .withArgument(argumentBuilder.withName("output").withMaximum(1).create()) .withDescription("where to get results").create(); Option lambda = builder.withLongName("lambda") .withArgument(argumentBuilder.withName("lambda").withDefault("0").withMinimum(1).create()) .withDescription("an increasing positive sequence of penalty coefficient, " + "with length n >= 0; if lambda is not specified, the sequence is chosen by algorithm.") .create(); Option alpha = builder.withLongName("alpha") .withArgument( argumentBuilder.withName("alpha").withDefault("1").withMinimum(1).withMaximum(1).create()) .withDescription("the elastic-net coefficient with default value 1 (LASSO)").create(); Option bias = builder.withLongName("bias").withDescription("include a bias term").create(); Option numOfCV = builder.withLongName("numOfCV") .withArgument( argumentBuilder.withName("numOfCV").withDefault("5").withMinimum(0).withMaximum(1).create()) .withDescription("number of cross validation, the rule of thumb is 5 or 10").create(); Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFile).withOption(outputFile) .withOption(lambda).withOption(alpha).withOption(bias).withOption(numOfCV).create(); Parser parser = new Parser(); parser.setHelpOption(help); parser.setHelpTrigger("--help"); parser.setGroup(normalArgs); parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130)); CommandLine cmdLine = parser.parseAndHelp(args); if (cmdLine == null) { return false; } parameter = new PenalizedLinearParameter(); parameter.setNumOfCV(Integer.parseInt((String) cmdLine.getValue(numOfCV))); parameter.setAlpha(Float.parseFloat((String) cmdLine.getValue(alpha))); parameter.setIntercept(cmdLine.hasOption(bias)); if (!processLambda(parameter, cmdLine, lambda) || parameter.alpha < 0.0 || parameter.alpha > 1.0 || parameter.numOfCV < 1 || parameter.numOfCV > 20) { log.error( "please make sure the lambda sequence is positive and increasing, and 0.0 <= alphaValue <= 1.0 and 1 <= numOfCV <= 20"); return false; } input = (String) cmdLine.getValue(inputFile); output = (String) cmdLine.getValue(outputFile); return true; }
From source file:org.apache.mahout.text.SparseVectorsFromSequenceFiles.java
public static void main(String[] args) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputDirOpt = obuilder.withLongName("input").withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("input dir containing the documents in sequence file format").withShortName("i") .create();/* www.j a v a2 s . c om*/ Option outputDirOpt = obuilder.withLongName("output").withRequired(true) .withArgument(abuilder.withName("output").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory").withShortName("o").create(); Option minSupportOpt = obuilder.withLongName("minSupport") .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()) .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create(); Option analyzerNameOpt = obuilder.withLongName("analyzerName") .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()) .withDescription("The class name of the analyzer").withShortName("a").create(); Option chunkSizeOpt = obuilder.withLongName("chunkSize") .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()) .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create(); Option weightOpt = obuilder.withLongName("weight").withRequired(false) .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create()) .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create(); Option minDFOpt = obuilder.withLongName("minDF").withRequired(false) .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()) .withDescription("The minimum document frequency. Default is 1").withShortName("md").create(); Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false) .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()) .withDescription( "The max percentage of docs for the DF. Can be used to remove really high frequency terms." + " Expressed as an integer between 0 and 100. Default is 99.") .withShortName("x").create(); Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false) .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()) .withDescription("(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR) .withShortName("ml").create(); Option numReduceTasksOpt = obuilder.withLongName("numReducers") .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()) .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr") .create(); Option powerOpt = obuilder.withLongName("norm").withRequired(false) .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create()) .withDescription( "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. " + "Must be greater or equal to 0. The default is not to normalize") .withShortName("n").create(); Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false) .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create()) .withDescription("(Optional) The maximum size of ngrams to create" + " (2 = bigrams, 3 = trigrams, etc) Default Value:1") .withShortName("ng").create(); Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false) .withDescription( "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false") .withShortName("seq").create(); Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false) .withDescription("If set, overwrite the output directory").withShortName("ow").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create(); Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt) .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt) .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt) .withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput) .withOption(helpOpt).withOption(sequentialAccessVectorOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt)); Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt)); int chunkSize = 100; if (cmdLine.hasOption(chunkSizeOpt)) { chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt)); } int minSupport = 2; if (cmdLine.hasOption(minSupportOpt)) { String minSupportString = (String) cmdLine.getValue(minSupportOpt); minSupport = Integer.parseInt(minSupportString); } int maxNGramSize = 1; if (cmdLine.hasOption(maxNGramSizeOpt)) { try { maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString()); } catch (NumberFormatException ex) { log.warn("Could not parse ngram size option"); } } log.info("Maximum n-gram size is: {}", maxNGramSize); if (cmdLine.hasOption(overwriteOutput)) { HadoopUtil.overwriteOutput(outputDir); } float minLLRValue = LLRReducer.DEFAULT_MIN_LLR; if (cmdLine.hasOption(minLLROpt)) { minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString()); } log.info("Minimum LLR value: {}", minLLRValue); int reduceTasks = 1; if (cmdLine.hasOption(numReduceTasksOpt)) { reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString()); } log.info("Number of reduce tasks: {}", reduceTasks); Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class; if (cmdLine.hasOption(analyzerNameOpt)) { String className = cmdLine.getValue(analyzerNameOpt).toString(); analyzerClass = (Class<? extends Analyzer>) Class.forName(className); // try instantiating it, b/c there isn't any point in setting it if // you can't instantiate it analyzerClass.newInstance(); } boolean processIdf; if (cmdLine.hasOption(weightOpt)) { String wString = cmdLine.getValue(weightOpt).toString(); if (wString.equalsIgnoreCase("tf")) { processIdf = false; } else if (wString.equalsIgnoreCase("tfidf")) { processIdf = true; } else { throw new OptionException(weightOpt); } } else { processIdf = true; } int minDf = 1; if (cmdLine.hasOption(minDFOpt)) { minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString()); } int maxDFPercent = 99; if (cmdLine.hasOption(maxDFPercentOpt)) { maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()); } float norm = PartialVectorMerger.NO_NORMALIZING; if (cmdLine.hasOption(powerOpt)) { String power = cmdLine.getValue(powerOpt).toString(); if (power.equals("INF")) { norm = Float.POSITIVE_INFINITY; } else { norm = Float.parseFloat(power); } } HadoopUtil.overwriteOutput(outputDir); Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath); boolean sequentialAccessOutput = false; if (cmdLine.hasOption(sequentialAccessVectorOpt)) { sequentialAccessOutput = true; } DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput); if (processIdf) { TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), new Path(outputDir, TFIDFConverter.TFIDF_OUTPUT_FOLDER), chunkSize, minDf, maxDFPercent, norm, sequentialAccessOutput, reduceTasks); } } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } }
From source file:org.apache.mahout.text.wikipedia.WikipediaXmlSplitter.java
public static void main(String[] args) throws IOException { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true) .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()) .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d") .create();// www .j av a 2s . co m Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true) .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()) .withDescription("The output directory to place the splits in:\n" + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n" + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n" + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n" + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n") .withShortName("o").create(); Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false) .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create()) .withDescription("Amazon S3 ID key").withShortName("i").create(); Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false) .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create()) .withDescription("Amazon S3 secret key").withShortName("s").create(); Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true) .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()) .withDescription("The Size of the chunk, in megabytes").withShortName("c").create(); Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false) .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create()) .withDescription( "The maximum number of chunks to create. If specified, program will only create a subset of the chunks") .withShortName("n").create(); Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt) .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt) .create(); Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine; try { cmdLine = parser.parse(args); } catch (OptionException e) { log.error("Error while parsing options", e); CommandLineUtil.printHelp(group); return; } Configuration conf = new Configuration(); String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt); String outputDirPath = (String) cmdLine.getValue(outputDirOpt); if (cmdLine.hasOption(s3IdOpt)) { String id = (String) cmdLine.getValue(s3IdOpt); conf.set("fs.s3n.awsAccessKeyId", id); conf.set("fs.s3.awsAccessKeyId", id); } if (cmdLine.hasOption(s3SecretOpt)) { String secret = (String) cmdLine.getValue(s3SecretOpt); conf.set("fs.s3n.awsSecretAccessKey", secret); conf.set("fs.s3.awsSecretAccessKey", secret); } // do not compute crc file when using local FS conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem"); FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf); int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt)); int numChunks = Integer.MAX_VALUE; if (cmdLine.hasOption(numChunksOpt)) { numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt)); } String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" " + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" " + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ " + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n" + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n" + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n" + " <generator>MediaWiki 1.13alpha</generator>\n" + " <case>first-letter</case>\n" + " <namespaces>\n" + " <namespace key=\"-2\">Media</namespace>\n" + " <namespace key=\"-1\">Special</namespace>\n" + " <namespace key=\"0\" />\n" + " <namespace key=\"1\">Talk</namespace>\n" + " <namespace key=\"2\">User</namespace>\n" + " <namespace key=\"3\">User talk</namespace>\n" + " <namespace key=\"4\">Wikipedia</namespace>\n" + " <namespace key=\"5\">Wikipedia talk</namespace>\n" + " <namespace key=\"6\">Image</namespace>\n" + " <namespace key=\"7\">Image talk</namespace>\n" + " <namespace key=\"8\">MediaWiki</namespace>\n" + " <namespace key=\"9\">MediaWiki talk</namespace>\n" + " <namespace key=\"10\">Template</namespace>\n" + " <namespace key=\"11\">Template talk</namespace>\n" + " <namespace key=\"12\">Help</namespace>\n" + " <namespace key=\"13\">Help talk</namespace>\n" + " <namespace key=\"14\">Category</namespace>\n" + " <namespace key=\"15\">Category talk</namespace>\n" + " <namespace key=\"100\">Portal</namespace>\n" + " <namespace key=\"101\">Portal talk</namespace>\n" + " </namespaces>\n" + " </siteinfo>\n"; StringBuilder content = new StringBuilder(); content.append(header); NumberFormat decimalFormatter = new DecimalFormat("0000"); File dumpFile = new File(dumpFilePath); // If the specified path for the input file is incorrect, return immediately if (!dumpFile.exists()) { log.error("Input file path {} doesn't exist", dumpFilePath); return; } FileLineIterator it; if (dumpFilePath.endsWith(".bz2")) { // default compression format from http://download.wikimedia.org CompressionCodec codec = new BZip2Codec(); it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile))); } else { // assume the user has previously de-compressed the dump file it = new FileLineIterator(dumpFile); } int fileNumber = 0; while (it.hasNext()) { String thisLine = it.next(); if (thisLine.trim().startsWith("<page>")) { boolean end = false; while (!thisLine.trim().startsWith("</page>")) { content.append(thisLine).append('\n'); if (it.hasNext()) { thisLine = it.next(); } else { end = true; break; } } content.append(thisLine).append('\n'); if (content.length() > chunkSize || end) { content.append("</mediawiki>"); fileNumber++; String filename = outputDirPath + "/chunk-" + decimalFormatter.format(fileNumber) + ".xml"; BufferedWriter chunkWriter = new BufferedWriter( new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8")); try { chunkWriter.write(content.toString(), 0, content.length()); } finally { Closeables.close(chunkWriter, false); } if (fileNumber >= numChunks) { break; } content = new StringBuilder(); content.append(header); } } } }