List of usage examples for org.apache.commons.cli2 CommandLine getValue
Object getValue(final Option option) throws IllegalStateException;
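Before the full examples, here is a minimal sketch of the pattern they all share: declare an Option with the builders, parse the arguments with Parser, guard with hasOption(), then cast the Object returned by getValue(). This sketch is not taken from any of the source files below; the "--input" option name is illustrative, and it assumes the standard commons-cli2 packages (org.apache.commons.cli2.*).

// Minimal, hypothetical usage sketch of CommandLine.getValue(Option).
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;

public class GetValueSketch {
  public static void main(String[] args) {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    // Declare an option that takes exactly one argument value.
    Option inputOpt = obuilder.withLongName("input").withShortName("i").withRequired(true)
        .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
        .withDescription("Input path").create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    try {
      CommandLine cmdLine = parser.parse(args);
      // getValue() returns Object, so callers cast to the expected type;
      // guard with hasOption() when the option is not required.
      if (cmdLine.hasOption(inputOpt)) {
        String input = (String) cmdLine.getValue(inputOpt);
        System.out.println("input = " + input);
      }
    } catch (OptionException e) {
      System.err.println(e.getMessage());
    }
  }
}

Every example below follows this same parse / hasOption / getValue flow; they differ mainly in the options declared and in how the returned String is converted (Integer.parseInt, new Path, Charset.forName, and so on).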
From source file:org.apache.mahout.classifier.bayes.TrainClassifier.java
public static void main(String[] args) throws Exception {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option helpOpt = DefaultOptionCreator.helpOption();
  Option inputDirOpt = DefaultOptionCreator.inputOption().create();
  Option outputOpt = DefaultOptionCreator.outputOption().create();
  Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false)
      .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
      .withDescription("Size of the n-gram. Default Value: 1 ").withShortName("ng").create();
  Option minDfOpt = obuilder.withLongName("minDf").withRequired(false)
      .withArgument(abuilder.withName("minDf").withMinimum(1).withMaximum(1).create())
      .withDescription("Minimum Term Document Frequency: 1 ").withShortName("mf").create();
  Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false)
      .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
      .withDescription("Minimum Support (Term Frequency): 1 ").withShortName("ms").create();
  Option alphaOpt = obuilder.withLongName("alpha").withRequired(false)
      .withArgument(abuilder.withName("a").withMinimum(1).withMaximum(1).create())
      .withDescription("Smoothing parameter Default Value: 1.0").withShortName("a").create();
  Option typeOpt = obuilder.withLongName("classifierType").withRequired(false)
      .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
      .withDescription("Type of classifier: bayes|cbayes. Default: bayes").withShortName("type").create();
  Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(false)
      .withArgument(abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
      .withDescription("Location of model: hdfs. Default Value: hdfs").withShortName("source").create();
  Option skipCleanupOpt = obuilder.withLongName("skipCleanup").withRequired(false)
      .withDescription("Skip cleanup of feature extraction output").withShortName("sc").create();
  Option compressOpt = obuilder.withLongName("compress").withRequired(false)
      .withArgument(abuilder.withName("compress").withDefault("0").withMinimum(0).withMaximum(1).create())
      .withDescription("True if the output should be compressed. Default is false").withShortName("comp").create();
  Option compressCodecOpt = obuilder.withLongName("codec").withRequired(false)
      .withArgument(abuilder.withName("codec").withDefault("org.apache.hadoop.io.compress.DefaultCodec")
          .withMinimum(0).withMaximum(1).create())
      .withDescription("Compress codec Default Value: org.apache.hadoop.io.compress.DefaultCodec")
      .withShortName("co").create();

  Group group = gbuilder.withName("Options").withOption(gramSizeOpt).withOption(helpOpt)
      .withOption(inputDirOpt).withOption(outputOpt).withOption(typeOpt).withOption(dataSourceOpt)
      .withOption(alphaOpt).withOption(minDfOpt).withOption(minSupportOpt).withOption(skipCleanupOpt)
      .withOption(compressOpt).withOption(compressCodecOpt).create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    parser.setHelpOption(helpOpt);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return;
    }
    String classifierType = (String) cmdLine.getValue(typeOpt);
    String dataSourceType = (String) cmdLine.getValue(dataSourceOpt);

    BayesParameters params = new BayesParameters();
    // Setting all the default parameter values
    params.setGramSize(1);
    params.setMinDF(1);
    params.set("alpha_i", "1.0");
    params.set("dataSource", "hdfs");

    if (cmdLine.hasOption(gramSizeOpt)) {
      params.setGramSize(Integer.parseInt((String) cmdLine.getValue(gramSizeOpt)));
    }
    if (cmdLine.hasOption(minDfOpt)) {
      params.setMinDF(Integer.parseInt((String) cmdLine.getValue(minDfOpt)));
    }
    if (cmdLine.hasOption(minSupportOpt)) {
      params.setMinSupport(Integer.parseInt((String) cmdLine.getValue(minSupportOpt)));
    }
    if (cmdLine.hasOption(skipCleanupOpt)) {
      params.setSkipCleanup(true);
    }
    if (cmdLine.hasOption(alphaOpt)) {
      params.set("alpha_i", (String) cmdLine.getValue(alphaOpt));
    }
    if (cmdLine.hasOption(dataSourceOpt)) {
      params.set("dataSource", dataSourceType);
    }
    if (cmdLine.hasOption(compressOpt) && cmdLine.getValue(compressOpt).toString().equals("1")) {
      params.set("compress", "true");
    } else {
      params.set("compress", "false");
    }
    if (cmdLine.hasOption(compressCodecOpt)) {
      params.set("codec", (String) cmdLine.getValue(compressCodecOpt));
    }

    Path inputPath = new Path((String) cmdLine.getValue(inputDirOpt));
    Path outputPath = new Path((String) cmdLine.getValue(outputOpt));
    if ("cbayes".equalsIgnoreCase(classifierType)) {
      log.info("Training Complementary Bayes Classifier");
      trainCNaiveBayes(inputPath, outputPath, params);
    } else {
      log.info("Training Bayes Classifier");
      // setup the HDFS and copy the files there, then run the trainer
      trainNaiveBayes(inputPath, outputPath, params);
    }
  } catch (OptionException e) {
    log.error("Error while parsing options", e);
    CommandLineUtil.printHelp(group);
  }
}
From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver.java
/**
 * Takes in two arguments:
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a
 * {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 */
public static void main(String[] args) throws IOException, InterruptedException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
  Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
  Option categoriesOpt = obuilder.withLongName("categories").withRequired(true)
      .withArgument(abuilder.withName("categories").withMinimum(1).withMaximum(1).create())
      .withDescription("Location of the categories file. One entry per line. "
          + "Will be used to make a string match in Wikipedia Category field")
      .withShortName("c").create();
  Option exactMatchOpt = obuilder.withLongName("exactMatch")
      .withDescription("If set, then the category name must exactly match the "
          + "entry in the categories file. Default is false")
      .withShortName("e").create();
  Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false)
      .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
      .withDescription("The analyzer to use, must have a no argument constructor").withShortName("a").create();
  Option helpOpt = DefaultOptionCreator.helpOption();

  Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
      .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt)
      .create();

  Parser parser = new Parser();
  parser.setGroup(group);
  try {
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption(helpOpt)) {
      CommandLineUtil.printHelp(group);
      return;
    }
    String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
    String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
    String catFile = (String) cmdLine.getValue(categoriesOpt);
    Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
    if (cmdLine.hasOption(analyzerOpt)) {
      String className = cmdLine.getValue(analyzerOpt).toString();
      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
      // try instantiating it, b/c there isn't any point in setting it if
      // you can't instantiate it
      ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
    }
    runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
  } catch (OptionException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  } catch (ClassNotFoundException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  }
}
From source file:org.apache.mahout.classifier.bayes.WikipediaXmlSplitter.java
public static void main(String[] args) throws IOException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
      .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
      .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d").create();
  Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
      .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
      .withDescription("The output directory to place the splits in:\n"
          + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
          + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
          + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
          + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")
      .withShortName("o").create();
  Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
      .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
      .withDescription("Amazon S3 ID key").withShortName("i").create();
  Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
      .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
      .withDescription("Amazon S3 secret key").withShortName("s").create();
  Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
      .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
      .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
  Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
      .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
      .withDescription("The maximum number of chunks to create. "
          + "If specified, program will only create a subset of the chunks")
      .withShortName("n").create();

  Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
      .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create();

  Parser parser = new Parser();
  parser.setGroup(group);
  CommandLine cmdLine;
  try {
    cmdLine = parser.parse(args);
  } catch (OptionException e) {
    log.error("Error while parsing options", e);
    CommandLineUtil.printHelp(group);
    return;
  }

  Configuration conf = new Configuration();
  String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
  String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

  if (cmdLine.hasOption(s3IdOpt)) {
    String id = (String) cmdLine.getValue(s3IdOpt);
    conf.set("fs.s3n.awsAccessKeyId", id);
    conf.set("fs.s3.awsAccessKeyId", id);
  }
  if (cmdLine.hasOption(s3SecretOpt)) {
    String secret = (String) cmdLine.getValue(s3SecretOpt);
    conf.set("fs.s3n.awsSecretAccessKey", secret);
    conf.set("fs.s3.awsSecretAccessKey", secret);
  }
  // do not compute crc file when using local FS
  conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
  FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

  int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
  int numChunks = Integer.MAX_VALUE;
  if (cmdLine.hasOption(numChunksOpt)) {
    numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
  }

  String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
      + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
      + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
      + "http://www.mediawiki.org/xml/export-0.3.xsd\" "
      + "version=\"0.3\" "
      + "xml:lang=\"en\">\n"
      + " <siteinfo>\n"
      + "<sitename>Wikipedia</sitename>\n"
      + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
      + " <generator>MediaWiki 1.13alpha</generator>\n"
      + " <case>first-letter</case>\n"
      + " <namespaces>\n"
      + " <namespace key=\"-2\">Media</namespace>\n"
      + " <namespace key=\"-1\">Special</namespace>\n"
      + " <namespace key=\"0\" />\n"
      + " <namespace key=\"1\">Talk</namespace>\n"
      + " <namespace key=\"2\">User</namespace>\n"
      + " <namespace key=\"3\">User talk</namespace>\n"
      + " <namespace key=\"4\">Wikipedia</namespace>\n"
      + " <namespace key=\"5\">Wikipedia talk</namespace>\n"
      + " <namespace key=\"6\">Image</namespace>\n"
      + " <namespace key=\"7\">Image talk</namespace>\n"
      + " <namespace key=\"8\">MediaWiki</namespace>\n"
      + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
      + " <namespace key=\"10\">Template</namespace>\n"
      + " <namespace key=\"11\">Template talk</namespace>\n"
      + " <namespace key=\"12\">Help</namespace>\n"
      + " <namespace key=\"13\">Help talk</namespace>\n"
      + " <namespace key=\"14\">Category</namespace>\n"
      + " <namespace key=\"15\">Category talk</namespace>\n"
      + " <namespace key=\"100\">Portal</namespace>\n"
      + " <namespace key=\"101\">Portal talk</namespace>\n"
      + " </namespaces>\n"
      + " </siteinfo>\n";

  StringBuilder content = new StringBuilder();
  content.append(header);
  NumberFormat decimalFormatter = new DecimalFormat("0000");
  File dumpFile = new File(dumpFilePath);
  FileLineIterator it;
  if (dumpFilePath.endsWith(".bz2")) {
    // default compression format from http://download.wikimedia.org
    CompressionCodec codec = new BZip2Codec();
    it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
  } else {
    // assume the user has previously de-compressed the dump file
    it = new FileLineIterator(dumpFile);
  }
  int filenumber = 0;
  while (it.hasNext()) {
    String thisLine = it.next();
    if (thisLine.trim().startsWith("<page>")) {
      boolean end = false;
      while (!thisLine.trim().startsWith("</page>")) {
        content.append(thisLine).append('\n');
        if (it.hasNext()) {
          thisLine = it.next();
        } else {
          end = true;
          break;
        }
      }
      content.append(thisLine).append('\n');
      if (content.length() > chunkSize || end) {
        content.append("</mediawiki>");
        filenumber++;
        String filename = outputDirPath + "/chunk-" + decimalFormatter.format(filenumber) + ".xml";
        BufferedWriter chunkWriter = new BufferedWriter(
            new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
        try {
          chunkWriter.write(content.toString(), 0, content.length());
        } finally {
          Closeables.closeQuietly(chunkWriter);
        }
        if (filenumber >= numChunks) {
          break;
        }
        content = new StringBuilder();
        content.append(header);
      }
    }
  }
}
From source file:org.apache.mahout.classifier.BayesFileFormatter.java
/**
 * Run the FileFormatter
 *
 * @param args
 *          The input args. Run with -h to see the help
 * @throws ClassNotFoundException
 *           if the Analyzer can't be found
 * @throws IllegalAccessException
 *           if the Analyzer can't be constructed
 * @throws InstantiationException
 *           if the Analyzer can't be constructed
 * @throws IOException
 *           if the files can't be dealt with properly
 */
public static void main(String[] args) throws Exception {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option inputOpt = DefaultOptionCreator.inputOption().create();
  Option outputOpt = DefaultOptionCreator.outputOption().create();
  Option labelOpt = obuilder.withLongName("label").withRequired(true)
      .withArgument(abuilder.withName("label").withMinimum(1).withMaximum(1).create())
      .withDescription("The label of the file").withShortName("l").create();
  Option analyzerOpt = obuilder.withLongName("analyzer")
      .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
      .withDescription("The fully qualified class name of the analyzer to use. "
          + "Must have a no-arg constructor. Default is the StandardAnalyzer")
      .withShortName("a").create();
  Option charsetOpt = obuilder.withLongName("charset")
      .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
      .withDescription("The character encoding of the input file").withShortName("c").create();
  Option collapseOpt = obuilder.withLongName("collapse").withRequired(true)
      .withArgument(abuilder.withName("collapse").withMinimum(1).withMaximum(1).create())
      .withDescription("Collapse a whole directory to a single file, one doc per line").withShortName("p").create();
  Option helpOpt = DefaultOptionCreator.helpOption();

  Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(labelOpt)
      .withOption(analyzerOpt).withOption(charsetOpt).withOption(collapseOpt).withOption(helpOpt).create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption(helpOpt)) {
      return;
    }
    File input = new File((String) cmdLine.getValue(inputOpt));
    File output = new File((String) cmdLine.getValue(outputOpt));
    String label = (String) cmdLine.getValue(labelOpt);
    Analyzer analyzer;
    if (cmdLine.hasOption(analyzerOpt)) {
      analyzer = ClassUtils.instantiateAs((String) cmdLine.getValue(analyzerOpt), Analyzer.class);
    } else {
      analyzer = new StandardAnalyzer(Version.LUCENE_31);
    }
    Charset charset = Charsets.UTF_8;
    if (cmdLine.hasOption(charsetOpt)) {
      charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
    }
    boolean collapse = cmdLine.hasOption(collapseOpt);
    if (collapse) {
      collapse(label, analyzer, input, charset, output);
    } else {
      format(label, analyzer, input, charset, output);
    }
  } catch (OptionException e) {
    log.error("Exception", e);
  }
}
From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.BuildModel.java
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Data path").create();
  Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
      .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
      .withDescription("The path of the file descriptor of the dataset").create();
  Option timeOpt = obuilder.withLongName("time").withShortName("tm").withRequired(false)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Time path").create();
  Option outputOpt = obuilder.withLongName("output").withShortName("o").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Output path, will contain the Decision Forest").create();
  Option labelsOpt = obuilder.withLongName("labels").withShortName("l").withRequired(true)
      .withArgument(abuilder.withName("labels").withMinimum(1).withMaximum(1).create())
      .withDescription("Number of Labels").create();
  Option combinationTypeOpt = obuilder.withLongName("combinationType").withShortName("t").withRequired(true)
      .withArgument(abuilder.withName("combinationType").withMinimum(1).withMaximum(1).create())
      .withDescription("T-norm for the computation of the compatibility degree").create();
  Option rule_weightOpt = obuilder.withLongName("rule_weight").withShortName("r").withRequired(true)
      .withArgument(abuilder.withName("rule_weight").withMinimum(1).withMaximum(1).create())
      .withDescription("Rule Weight").create();
  Option fuzzy_r_mOpt = obuilder.withLongName("fuzzy_r_m").withShortName("f").withRequired(true)
      .withArgument(abuilder.withName("fuzzy_r_m").withMinimum(1).withMaximum(1).create())
      .withDescription("Fuzzy Reasoning Method").create();
  Option helpOpt = obuilder.withLongName("help").withShortName("h").withDescription("Print out help").create();

  Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(timeOpt)
      .withOption(outputOpt).withOption(labelsOpt).withOption(combinationTypeOpt)
      .withOption(rule_weightOpt).withOption(fuzzy_r_mOpt).withOption(helpOpt).create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption("help")) {
      CommandLineUtil.printHelp(group);
      return -1;
    }

    dataName = cmdLine.getValue(dataOpt).toString();
    String datasetName = cmdLine.getValue(datasetOpt).toString();
    String outputName = cmdLine.getValue(outputOpt).toString();
    nLabels = Integer.parseInt(cmdLine.getValue(labelsOpt).toString());
    String combinationType_aux = cmdLine.getValue(combinationTypeOpt).toString();
    String ruleWeight_aux = cmdLine.getValue(rule_weightOpt).toString();
    String inferenceType_aux = cmdLine.getValue(fuzzy_r_mOpt).toString();
    if (cmdLine.hasOption(timeOpt)) {
      buildTimeIsStored = true;
      timeName = cmdLine.getValue(timeOpt).toString();
    }

    if (log.isDebugEnabled()) {
      log.debug("data : {}", dataName);
      log.debug("dataset : {}", datasetName);
      log.debug("output : {}", outputName);
      log.debug("labels : {}", nLabels);
      log.debug("t_norm : {}", combinationType_aux);
      log.debug("rule_weight : {}", ruleWeight_aux);
      log.debug("fuzzy_r_m : {}", inferenceType_aux);
      log.debug("time : {}", timeName);
    }

    dataPath = new Path(dataName);
    datasetPath = new Path(datasetName);
    outputPath = new Path(outputName);
    if (buildTimeIsStored)
      timePath = new Path(timeName);

    combinationType = PRODUCT;
    if (combinationType_aux.compareToIgnoreCase("minimum") == 0) {
      combinationType = MINIMUM;
    }
    ruleWeight = PCF_IV;
    if (ruleWeight_aux.compareToIgnoreCase("Certainty_Factor") == 0) {
      ruleWeight = CF;
    } else if (ruleWeight_aux.compareToIgnoreCase("Average_Penalized_Certainty_Factor") == 0) {
      ruleWeight = PCF_II;
    } else if (ruleWeight_aux.compareToIgnoreCase("No_Weights") == 0) {
      ruleWeight = NO_RW;
    }
    inferenceType = WINNING_RULE;
    if (inferenceType_aux.compareToIgnoreCase("Additive_Combination") == 0) {
      inferenceType = ADDITIVE_COMBINATION;
    }
  } catch (OptionException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
    return -1;
  }

  buildModel();
  return 0;
}
From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.TestModel.java
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option inputOpt = DefaultOptionCreator.inputOption().create();
  Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
      .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
      .withDescription("Dataset path").create();
  Option modelOpt = obuilder.withLongName("model").withShortName("m").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Path to the Model").create();
  Option outputOpt = DefaultOptionCreator.outputOption().create();
  Option helpOpt = DefaultOptionCreator.helpOption();

  Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(datasetOpt).withOption(modelOpt)
      .withOption(outputOpt).withOption(helpOpt).create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption("help")) {
      CommandLineUtil.printHelp(group);
      return -1;
    }

    dataName = cmdLine.getValue(inputOpt).toString();
    String datasetName = cmdLine.getValue(datasetOpt).toString();
    String modelName = cmdLine.getValue(modelOpt).toString();
    String outputName = cmdLine.hasOption(outputOpt) ? cmdLine.getValue(outputOpt).toString() : null;

    if (log.isDebugEnabled()) {
      log.debug("input : {}", dataName);
      log.debug("dataset : {}", datasetName);
      log.debug("model : {}", modelName);
      log.debug("output : {}", outputName);
    }

    dataPath = new Path(dataName);
    datasetPath = new Path(datasetName);
    modelPath = new Path(modelName);
    if (outputName != null) {
      outputPath = new Path(outputName);
    }
  } catch (OptionException e) {
    log.warn(e.toString(), e);
    CommandLineUtil.printHelp(group);
    return -1;
  }

  time = System.currentTimeMillis();
  testModel();
  time = System.currentTimeMillis() - time;
  writeToFileClassifyTime(Chi_RWCSUtils.elapsedTime(time));

  return 0;
}
From source file:org.apache.mahout.classifier.Classify.java
public static void main(String[] args) throws Exception {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option pathOpt = obuilder.withLongName("path").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("The local file system path").withShortName("m").create();
  Option classifyOpt = obuilder.withLongName("classify").withRequired(true)
      .withArgument(abuilder.withName("classify").withMinimum(1).withMaximum(1).create())
      .withDescription("The doc to classify").withShortName("").create();
  Option encodingOpt = obuilder.withLongName("encoding").withRequired(true)
      .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
      .withDescription("The file encoding. Default: UTF-8").withShortName("e").create();
  Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(true)
      .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
      .withDescription("The Analyzer to use").withShortName("a").create();
  Option defaultCatOpt = obuilder.withLongName("defaultCat").withRequired(true)
      .withArgument(abuilder.withName("defaultCat").withMinimum(1).withMaximum(1).create())
      .withDescription("The default category").withShortName("d").create();
  Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(true)
      .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
      .withDescription("Size of the n-gram").withShortName("ng").create();
  Option typeOpt = obuilder.withLongName("classifierType").withRequired(true)
      .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
      .withDescription("Type of classifier").withShortName("type").create();
  Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(true)
      .withArgument(abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
      .withDescription("Location of model: hdfs").withShortName("source").create();

  Group options = gbuilder.withName("Options").withOption(pathOpt).withOption(classifyOpt)
      .withOption(encodingOpt).withOption(analyzerOpt).withOption(defaultCatOpt).withOption(gramSizeOpt)
      .withOption(typeOpt).withOption(dataSourceOpt).create();

  Parser parser = new Parser();
  parser.setGroup(options);
  CommandLine cmdLine = parser.parse(args);

  int gramSize = 1;
  if (cmdLine.hasOption(gramSizeOpt)) {
    gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt));
  }

  BayesParameters params = new BayesParameters();
  params.setGramSize(gramSize);
  String modelBasePath = (String) cmdLine.getValue(pathOpt);
  params.setBasePath(modelBasePath);
  log.info("Loading model from: {}", params.print());

  Algorithm algorithm;
  Datastore datastore;
  String classifierType = (String) cmdLine.getValue(typeOpt);
  String dataSource = (String) cmdLine.getValue(dataSourceOpt);
  if ("hdfs".equals(dataSource)) {
    if ("bayes".equalsIgnoreCase(classifierType)) {
      log.info("Using Bayes Classifier");
      algorithm = new BayesAlgorithm();
      datastore = new InMemoryBayesDatastore(params);
    } else if ("cbayes".equalsIgnoreCase(classifierType)) {
      log.info("Using Complementary Bayes Classifier");
      algorithm = new CBayesAlgorithm();
      datastore = new InMemoryBayesDatastore(params);
    } else {
      throw new IllegalArgumentException("Unrecognized classifier type: " + classifierType);
    }
  } else {
    throw new IllegalArgumentException("Unrecognized dataSource type: " + dataSource);
  }

  ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
  classifier.initialize();

  String defaultCat = "unknown";
  if (cmdLine.hasOption(defaultCatOpt)) {
    defaultCat = (String) cmdLine.getValue(defaultCatOpt);
  }
  File docPath = new File((String) cmdLine.getValue(classifyOpt));
  String encoding = "UTF-8";
  if (cmdLine.hasOption(encodingOpt)) {
    encoding = (String) cmdLine.getValue(encodingOpt);
  }
  Analyzer analyzer = null;
  if (cmdLine.hasOption(analyzerOpt)) {
    analyzer = ClassUtils.instantiateAs((String) cmdLine.getValue(analyzerOpt), Analyzer.class);
  }
  if (analyzer == null) {
    analyzer = new StandardAnalyzer(Version.LUCENE_31);
  }

  log.info("Converting input document to proper format");
  String[] document = BayesFileFormatter.readerToDocument(analyzer,
      Files.newReader(docPath, Charset.forName(encoding)));
  StringBuilder line = new StringBuilder();
  for (String token : document) {
    line.append(token).append(' ');
  }
  List<String> doc = new NGrams(line.toString(), gramSize).generateNGramsWithoutLabel();
  log.info("Done converting");

  log.info("Classifying document: {}", docPath);
  ClassifierResult category = classifier.classifyDocument(doc.toArray(new String[doc.size()]), defaultCat);
  log.info("Category for {} is {}", docPath, category);
}
From source file:org.apache.mahout.classifier.df.BreimanExample.java
@Override
public int run(String[] args) throws IOException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Data path").create();
  Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
      .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
      .withDescription("Dataset path").create();
  Option nbtreesOpt = obuilder.withLongName("nbtrees").withShortName("t").withRequired(true)
      .withArgument(abuilder.withName("nbtrees").withMinimum(1).withMaximum(1).create())
      .withDescription("Number of trees to grow, each iteration").create();
  Option nbItersOpt = obuilder.withLongName("iterations").withShortName("i").withRequired(true)
      .withArgument(abuilder.withName("numIterations").withMinimum(1).withMaximum(1).create())
      .withDescription("Number of times to repeat the test").create();
  Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();

  Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(nbItersOpt)
      .withOption(nbtreesOpt).withOption(helpOpt).create();

  Path dataPath;
  Path datasetPath;
  int nbTrees;
  int nbIterations;
  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption("help")) {
      CommandLineUtil.printHelp(group);
      return -1;
    }

    String dataName = cmdLine.getValue(dataOpt).toString();
    String datasetName = cmdLine.getValue(datasetOpt).toString();
    nbTrees = Integer.parseInt(cmdLine.getValue(nbtreesOpt).toString());
    nbIterations = Integer.parseInt(cmdLine.getValue(nbItersOpt).toString());

    dataPath = new Path(dataName);
    datasetPath = new Path(datasetName);
  } catch (OptionException e) {
    log.error("Error while parsing options", e);
    CommandLineUtil.printHelp(group);
    return -1;
  }

  // load the data
  FileSystem fs = dataPath.getFileSystem(new Configuration());
  Dataset dataset = Dataset.load(getConf(), datasetPath);
  Data data = DataLoader.loadData(dataset, fs, dataPath);

  // take m to be the first integer less than log2(M) + 1, where M is the
  // number of inputs
  int m = (int) Math.floor(FastMath.log(2.0, data.getDataset().nbAttributes()) + 1);

  Random rng = RandomUtils.getRandom();
  for (int iteration = 0; iteration < nbIterations; iteration++) {
    log.info("Iteration {}", iteration);
    runIteration(rng, data, m, nbTrees);
  }

  log.info("********************************************");
  log.info("Random Input Test Error : {}", sumTestErrM / nbIterations);
  log.info("Single Input Test Error : {}", sumTestErrOne / nbIterations);
  log.info("Mean Random Input Time : {}", DFUtils.elapsedTime(sumTimeM / nbIterations));
  log.info("Mean Single Input Time : {}", DFUtils.elapsedTime(sumTimeOne / nbIterations));
  log.info("Mean Random Input Num Nodes : {}", numNodesM / nbIterations);
  log.info("Mean Single Input Num Nodes : {}", numNodesOne / nbIterations);

  return 0;
}
From source file:org.apache.mahout.classifier.df.mapreduce.Resampling.java
public int run(String[] args) throws Exception, ClassNotFoundException, InterruptedException {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Data path").create();
  Option dataPreprocessingOpt = obuilder.withLongName("dataPreprocessing").withShortName("dp").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Data Preprocessing path").create();
  Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
      .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
      .withDescription("Dataset path").create();
  Option timeOpt = obuilder.withLongName("time").withShortName("tm").withRequired(false)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Time path").create();
  Option helpOpt = obuilder.withLongName("help").withShortName("h").withDescription("Print out help").create();
  Option resamplingOpt = obuilder.withLongName("resampling").withShortName("rs").withRequired(true)
      .withArgument(abuilder.withName("resampling").withMinimum(1).withMaximum(1).create())
      .withDescription("The resampling technique (oversampling (overs), undersampling (unders) or SMOTE (smote))")
      .create();
  Option nbpartitionsOpt = obuilder.withLongName("nbpartitions").withShortName("p").withRequired(true)
      .withArgument(abuilder.withName("nbpartitions").withMinimum(1).withMaximum(1).create())
      .withDescription("Number of partitions").create();
  Option nposOpt = obuilder.withLongName("npos").withShortName("npos").withRequired(true)
      .withArgument(abuilder.withName("npos").withMinimum(1).withMaximum(1).create())
      .withDescription("Number of instances of the positive class").create();
  Option nnegOpt = obuilder.withLongName("nneg").withShortName("nneg").withRequired(true)
      .withArgument(abuilder.withName("nneg").withMinimum(1).withMaximum(1).create())
      .withDescription("Number of instances of the negative class").create();
  Option negclassOpt = obuilder.withLongName("negclass").withShortName("negclass").withRequired(true)
      .withArgument(abuilder.withName("negclass").withMinimum(1).withMaximum(1).create())
      .withDescription("Name of the negative class").create();
  Option posclassOpt = obuilder.withLongName("posclass").withShortName("posclass").withRequired(true)
      .withArgument(abuilder.withName("posclass").withMinimum(1).withMaximum(1).create())
      .withDescription("Name of the positive class").create();

  Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(timeOpt)
      .withOption(helpOpt).withOption(resamplingOpt).withOption(dataPreprocessingOpt)
      .withOption(nbpartitionsOpt).withOption(nposOpt).withOption(nnegOpt).withOption(negclassOpt)
      .withOption(posclassOpt).create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption("help")) {
      CommandLineUtil.printHelp(group);
      return -1;
    }

    dataName = cmdLine.getValue(dataOpt).toString();
    String datasetName = cmdLine.getValue(datasetOpt).toString();
    dataPreprocessing = cmdLine.getValue(dataPreprocessingOpt).toString();
    String resampling = cmdLine.getValue(resamplingOpt).toString();
    partitions = Integer.parseInt(cmdLine.getValue(nbpartitionsOpt).toString());
    npos = Integer.parseInt(cmdLine.getValue(nposOpt).toString());
    nneg = Integer.parseInt(cmdLine.getValue(nnegOpt).toString());
    negclass = cmdLine.getValue(negclassOpt).toString();
    posclass = cmdLine.getValue(posclassOpt).toString();

    if (resampling.equalsIgnoreCase("overs")) {
      withOversampling = true;
    } else if (resampling.equalsIgnoreCase("unders")) {
      withUndersampling = true;
    } else if (resampling.equalsIgnoreCase("smote")) {
      withSmote = true;
    }
    if (cmdLine.hasOption(timeOpt)) {
      preprocessingTimeIsStored = true;
      timeName = cmdLine.getValue(timeOpt).toString();
    }

    if (log.isDebugEnabled()) {
      log.debug("data : {}", dataName);
      log.debug("dataset : {}", datasetName);
      log.debug("time : {}", timeName);
      log.debug("Oversampling : {}", withOversampling);
      log.debug("Undersampling : {}", withUndersampling);
      log.debug("SMOTE : {}", withSmote);
    }

    dataPath = new Path(dataName);
    datasetPath = new Path(datasetName);
    dataPreprocessingPath = new Path(dataPreprocessing);
    if (preprocessingTimeIsStored)
      timePath = new Path(timeName);
  } catch (OptionException e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
    return -1;
  }

  if (withOversampling) {
    overSampling();
  } else if (withUndersampling) {
    underSampling();
  } else if (withSmote) {
    smote();
  }

  return 0;
}
From source file:org.apache.mahout.classifier.df.tools.ForestVisualizer.java
public static void main(String[] args) {
  DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
  ArgumentBuilder abuilder = new ArgumentBuilder();
  GroupBuilder gbuilder = new GroupBuilder();

  Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
      .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
      .withDescription("Dataset path").create();
  Option modelOpt = obuilder.withLongName("model").withShortName("m").withRequired(true)
      .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
      .withDescription("Path to the Decision Forest").create();
  Option attrNamesOpt = obuilder.withLongName("names").withShortName("n").withRequired(false)
      .withArgument(abuilder.withName("names").withMinimum(1).create())
      .withDescription("Optional, Attribute names").create();
  Option helpOpt = obuilder.withLongName("help").withShortName("h").withDescription("Print out help").create();

  Group group = gbuilder.withName("Options").withOption(datasetOpt).withOption(modelOpt)
      .withOption(attrNamesOpt).withOption(helpOpt).create();

  try {
    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);
    if (cmdLine.hasOption("help")) {
      CommandLineUtil.printHelp(group);
      return;
    }

    String datasetName = cmdLine.getValue(datasetOpt).toString();
    String modelName = cmdLine.getValue(modelOpt).toString();
    String[] attrNames = null;
    if (cmdLine.hasOption(attrNamesOpt)) {
      Collection<String> names = (Collection<String>) cmdLine.getValues(attrNamesOpt);
      if (!names.isEmpty()) {
        attrNames = new String[names.size()];
        names.toArray(attrNames);
      }
    }
    print(modelName, datasetName, attrNames);
  } catch (Exception e) {
    log.error("Exception", e);
    CommandLineUtil.printHelp(group);
  }
}