Example usage for org.apache.commons.cli2 CommandLine hasOption

List of usage examples for org.apache.commons.cli2 CommandLine hasOption

Introduction

On this page you can find example usage for org.apache.commons.cli2 CommandLine hasOption.

Prototype

boolean hasOption(final Option option);

Source Link

Document

Detects the presence of an option in this CommandLine.

Usage

From source file:org.apache.mahout.benchmark.VectorBenchmarks.java

/**
 * Entry point: parses the benchmark configuration from the command line and
 * runs the vector benchmarks, logging the results as a CSV string.
 *
 * @param args command-line arguments; every option is optional and falls back
 *             to the default noted in its description
 * @throws IOException if the benchmark run fails
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option vectorSizeOpt = obuilder.withLongName("vectorSize").withRequired(false)
            .withArgument(abuilder.withName("vs").withDefault(1000000).create())
            .withDescription("Cardinality of the vector. Default: 1000000").withShortName("vs").create();
    // Description fixed: this option controls how many non-zero entries each
    // vector gets, not the vector's size (that is --vectorSize above).
    Option numNonZeroOpt = obuilder.withLongName("numNonZero").withRequired(false)
            .withArgument(abuilder.withName("nz").withDefault(1000).create())
            .withDescription("Number of non-zero entries in the vector. Default: 1000").withShortName("nz")
            .create();
    Option numVectorsOpt = obuilder.withLongName("numVectors").withRequired(false)
            .withArgument(abuilder.withName("nv").withDefault(25).create())
            .withDescription("Number of Vectors to create. Default: 25").withShortName("nv").create();
    Option numClustersOpt = obuilder.withLongName("numClusters").withRequired(false)
            .withArgument(abuilder.withName("nc").withDefault(0).create())
            .withDescription(
                    "Number of clusters to create. Set to non zero to run cluster benchmark. Default: 0")
            .withShortName("nc").create();
    Option numOpsOpt = obuilder.withLongName("numOps").withRequired(false)
            .withArgument(abuilder.withName("numOps").withDefault(10).create())
            .withDescription("Number of operations to do per timer. "
                    + "E.g In distance measure, the distance is calculated numOps times"
                    + " and the total time is measured. Default: 10")
            .withShortName("no").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(vectorSizeOpt).withOption(numNonZeroOpt)
            .withOption(numVectorsOpt).withOption(numOpsOpt).withOption(numClustersOpt).withOption(helpOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelpWithGenericOptions(group);
            return;
        }

        // Defaults mirror the values advertised in the option descriptions;
        // each is overridden only when the option was actually supplied.
        int cardinality = 1000000;
        if (cmdLine.hasOption(vectorSizeOpt)) {
            cardinality = Integer.parseInt((String) cmdLine.getValue(vectorSizeOpt));
        }

        int numClusters = 0;
        if (cmdLine.hasOption(numClustersOpt)) {
            numClusters = Integer.parseInt((String) cmdLine.getValue(numClustersOpt));
        }

        int numNonZero = 1000;
        if (cmdLine.hasOption(numNonZeroOpt)) {
            numNonZero = Integer.parseInt((String) cmdLine.getValue(numNonZeroOpt));
        }

        int numVectors = 25;
        if (cmdLine.hasOption(numVectorsOpt)) {
            numVectors = Integer.parseInt((String) cmdLine.getValue(numVectorsOpt));
        }

        int numOps = 10;
        if (cmdLine.hasOption(numOpsOpt)) {
            numOps = Integer.parseInt((String) cmdLine.getValue(numOpsOpt));
        }

        VectorBenchmarks mark = new VectorBenchmarks(cardinality, numNonZero, numVectors, numClusters, numOps);
        runBenchmark(mark);

        log.info("\n{}", mark.asCsvString());
    } catch (OptionException e) {
        // Bad/unknown arguments: show usage instead of a stack trace.
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.cf.taste.example.TasteOptionParser.java

/**
 * Parse the given command line arguments.
 *
 * @param args the arguments as given to the application.
 * @return the input file if a file was given on the command line, null otherwise.
 * @throws OptionException if the arguments cannot be parsed against the option group.
 */
public static File getRatings(String[] args) throws OptionException {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    GroupBuilder groupBuilder = new GroupBuilder();

    // Optional --input/-i pointing at the ratings data directory.
    Option inputOpt = optionBuilder.withLongName("input").withRequired(false).withShortName("i")
            .withArgument(argumentBuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The Path for input data directory.").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = groupBuilder.withName("Options").withOption(inputOpt).withOption(helpOpt).create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine = parser.parse(args);

    // Help requested: print usage and report "no input file" to the caller.
    if (cmdLine.hasOption(helpOpt)) {
        CommandLineUtil.printHelp(group);
        return null;
    }

    if (cmdLine.hasOption(inputOpt)) {
        return new File(cmdLine.getValue(inputOpt).toString());
    }
    return null;
}

From source file:org.apache.mahout.classifier.bayes.mapreduce.common.JobExecutor.java

/**
 * Execute a bayes classification job. Input and output path are parsed from the input parameters.
 * /*  w ww. j  a  v a 2  s  . c o m*/
 * @param args
 *          input parameters.
 * @param job
 *          the job to execute.
 * @throws Exception
 *           any exception thrown at job execution.
 * */
public static void execute(String[] args, BayesJob job) throws IOException {
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = DefaultOptionCreator.inputOption().create();
    Option outputOpt = DefaultOptionCreator.outputOption().create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(helpOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        Path input = new Path(cmdLine.getValue(inputOpt).toString());
        Path output = new Path(cmdLine.getValue(outputOpt).toString());

        BayesParameters bayesParams = new BayesParameters();
        bayesParams.setGramSize(1);
        job.runJob(input, output, bayesParams);
    } catch (OptionException e) {
        log.error(e.getMessage());
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.classifier.bayes.PrepareTwentyNewsgroups.java

/**
 * Entry point: converts the 20 newsgroups corpus layout (one directory per
 * category under the parent dir) into one output file per category, one
 * document per line, using the supplied Lucene analyzer and input encoding.
 *
 * @param args command-line arguments; --parent, --outputDir, --analyzerName
 *             and --charset are all required
 * @throws Exception if the analyzer cannot be instantiated or an I/O error occurs
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Option parentOpt = obuilder.withLongName("parent").withRequired(true)
            .withArgument(abuilder.withName("parent").withMinimum(1).withMaximum(1).create())
            .withDescription("Parent dir containing the newsgroups").withShortName("p").create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory").withShortName("o").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName").withRequired(true)
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option charsetOpt = obuilder.withLongName("charset").withRequired(true)
            .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
            .withDescription("The name of the character encoding of the input files").withShortName("c")
            .create();

    Group group = gbuilder.withName("Options").withOption(analyzerNameOpt).withOption(charsetOpt)
            .withOption(outputDirOpt).withOption(parentOpt).withOption(helpOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File parentDir = new File((String) cmdLine.getValue(parentOpt));
        File outputDir = new File((String) cmdLine.getValue(outputDirOpt));
        String analyzerName = (String) cmdLine.getValue(analyzerNameOpt);
        Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
        Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
        // parent dir contains dir by category
        if (!parentDir.exists()) {
            throw new FileNotFoundException("Can't find input directory " + parentDir);
        }
        File[] categoryDirs = parentDir.listFiles();
        // listFiles() returns null when parentDir is not a directory or cannot be
        // read; without this guard the for-each below would throw an NPE.
        if (categoryDirs == null) {
            throw new FileNotFoundException("Can't list contents of " + parentDir);
        }
        for (File dir : categoryDirs) {
            if (dir.isDirectory()) {
                if (!outputDir.exists() && !outputDir.mkdirs()) {
                    throw new IllegalStateException("Can't create output directory");
                }

                File outputFile = new File(outputDir, dir.getName() + ".txt");
                BayesFileFormatter.collapse(dir.getName(), analyzer, dir, charset, outputFile);
            }
        }
    } catch (OptionException e) {
        // Bad/unknown arguments: show usage instead of a stack trace.
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.classifier.bayes.TestClassifier.java

/**
 * Entry point: parses classifier-test options, fills a {@code BayesParameters}
 * with the requested (or default) settings and then runs the classification
 * test either sequentially or as a map/reduce job.
 *
 * @param args command-line arguments; --model (-m) and --testDir (-d) are required
 * @throws IOException if reading the model or the test documents fails
 * @throws InvalidDatastoreException if the model datastore cannot be opened
 */
public static void main(String[] args) throws IOException, InvalidDatastoreException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option pathOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
            .withDescription("The path on HDFS as defined by the -source parameter").withShortName("m")
            .create();

    Option dirOpt = obuilder.withLongName("testDir").withRequired(true)
            .withArgument(abuilder.withName("testDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory where test documents resides in").withShortName("d").create();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option encodingOpt = obuilder.withLongName("encoding")
            .withArgument(abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
            .withDescription("The file encoding.  Defaults to UTF-8").withShortName("e").create();

    Option defaultCatOpt = obuilder.withLongName("defaultCat")
            .withArgument(abuilder.withName("defaultCat").withMinimum(1).withMaximum(1).create())
            .withDescription("The default category Default Value: unknown").withShortName("default").create();

    Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Size of the n-gram. Default Value: 1").withShortName("ng").create();

    Option alphaOpt = obuilder.withLongName("alpha").withRequired(false)
            .withArgument(abuilder.withName("a").withMinimum(1).withMaximum(1).create())
            .withDescription("Smoothing parameter Default Value: 1.0").withShortName("a").create();

    // Flag option (no argument): presence alone enables verbose output.
    Option verboseOutputOpt = obuilder.withLongName("verbose").withRequired(false)
            .withDescription("Output which values were correctly and incorrectly classified").withShortName("v")
            .create();

    Option typeOpt = obuilder.withLongName("classifierType").withRequired(false)
            .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
            .withDescription("Type of classifier: bayes|cbayes. Default Value: bayes").withShortName("type")
            .create();

    Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(false)
            .withArgument(abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of model: hdfs").withShortName("source").create();

    Option methodOpt = obuilder.withLongName("method").withRequired(false)
            .withArgument(abuilder.withName("method").withMinimum(1).withMaximum(1).create())
            .withDescription("Method of Classification: sequential|mapreduce. Default Value: mapreduce")
            .withShortName("method").create();

    Option confusionMatrixOpt = obuilder.withLongName("confusionMatrix").withRequired(false)
            .withArgument(abuilder.withName("confusionMatrix").withMinimum(1).withMaximum(1).create())
            .withDescription("Export ConfusionMatrix as SequenceFile").withShortName("cm").create();

    Group group = gbuilder.withName("Options").withOption(defaultCatOpt).withOption(dirOpt)
            .withOption(encodingOpt).withOption(gramSizeOpt).withOption(pathOpt).withOption(typeOpt)
            .withOption(dataSourceOpt).withOption(helpOpt).withOption(methodOpt).withOption(verboseOutputOpt)
            .withOption(alphaOpt).withOption(confusionMatrixOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        BayesParameters params = new BayesParameters();
        // Setting all default values
        int gramSize = 1;

        String modelBasePath = (String) cmdLine.getValue(pathOpt);

        // Each optional setting below starts from its documented default and is
        // overridden only when the corresponding option was supplied.
        if (cmdLine.hasOption(gramSizeOpt)) {
            gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt));

        }

        String classifierType = "bayes";
        if (cmdLine.hasOption(typeOpt)) {
            classifierType = (String) cmdLine.getValue(typeOpt);
        }

        String dataSource = "hdfs";
        if (cmdLine.hasOption(dataSourceOpt)) {
            dataSource = (String) cmdLine.getValue(dataSourceOpt);
        }

        String defaultCat = "unknown";
        if (cmdLine.hasOption(defaultCatOpt)) {
            defaultCat = (String) cmdLine.getValue(defaultCatOpt);
        }

        String encoding = "UTF-8";
        if (cmdLine.hasOption(encodingOpt)) {
            encoding = (String) cmdLine.getValue(encodingOpt);
        }

        String alphaI = "1.0";
        if (cmdLine.hasOption(alphaOpt)) {
            alphaI = (String) cmdLine.getValue(alphaOpt);
        }

        boolean verbose = cmdLine.hasOption(verboseOutputOpt);

        String testDirPath = (String) cmdLine.getValue(dirOpt);

        String classificationMethod = "mapreduce";
        if (cmdLine.hasOption(methodOpt)) {
            classificationMethod = (String) cmdLine.getValue(methodOpt);
        }

        // null means "do not export the confusion matrix".
        String confusionMatrixFile = null;
        if (cmdLine.hasOption(confusionMatrixOpt)) {
            confusionMatrixFile = (String) cmdLine.getValue(confusionMatrixOpt);
        }

        params.setGramSize(gramSize);
        params.set("verbose", Boolean.toString(verbose));
        params.setBasePath(modelBasePath);
        params.set("classifierType", classifierType);
        params.set("dataSource", dataSource);
        params.set("defaultCat", defaultCat);
        params.set("encoding", encoding);
        params.set("alpha_i", alphaI);
        params.set("testDirPath", testDirPath);
        params.set("confusionMatrix", confusionMatrixFile);

        // Only "sequential" and "mapreduce" are recognized; any other value
        // falls through silently and nothing is run.
        if ("sequential".equalsIgnoreCase(classificationMethod)) {
            classifySequential(params);
        } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) {
            classifyParallel(params);
        }
    } catch (OptionException e) {
        // Bad/unknown arguments: show usage instead of a stack trace.
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.classifier.bayes.TrainClassifier.java

/**
 * Entry point: parses training options, assembles a {@code BayesParameters}
 * and trains either a complementary ("cbayes") or a standard naive Bayes model.
 *
 * @param args command-line arguments; input and output paths are required
 * @throws Exception if option parsing or training fails
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Size of the n-gram. Default Value: 1 ").withShortName("ng").create();

    Option minDfOpt = obuilder.withLongName("minDf").withRequired(false)
            .withArgument(abuilder.withName("minDf").withMinimum(1).withMaximum(1).create())
            .withDescription("Minimum Term Document Frequency: 1 ").withShortName("mf").create();

    Option minSupportOpt = obuilder.withLongName("minSupport").withRequired(false)
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("Minimum Support (Term Frequency): 1 ").withShortName("ms").create();

    Option alphaOpt = obuilder.withLongName("alpha").withRequired(false)
            .withArgument(abuilder.withName("a").withMinimum(1).withMaximum(1).create())
            .withDescription("Smoothing parameter Default Value: 1.0").withShortName("a").create();

    Option typeOpt = obuilder.withLongName("classifierType").withRequired(false)
            .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
            .withDescription("Type of classifier: bayes|cbayes. Default: bayes").withShortName("type").create();

    Option dataSourceOpt = obuilder.withLongName("dataSource").withRequired(false)
            .withArgument(abuilder.withName("dataSource").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of model: hdfs. Default Value: hdfs").withShortName("source").create();

    // Flag option (no argument): presence alone skips cleanup.
    Option skipCleanupOpt = obuilder.withLongName("skipCleanup").withRequired(false)
            .withDescription("Skip cleanup of feature extraction output").withShortName("sc").create();

    Option compressOpt = obuilder.withLongName("compress").withRequired(false)
            .withArgument(abuilder.withName("compress").withDefault("0").withMinimum(0).withMaximum(1).create())
            .withDescription("True if the output should be compressed. Default is false").withShortName("comp")
            .create();

    Option compressCodecOpt = obuilder.withLongName("codec").withRequired(false)
            .withArgument(abuilder.withName("codec").withDefault("org.apache.hadoop.io.compress.DefaultCodec")
                    .withMinimum(0).withMaximum(1).create())
            .withDescription("Compress codec Default Value: org.apache.hadoop.io.compress.DefaultCodec")
            .withShortName("co").create();

    Group group = gbuilder.withName("Options").withOption(gramSizeOpt).withOption(helpOpt)
            .withOption(inputDirOpt).withOption(outputOpt).withOption(typeOpt).withOption(dataSourceOpt)
            .withOption(alphaOpt).withOption(minDfOpt).withOption(minSupportOpt).withOption(skipCleanupOpt)
            .withOption(compressOpt).withOption(compressCodecOpt).create();
    try {
        Parser parser = new Parser();

        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        // NOTE(review): typeOpt/dataSourceOpt declare no argument default, so
        // these may be null when the options are omitted — classifierType is
        // only used via null-safe equalsIgnoreCase, and dataSourceType only
        // under the hasOption(dataSourceOpt) guard below. Confirm intended.
        String classifierType = (String) cmdLine.getValue(typeOpt);
        String dataSourceType = (String) cmdLine.getValue(dataSourceOpt);

        BayesParameters params = new BayesParameters();
        // Setting all the default parameter values
        params.setGramSize(1);
        params.setMinDF(1);
        params.set("alpha_i", "1.0");
        params.set("dataSource", "hdfs");

        if (cmdLine.hasOption(gramSizeOpt)) {
            params.setGramSize(Integer.parseInt((String) cmdLine.getValue(gramSizeOpt)));
        }

        if (cmdLine.hasOption(minDfOpt)) {
            params.setMinDF(Integer.parseInt((String) cmdLine.getValue(minDfOpt)));
        }

        if (cmdLine.hasOption(minSupportOpt)) {
            params.setMinSupport(Integer.parseInt((String) cmdLine.getValue(minSupportOpt)));
        }

        if (cmdLine.hasOption(skipCleanupOpt)) {
            params.setSkipCleanup(true);
        }

        if (cmdLine.hasOption(alphaOpt)) {
            params.set("alpha_i", (String) cmdLine.getValue(alphaOpt));
        }

        if (cmdLine.hasOption(dataSourceOpt)) {
            params.set("dataSource", dataSourceType);
        }

        // --compress carries a default of "0", so only an explicit "1" enables compression.
        if (cmdLine.hasOption(compressOpt) && cmdLine.getValue(compressOpt).toString().equals("1")) {
            params.set("compress", "true");
        } else {
            params.set("compress", "false");
        }

        if (cmdLine.hasOption(compressCodecOpt)) {
            params.set("codec", (String) cmdLine.getValue(compressCodecOpt));
        }

        Path inputPath = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputPath = new Path((String) cmdLine.getValue(outputOpt));
        // Anything other than "cbayes" (including a missing option) trains plain naive Bayes.
        if ("cbayes".equalsIgnoreCase(classifierType)) {
            log.info("Training Complementary Bayes Classifier");
            trainCNaiveBayes(inputPath, outputPath, params);
        } else {
            log.info("Training Bayes Classifier");
            // setup the HDFS and copy the files there, then run the trainer
            trainNaiveBayes(inputPath, outputPath, params);
        }
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver.java

/**
 * Takes in two arguments:
 * <ol>
 * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
 * <li>The output {@link org.apache.hadoop.fs.Path} where to write the classifier as a
 * {@link org.apache.hadoop.io.SequenceFile}</li>
 * </ol>
 */
public static void main(String[] args) throws IOException, InterruptedException {
    DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
    ArgumentBuilder argumentBuilder = new ArgumentBuilder();
    GroupBuilder groupBuilder = new GroupBuilder();

    Option dirInputPathOpt = DefaultOptionCreator.inputOption().create();
    Option dirOutputPathOpt = DefaultOptionCreator.outputOption().create();
    Option helpOpt = DefaultOptionCreator.helpOption();

    Option categoriesOpt = optionBuilder.withLongName("categories").withRequired(true)
            .withArgument(argumentBuilder.withName("categories").withMinimum(1).withMaximum(1).create())
            .withDescription("Location of the categories file.  One entry per line. "
                    + "Will be used to make a string match in Wikipedia Category field")
            .withShortName("c").create();

    // Flag option: its mere presence switches category matching to exact mode.
    Option exactMatchOpt = optionBuilder.withLongName("exactMatch")
            .withDescription("If set, then the category name must exactly match the "
                    + "entry in the categories file. Default is false")
            .withShortName("e").create();

    Option analyzerOpt = optionBuilder.withLongName("analyzer").withRequired(false)
            .withArgument(argumentBuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
            .withDescription("The analyzer to use, must have a no argument constructor").withShortName("a")
            .create();

    Group group = groupBuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
            .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    try {
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
        String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
        String catFile = (String) cmdLine.getValue(categoriesOpt);

        // Use the Wikipedia-specific analyzer unless one was named explicitly.
        Class<? extends Analyzer> analyzerClass = WikipediaAnalyzer.class;
        if (cmdLine.hasOption(analyzerOpt)) {
            analyzerClass = Class.forName(cmdLine.getValue(analyzerOpt).toString()).asSubclass(Analyzer.class);
            // Instantiate once up front so a broken analyzer class fails fast,
            // before the job is submitted.
            ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
        }

        runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (ClassNotFoundException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
}

From source file:org.apache.mahout.classifier.bayes.WikipediaXmlSplitter.java

/**
 * Entry point: splits a Wikipedia XML dump into numbered chunk files of
 * roughly the requested size, each wrapped in a complete {@code <mediawiki>}
 * envelope so every chunk is independently parseable.
 *
 * @param args command-line arguments; --dumpFile, --outputDir and --chunkSize are required
 * @throws IOException if reading the dump or writing a chunk fails
 */
public static void main(String[] args) throws IOException {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
            .withArgument(abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
            .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName("d")
            .create();

    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true)
            .withArgument(abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
            .withDescription("The output directory to place the splits in:\n"
                    + "local files:\n\t/var/data/wikipedia-xml-chunks or\n\tfile:///var/data/wikipedia-xml-chunks\n"
                    + "Hadoop DFS:\n\thdfs://wikipedia-xml-chunks\n"
                    + "AWS S3 (blocks):\n\ts3://bucket-name/wikipedia-xml-chunks\n"
                    + "AWS S3 (native files):\n\ts3n://bucket-name/wikipedia-xml-chunks\n")

            .withShortName("o").create();

    Option s3IdOpt = obuilder.withLongName("s3ID").withRequired(false)
            .withArgument(abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 ID key").withShortName("i").create();
    Option s3SecretOpt = obuilder.withLongName("s3Secret").withRequired(false)
            .withArgument(abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
            .withDescription("Amazon S3 secret key").withShortName("s").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true)
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false)
            .withArgument(abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
            .withShortName("n").create();
    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt)
            .withOption(chunkSizeOpt).withOption(numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt)
            .create();

    Parser parser = new Parser();
    parser.setGroup(group);
    CommandLine cmdLine;
    try {
        cmdLine = parser.parse(args);
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
        CommandLineUtil.printHelp(group);
        return;
    }

    Configuration conf = new Configuration();
    String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
    String outputDirPath = (String) cmdLine.getValue(outputDirOpt);

    // Propagate S3 credentials (if supplied) to both the block-based and native S3 filesystems.
    if (cmdLine.hasOption(s3IdOpt)) {
        String id = (String) cmdLine.getValue(s3IdOpt);
        conf.set("fs.s3n.awsAccessKeyId", id);
        conf.set("fs.s3.awsAccessKeyId", id);
    }
    if (cmdLine.hasOption(s3SecretOpt)) {
        String secret = (String) cmdLine.getValue(s3SecretOpt);
        conf.set("fs.s3n.awsSecretAccessKey", secret);
        conf.set("fs.s3.awsSecretAccessKey", secret);
    }
    // do not compute crc file when using local FS
    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);

    // --chunkSize is given in megabytes; convert to bytes.
    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));

    // By default produce as many chunks as the dump requires.
    int numChunks = Integer.MAX_VALUE;
    if (cmdLine.hasOption(numChunksOpt)) {
        numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
    }

    // XML prologue prepended to every chunk so each one is a valid standalone dump.
    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
            + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
            + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
            + "http://www.mediawiki.org/xml/export-0.3.xsd\" " + "version=\"0.3\" " + "xml:lang=\"en\">\n"
            + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
            + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
            + "    <generator>MediaWiki 1.13alpha</generator>\n" + "    <case>first-letter</case>\n"
            + "    <namespaces>\n" + "      <namespace key=\"-2\">Media</namespace>\n"
            + "      <namespace key=\"-1\">Special</namespace>\n" + "      <namespace key=\"0\" />\n"
            + "      <namespace key=\"1\">Talk</namespace>\n" + "      <namespace key=\"2\">User</namespace>\n"
            + "      <namespace key=\"3\">User talk</namespace>\n"
            + "      <namespace key=\"4\">Wikipedia</namespace>\n"
            + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
            + "      <namespace key=\"6\">Image</namespace>\n"
            + "      <namespace key=\"7\">Image talk</namespace>\n"
            + "      <namespace key=\"8\">MediaWiki</namespace>\n"
            + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
            + "      <namespace key=\"10\">Template</namespace>\n"
            + "      <namespace key=\"11\">Template talk</namespace>\n"
            + "      <namespace key=\"12\">Help</namespace>\n"
            + "      <namespace key=\"13\">Help talk</namespace>\n"
            + "      <namespace key=\"14\">Category</namespace>\n"
            + "      <namespace key=\"15\">Category talk</namespace>\n"
            + "      <namespace key=\"100\">Portal</namespace>\n"
            + "      <namespace key=\"101\">Portal talk</namespace>\n" + "    </namespaces>\n"
            + "  </siteinfo>\n";

    StringBuilder content = new StringBuilder();
    content.append(header);
    NumberFormat decimalFormatter = new DecimalFormat("0000");
    File dumpFile = new File(dumpFilePath);
    // NOTE(review): this FileLineIterator is never closed; consider closing it
    // once iteration finishes.
    FileLineIterator it;
    if (dumpFilePath.endsWith(".bz2")) {
        // default compression format from http://download.wikimedia.org
        CompressionCodec codec = new BZip2Codec();
        it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
    } else {
        // assume the user has previously de-compressed the dump file
        it = new FileLineIterator(dumpFile);
    }
    int filenumber = 0;
    // Stream the dump line by line, accumulating complete <page>...</page>
    // blocks until the chunk threshold is crossed, then flush a chunk file.
    while (it.hasNext()) {
        String thisLine = it.next();
        if (thisLine.trim().startsWith("<page>")) {
            boolean end = false;
            while (!thisLine.trim().startsWith("</page>")) {
                content.append(thisLine).append('\n');
                if (it.hasNext()) {
                    thisLine = it.next();
                } else {
                    // Dump ended mid-page: force the current buffer out as the last chunk.
                    end = true;
                    break;
                }
            }
            content.append(thisLine).append('\n');

            // NOTE(review): content.length() counts chars while chunkSize is in
            // bytes, so chunks can exceed the byte target for multi-byte text —
            // confirm this approximation is intended.
            if (content.length() > chunkSize || end) {
                content.append("</mediawiki>");
                filenumber++;
                String filename = outputDirPath + "/chunk-" + decimalFormatter.format(filenumber) + ".xml";
                BufferedWriter chunkWriter = new BufferedWriter(
                        new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
                try {
                    chunkWriter.write(content.toString(), 0, content.length());
                } finally {
                    Closeables.closeQuietly(chunkWriter);
                }
                if (filenumber >= numChunks) {
                    break;
                }
                // Start the next chunk with a fresh envelope.
                content = new StringBuilder();
                content.append(header);
            }
        }
    }
}

From source file:org.apache.mahout.classifier.BayesFileFormatter.java

/**
 * Run the FileFormatter.
 * 
 * @param args
 *          The input args. Run with -h to see the help
 * @throws ClassNotFoundException
 *           if the Analyzer can't be found
 * @throws IllegalAccessException
 *           if the Analyzer can't be constructed
 * @throws InstantiationException
 *           if the Analyzer can't be constructed
 * @throws IOException
 *           if the files can't be dealt with properly
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = DefaultOptionCreator.inputOption().create();

    Option outputOpt = DefaultOptionCreator.outputOption().create();

    Option labelOpt = obuilder.withLongName("label").withRequired(true)
            .withArgument(abuilder.withName("label").withMinimum(1).withMaximum(1).create())
            .withDescription("The label of the file").withShortName("l").create();

    Option analyzerOpt = obuilder.withLongName("analyzer")
            .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
            .withDescription("The fully qualified class name of the analyzer to use. "
                    + "Must have a no-arg constructor.  Default is the StandardAnalyzer")
            .withShortName("a").create();

    Option charsetOpt = obuilder.withLongName("charset")
            .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
            .withDescription("The character encoding of the input file").withShortName("c").create();

    Option collapseOpt = obuilder.withLongName("collapse").withRequired(true)
            .withArgument(abuilder.withName("collapse").withMinimum(1).withMaximum(1).create())
            .withDescription("Collapse a whole directory to a single file, one doc per line").withShortName("p")
            .create();

    Option helpOpt = DefaultOptionCreator.helpOption();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(labelOpt)
            .withOption(analyzerOpt).withOption(charsetOpt).withOption(collapseOpt).withOption(helpOpt)
            .create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {

            return;
        }
        File input = new File((String) cmdLine.getValue(inputOpt));
        File output = new File((String) cmdLine.getValue(outputOpt));
        String label = (String) cmdLine.getValue(labelOpt);
        Analyzer analyzer;
        if (cmdLine.hasOption(analyzerOpt)) {
            analyzer = ClassUtils.instantiateAs((String) cmdLine.getValue(analyzerOpt), Analyzer.class);
        } else {
            analyzer = new StandardAnalyzer(Version.LUCENE_31);
        }
        Charset charset = Charsets.UTF_8;
        if (cmdLine.hasOption(charsetOpt)) {
            charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
        }
        boolean collapse = cmdLine.hasOption(collapseOpt);

        if (collapse) {
            collapse(label, analyzer, input, charset, output);
        } else {
            format(label, analyzer, input, charset, output);
        }

    } catch (OptionException e) {
        log.error("Exception", e);
    }
}

From source file:org.apache.mahout.classifier.chi_rwcs.mapreduce.BuildModel.java

/**
 * Parses the command line, configures the model-building parameters
 * (t-norm, rule weight, fuzzy reasoning method, paths), and builds the model.
 *
 * @param args command-line arguments; run with -h for help
 * @return 0 on success, -1 on bad arguments or when help was requested
 * @throws IOException if the data/dataset/output paths can't be accessed
 * @throws ClassNotFoundException if a required job class can't be loaded
 * @throws InterruptedException if the underlying MapReduce job is interrupted
 */
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option dataOpt = obuilder.withLongName("data").withShortName("d").withRequired(true)
            .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
            .withDescription("Data path").create();

    Option datasetOpt = obuilder.withLongName("dataset").withShortName("ds").withRequired(true)
            .withArgument(abuilder.withName("dataset").withMinimum(1).withMaximum(1).create())
            .withDescription("The path of the file descriptor of the dataset").create();

    Option timeOpt = obuilder.withLongName("time").withShortName("tm").withRequired(false)
            .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
            .withDescription("Time path").create();

    Option outputOpt = obuilder.withLongName("output").withShortName("o").withRequired(true)
            .withArgument(abuilder.withName("path").withMinimum(1).withMaximum(1).create())
            .withDescription("Output path, will contain the Decision Forest").create();

    Option labelsOpt = obuilder.withLongName("labels").withShortName("l").withRequired(true)
            .withArgument(abuilder.withName("labels").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of Labels").create();

    Option combinationTypeOpt = obuilder.withLongName("combinationType").withShortName("t").withRequired(true)
            .withArgument(abuilder.withName("combinationType").withMinimum(1).withMaximum(1).create())
            .withDescription("T-norm for the computation of the compatibility degree").create();

    Option rule_weightOpt = obuilder.withLongName("rule_weight").withShortName("r").withRequired(true)
            .withArgument(abuilder.withName("rule_weight").withMinimum(1).withMaximum(1).create())
            .withDescription("Rule Weight").create();

    Option fuzzy_r_mOpt = obuilder.withLongName("fuzzy_r_m").withShortName("f").withRequired(true)
            .withArgument(abuilder.withName("fuzzy_r_m").withMinimum(1).withMaximum(1).create())
            .withDescription("Fuzzy Reasoning Method").create();

    Option helpOpt = obuilder.withLongName("help").withShortName("h").withDescription("Print out help")
            .create();

    Group group = gbuilder.withName("Options").withOption(dataOpt).withOption(datasetOpt).withOption(timeOpt)
            .withOption(outputOpt).withOption(labelsOpt).withOption(combinationTypeOpt)
            .withOption(rule_weightOpt).withOption(fuzzy_r_mOpt).withOption(helpOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        // Fix: hasOption("help") used the bare string, which does not match the
        // cli2 option trigger (--help / -h); pass the Option object as the other
        // checks below do.
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        dataName = cmdLine.getValue(dataOpt).toString();
        String datasetName = cmdLine.getValue(datasetOpt).toString();
        String outputName = cmdLine.getValue(outputOpt).toString();
        nLabels = Integer.parseInt(cmdLine.getValue(labelsOpt).toString());
        String combinationType_aux = cmdLine.getValue(combinationTypeOpt).toString();
        String ruleWeight_aux = cmdLine.getValue(rule_weightOpt).toString();
        String inferenceType_aux = cmdLine.getValue(fuzzy_r_mOpt).toString();

        // --time is optional; when present, the build time is persisted to this path.
        if (cmdLine.hasOption(timeOpt)) {
            buildTimeIsStored = true;
            timeName = cmdLine.getValue(timeOpt).toString();
        }

        if (log.isDebugEnabled()) {
            log.debug("data : {}", dataName);
            log.debug("dataset : {}", datasetName);
            log.debug("output : {}", outputName);
            log.debug("labels : {}", nLabels);
            log.debug("t_norm : {}", combinationType_aux);
            log.debug("rule_weight : {}", ruleWeight_aux);
            log.debug("fuzzy_r_m : {}", inferenceType_aux);
            log.debug("time : {}", timeName);
        }

        dataPath = new Path(dataName);
        datasetPath = new Path(datasetName);
        outputPath = new Path(outputName);
        if (buildTimeIsStored) {
            timePath = new Path(timeName);
        }

        // T-norm: defaults to PRODUCT unless "minimum" was requested.
        combinationType = PRODUCT;
        if (combinationType_aux.equalsIgnoreCase("minimum")) {
            combinationType = MINIMUM;
        }

        // Rule weight: defaults to PCF_IV (penalized certainty factor, variant IV).
        ruleWeight = PCF_IV;
        if (ruleWeight_aux.equalsIgnoreCase("Certainty_Factor")) {
            ruleWeight = CF;
        } else if (ruleWeight_aux.equalsIgnoreCase("Average_Penalized_Certainty_Factor")) {
            ruleWeight = PCF_II;
        } else if (ruleWeight_aux.equalsIgnoreCase("No_Weights")) {
            ruleWeight = NO_RW;
        }

        // Fuzzy reasoning method: defaults to winning rule.
        inferenceType = WINNING_RULE;
        if (inferenceType_aux.equalsIgnoreCase("Additive_Combination")) {
            inferenceType = ADDITIVE_COMBINATION;
        }

    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
        return -1;
    }

    buildModel();

    return 0;
}