Example usage for org.apache.mahout.common.commandline DefaultOptionCreator numReducersOption

Introduction

This page shows example usage of the org.apache.mahout.common.commandline.DefaultOptionCreator.numReducersOption() method.

Prototype

@Deprecated
public static DefaultOptionBuilder numReducersOption() 

Document

Returns a default command line option for specifying the max number of reducers.
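
For orientation, here is a minimal sketch of how the option is typically registered and read inside a Mahout AbstractJob subclass. The driver class name and fall-back default are illustrative; the long option name "maxRed" matches the getOption calls in the examples below.

import java.util.List;
import java.util.Map;

import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.commandline.DefaultOptionCreator;

public class MyDriver extends AbstractJob {
    @Override
    public int run(String[] args) throws Exception {
        addInputOption();
        addOutputOption();
        // registers the (deprecated) --maxRed option
        addOption(DefaultOptionCreator.numReducersOption().create());

        Map<String, List<String>> argMap = parseArguments(args);
        if (argMap == null) {
            return -1; // parsing failed or help was requested
        }

        int reduceTasks = 1; // illustrative fall-back default
        if (getOption("maxRed") != null) {
            reduceTasks = Integer.parseInt(getOption("maxRed"));
        }
        // ... configure and launch the MapReduce job with reduceTasks ...
        return 0;
    }
}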

Usage

From source file: de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.numReducersOption().create());

    addOption("maxNGramSize", "ng",
            "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2",
            String.valueOf(DEFAULT_MAX_NGRAM_SIZE));
    addOption("minSupport", "s",
            "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT,
            String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT));
    addOption("minValue", "minV", "(Optional)The minimum value for association metric(Float)  Default is "
            + AssocReducer.DEFAULT_MIN_VALUE, String.valueOf(AssocReducer.DEFAULT_MIN_VALUE));
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("metric", "m", "The association metric to use, one of {llr,dice,pmi,chi}",
            AssocReducer.DEFAULT_ASSOC);
    addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations");
    addOption("windowSize", "ws", "(Optional) Window size");
    addOption("windowMode", "wm", "(Optional) DOCUMENT, SENTENCE, S_WINDOW, C_WINDOW, FIXED");
    addOption("ngramLimit", "nl", "(Optional) maximum of ngrams per unit - to prevent memory overflow");
    addOption("usePos", "p", "(Optional)");
    Map<String, List<String>> argMap = parseArguments(args);

    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();

    int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
    if (hasOption("maxNGramSize")) {
        try {
            maxNGramSize = Integer.parseInt(getOption("maxNGramSize"));
        } catch (NumberFormatException ex) {
            log.warn("Could not parse ngram size option");
        }
    }
    log.info("Maximum n-gram size is: {}", maxNGramSize);

    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }

    int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
    if (getOption("minSupport") != null) {
        minSupport = Integer.parseInt(getOption("minSupport"));
    }
    log.info("Minimum Support value: {}", minSupport);

    float minValue = AssocReducer.DEFAULT_MIN_VALUE;
    if (getOption("minValue") != null) {
        minValue = Float.parseFloat(getOption("minValue"));
    }
    log.info("Minimum Assoc value: {}", minValue);

    int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
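    // "maxRed" is the long name of the option created by DefaultOptionCreator.numReducersOption()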
    if (getOption("maxRed") != null) {
        reduceTasks = Integer.parseInt(getOption("maxRed"));
    }
    log.info("Number of pass1 reduce tasks: {}", reduceTasks);

    String metric = AssocReducer.DEFAULT_ASSOC;
    if (getOption("metric") != null) {
        metric = getOption("metric");
    }
    log.info("Association Metric: {}", metric);
    Window windowType = Window.SENTENCE;
    if (getOption("windowMode") != null) {
        windowType = Window.valueOf(getOption("windowMode").toUpperCase());
    }
    int windowSize = 3;
    if (getOption("windowSize") != null) {
        windowSize = Integer.parseInt(getOption("windowSize"));
    }

    boolean emitUnigrams = hasOption("unigram");
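    // NOTE: the original source hardcodes the reduce task count here, overriding the --maxRed value parsed above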
    reduceTasks = 14;
    // parse input and extract collocations
    long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize, reduceTasks,
            minSupport, windowType, windowSize);

    // tally collocations and perform LLR calculation
    // for (String m : metric.split(",")) {
    // log.info("Computing Collocations with Association Metric: {}", m);
    // // extract pruning thresholds
    // if (m.contains(":")) {
    // String[] tokens = m.split(":");
    // m = tokens[0];
    // minValue = Float.parseFloat(tokens[1]);
    // }

    computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minValue, reduceTasks);
    // only emit unigrams for the first metric
    emitUnigrams = false;
    // }
    return 0;
}
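
The driver would typically be launched through Hadoop's ToolRunner. A hypothetical invocation sketch follows (paths and option values are illustrative); note that this particular driver overrides the parsed --maxRed value with a hardcoded reduce task count before launching pass 1.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver;

public class RunCollocDriver {
    public static void main(String[] args) throws Exception {
        // hypothetical arguments; --maxRed is the option created by numReducersOption()
        String[] driverArgs = {
            "--input", "/data/tokenized",
            "--output", "/data/collocations",
            "--maxRed", "8",
            "--metric", "llr",
            "--overwrite"
        };
        System.exit(ToolRunner.run(new Configuration(), new CollocDriver(), driverArgs));
    }
}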

From source file: edu.rosehulman.CollocDriver.java

License: Apache License

@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.numReducersOption().create());

    addOption("maxNGramSize", "ng",
            "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2",
            String.valueOf(DEFAULT_MAX_NGRAM_SIZE));
    addOption("minSupport", "s",
            "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT,
            String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT));
    addOption("minLLR", "ml",
            "(Optional)The minimum Log Likelihood Ratio(Float)  Default is " + LLRReducer.DEFAULT_MIN_LLR,
            String.valueOf(LLRReducer.DEFAULT_MIN_LLR));
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("analyzerName", "a", "The class name of the analyzer to use for preprocessing", null);

    addFlag("preprocess", "p", "If set, input is SequenceFile<Text,Text> where the value is the document, "
            + " which will be tokenized using the specified analyzer.");
    addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations");

    Map<String, List<String>> argMap = parseArguments(args);

    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();

    int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
    if (hasOption("maxNGramSize")) {
        try {
            maxNGramSize = Integer.parseInt(getOption("maxNGramSize"));
        } catch (NumberFormatException ex) {
            log.warn("Could not parse ngram size option");
        }
    }
    log.info("Maximum n-gram size is: {}", maxNGramSize);

    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }

    int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
    if (getOption("minSupport") != null) {
        minSupport = Integer.parseInt(getOption("minSupport"));
    }
    log.info("Minimum Support value: {}", minSupport);

    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
    if (getOption("minLLR") != null) {
        minLLRValue = Float.parseFloat(getOption("minLLR"));
    }
    log.info("Minimum LLR value: {}", minLLRValue);

    int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
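    // "maxRed" comes from DefaultOptionCreator.numReducersOption(), as registered above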
    if (getOption("maxRed") != null) {
        reduceTasks = Integer.parseInt(getOption("maxRed"));
    }
    log.info("Number of pass1 reduce tasks: {}", reduceTasks);

    boolean emitUnigrams = hasOption("unigram");

    if (argMap.containsKey("preprocess")) {
        log.info("Input will be preprocessed");
        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (getOption("analyzerName") != null) {
            String className = getOption("analyzerName");
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

        DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath, getConf());
        input = tokenizedPath;
    } else {
        log.info("Input will NOT be preprocessed");
    }

    // parse input and extract collocations
    long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize, reduceTasks,
            minSupport);

    // tally collocations and perform LLR calculation
    computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minLLRValue, reduceTasks);

    return 0;
}
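
In both drivers, the value parsed from --maxRed ultimately sets the reduce task count of the underlying MapReduce passes. A minimal sketch of that final step, assuming a standard Hadoop 2 Job (the class and job name are illustrative, not the actual generateCollocations internals):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public final class ReduceTaskWiring {
    // minimal sketch: apply the value parsed from --maxRed to a MapReduce job
    public static Job configure(Configuration conf, int reduceTasks) throws Exception {
        Job job = Job.getInstance(conf, "collocation pass 1"); // illustrative job name
        job.setNumReduceTasks(reduceTasks); // the count parsed from --maxRed
        return job;
    }
}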