List of usage examples for org.apache.mahout.common.commandline DefaultOptionCreator numReducersOption
@Deprecated public static DefaultOptionBuilder numReducersOption()
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*from ww w . j a v a2 s. c o m*/ addOutputOption(); addOption(DefaultOptionCreator.numReducersOption().create()); addOption("maxNGramSize", "ng", "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2", String.valueOf(DEFAULT_MAX_NGRAM_SIZE)); addOption("minSupport", "s", "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT, String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT)); addOption("minValue", "minV", "(Optional)The minimum value for association metric(Float) Default is " + AssocReducer.DEFAULT_MIN_VALUE, String.valueOf(AssocReducer.DEFAULT_MIN_VALUE)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption("metric", "m", "The association metric to use, one of {llr,dice,pmi,chi}", AssocReducer.DEFAULT_ASSOC); addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations"); addOption("windowSize", "ws", "(Optional) Window size"); addOption("windowMode", "wm", "(Optional) DOCUMENT, SENTENCE, S_WINDOW, C_WINDOW, FIXED"); addOption("ngramLimit", "nl", "(Optional) maximum of ngrams per unit - to prevent memory overflow"); addOption("usePos", "p", "(Optional)"); Map<String, List<String>> argMap = parseArguments(args); if (argMap == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE; if (hasOption("maxNGramSize")) { try { maxNGramSize = Integer.parseInt(getOption("maxNGramSize")); } catch (NumberFormatException ex) { log.warn("Could not parse ngram size option"); } } log.info("Maximum n-gram size is: {}", maxNGramSize); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT; if (getOption("minSupport") != null) { minSupport = Integer.parseInt(getOption("minSupport")); } log.info("Minimum Support value: {}", minSupport); float minValue = AssocReducer.DEFAULT_MIN_VALUE; if (getOption("minValue") != null) { minValue = Float.parseFloat(getOption("minValue")); } log.info("Minimum Assoc value: {}", minValue); int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS; if (getOption("maxRed") != null) { reduceTasks = Integer.parseInt(getOption("maxRed")); } log.info("Number of pass1 reduce tasks: {}", reduceTasks); String metric = AssocReducer.DEFAULT_ASSOC; if (getOption("metric") != null) { metric = getOption("metric"); } log.info("Association Metric: {}", metric); Window windowType = Window.SENTENCE; if (getOption("windowMode") != null) { windowType = Window.valueOf(getOption("windowMode").toUpperCase()); } int windowSize = 3; if (getOption("windowSize") != null) { windowSize = Integer.parseInt(getOption("windowSize")); } boolean emitUnigrams = argMap.containsKey("emitUnigrams"); reduceTasks = 14; // parse input and extract collocations long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize, reduceTasks, minSupport, windowType, windowSize); // tally collocations and perform LLR calculation // for (String m : metric.split(",")) { // log.info("Computing Collocations with Association Metric: {}", m); // // extract pruning thresholds // if (m.contains(":")) { // String[] tokens = m.split(":"); // m = tokens[0]; // minValue = Float.parseFloat(tokens[1]); // } computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minValue, reduceTasks); // only emit unigrams for the first metric emitUnigrams = false; // } return 0; }
From source file:edu.rosehulman.CollocDriver.java
License:Apache License
@SuppressWarnings("deprecation") public int run(String[] args) throws Exception { addInputOption();// w w w. ja v a2 s .c o m addOutputOption(); addOption(DefaultOptionCreator.numReducersOption().create()); addOption("maxNGramSize", "ng", "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2", String.valueOf(DEFAULT_MAX_NGRAM_SIZE)); addOption("minSupport", "s", "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT, String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT)); addOption("minLLR", "ml", "(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR, String.valueOf(LLRReducer.DEFAULT_MIN_LLR)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption("analyzerName", "a", "The class name of the analyzer to use for preprocessing", null); addFlag("preprocess", "p", "If set, input is SequenceFile<Text,Text> where the value is the document, " + " which will be tokenized using the specified analyzer."); addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations"); Map<String, List<String>> argMap = parseArguments(args); if (argMap == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE; if (hasOption("maxNGramSize")) { try { maxNGramSize = Integer.parseInt(getOption("maxNGramSize")); } catch (NumberFormatException ex) { log.warn("Could not parse ngram size option"); } } log.info("Maximum n-gram size is: {}", maxNGramSize); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT; if (getOption("minSupport") != null) { minSupport = Integer.parseInt(getOption("minSupport")); } log.info("Minimum Support value: {}", minSupport); float minLLRValue = LLRReducer.DEFAULT_MIN_LLR; if (getOption("minLLR") != null) { minLLRValue = Float.parseFloat(getOption("minLLR")); } log.info("Minimum LLR value: {}", minLLRValue); int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS; if (getOption("maxRed") != null) { reduceTasks = Integer.parseInt(getOption("maxRed")); } log.info("Number of pass1 reduce tasks: {}", reduceTasks); boolean emitUnigrams = argMap.containsKey("emitUnigrams"); if (argMap.containsKey("preprocess")) { log.info("Input will be preprocessed"); Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class; if (getOption("analyzerName") != null) { String className = getOption("analyzerName"); analyzerClass = Class.forName(className).asSubclass(Analyzer.class); // try instantiating it, b/c there isn't any point in setting it if // you can't instantiate it AnalyzerUtils.createAnalyzer(analyzerClass); } Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath, getConf()); input = tokenizedPath; } else { log.info("Input will NOT be preprocessed"); } // parse input and extract collocations long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize, reduceTasks, minSupport); // tally collocations and perform LLR calculation computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minLLRValue, reduceTasks); return 0; }