List of usage examples for org.apache.mahout.vectorizer.collocations.llr CollocReducer DEFAULT_MIN_SUPPORT
int DEFAULT_MIN_SUPPORT
To view the source code for org.apache.mahout.vectorizer.collocations.llr CollocReducer DEFAULT_MIN_SUPPORT, click Source Link.
From source file:edu.rosehulman.CollocDriver.java
License:Apache License
@SuppressWarnings("deprecation") public int run(String[] args) throws Exception { addInputOption();/*from ww w . j ava 2 s. c om*/ addOutputOption(); addOption(DefaultOptionCreator.numReducersOption().create()); addOption("maxNGramSize", "ng", "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2", String.valueOf(DEFAULT_MAX_NGRAM_SIZE)); addOption("minSupport", "s", "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT, String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT)); addOption("minLLR", "ml", "(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR, String.valueOf(LLRReducer.DEFAULT_MIN_LLR)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption("analyzerName", "a", "The class name of the analyzer to use for preprocessing", null); addFlag("preprocess", "p", "If set, input is SequenceFile<Text,Text> where the value is the document, " + " which will be tokenized using the specified analyzer."); addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations"); Map<String, List<String>> argMap = parseArguments(args); if (argMap == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE; if (hasOption("maxNGramSize")) { try { maxNGramSize = Integer.parseInt(getOption("maxNGramSize")); } catch (NumberFormatException ex) { log.warn("Could not parse ngram size option"); } } log.info("Maximum n-gram size is: {}", maxNGramSize); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT; if (getOption("minSupport") != null) { minSupport = Integer.parseInt(getOption("minSupport")); } log.info("Minimum Support value: {}", minSupport); float minLLRValue = LLRReducer.DEFAULT_MIN_LLR; if (getOption("minLLR") != null) { minLLRValue = Float.parseFloat(getOption("minLLR")); 
} log.info("Minimum LLR value: {}", minLLRValue); int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS; if (getOption("maxRed") != null) { reduceTasks = Integer.parseInt(getOption("maxRed")); } log.info("Number of pass1 reduce tasks: {}", reduceTasks); boolean emitUnigrams = argMap.containsKey("emitUnigrams"); if (argMap.containsKey("preprocess")) { log.info("Input will be preprocessed"); Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class; if (getOption("analyzerName") != null) { String className = getOption("analyzerName"); analyzerClass = Class.forName(className).asSubclass(Analyzer.class); // try instantiating it, b/c there isn't any point in setting it if // you can't instantiate it AnalyzerUtils.createAnalyzer(analyzerClass); } Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER); DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath, getConf()); input = tokenizedPath; } else { log.info("Input will NOT be preprocessed"); } // parse input and extract collocations long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize, reduceTasks, minSupport); // tally collocations and perform LLR calculation computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minLLRValue, reduceTasks); return 0; }