Example usage for org.apache.mahout.vectorizer.collocations.llr CollocReducer DEFAULT_MIN_SUPPORT

List of usage examples for org.apache.mahout.vectorizer.collocations.llr CollocReducer DEFAULT_MIN_SUPPORT

Introduction

On this page you can find example usage for org.apache.mahout.vectorizer.collocations.llr CollocReducer DEFAULT_MIN_SUPPORT.

Prototype

int DEFAULT_MIN_SUPPORT

To view the source code for org.apache.mahout.vectorizer.collocations.llr CollocReducer DEFAULT_MIN_SUPPORT, click Source Link.

Usage

From source file: edu.rosehulman.CollocDriver.java

License:Apache License

/**
 * Command-line entry point for the collocation job.
 *
 * <p>Registers the supported options, parses {@code args}, optionally deletes the
 * output path and tokenizes the input, then runs the two passes:
 * {@code generateCollocations} followed by {@code computeNGramsPruneByLLR}.
 *
 * <p>Options read here: {@code maxNGramSize}, {@code minSupport}, {@code minLLR},
 * {@code maxRed}, {@code analyzerName}, and the flags {@code preprocess},
 * {@code unigram} and the overwrite option.
 *
 * @param args raw command-line arguments
 * @return {@code -1} if argument parsing yields no argument map, {@code 0} otherwise
 * @throws Exception if a numeric option other than {@code maxNGramSize} cannot be
 *         parsed, if the analyzer class cannot be loaded, or if any of the
 *         invoked job steps fails
 */
@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.numReducersOption().create());

    addOption("maxNGramSize", "ng",
            "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2",
            String.valueOf(DEFAULT_MAX_NGRAM_SIZE));
    addOption("minSupport", "s",
            "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT,
            String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT));
    addOption("minLLR", "ml",
            "(Optional)The minimum Log Likelihood Ratio(Float)  Default is " + LLRReducer.DEFAULT_MIN_LLR,
            String.valueOf(LLRReducer.DEFAULT_MIN_LLR));
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("analyzerName", "a", "The class name of the analyzer to use for preprocessing", null);

    addFlag("preprocess", "p", "If set, input is SequenceFile<Text,Text> where the value is the document, "
            + " which will be tokenized using the specified analyzer.");
    addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations");

    Map<String, List<String>> argMap = parseArguments(args);

    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();

    // A malformed n-gram size is tolerated: warn and keep the default.
    int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
    if (hasOption("maxNGramSize")) {
        try {
            maxNGramSize = Integer.parseInt(getOption("maxNGramSize"));
        } catch (NumberFormatException ex) {
            log.warn("Could not parse ngram size option");
        }
    }
    log.info("Maximum n-gram size is: {}", maxNGramSize);

    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }

    // Unlike maxNGramSize, a malformed value for the options below propagates
    // as a NumberFormatException.
    int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
    if (getOption("minSupport") != null) {
        minSupport = Integer.parseInt(getOption("minSupport"));
    }
    log.info("Minimum Support value: {}", minSupport);

    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
    if (getOption("minLLR") != null) {
        minLLRValue = Float.parseFloat(getOption("minLLR"));
    }
    log.info("Minimum LLR value: {}", minLLRValue);

    int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
    if (getOption("maxRed") != null) {
        reduceTasks = Integer.parseInt(getOption("maxRed"));
    }
    log.info("Number of pass1 reduce tasks: {}", reduceTasks);

    // The flag is registered above under the name "unigram"; query it by that
    // name through hasOption so the lookup matches the registration.
    boolean emitUnigrams = hasOption("unigram");

    if (hasOption("preprocess")) {
        log.info("Input will be preprocessed");
        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (getOption("analyzerName") != null) {
            String className = getOption("analyzerName");
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);

        DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath, getConf());
        // The tokenized documents become the input of the collocation pass.
        input = tokenizedPath;
    } else {
        log.info("Input will NOT be preprocessed");
    }

    // parse input and extract collocations
    long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize, reduceTasks,
            minSupport);

    // tally collocations and perform LLR calculation
    computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minLLRValue, reduceTasks);

    return 0;
}