Example usage for org.apache.mahout.vectorizer.collocations.llr CollocDriver generateAllGrams

Introduction

In this page you can find the example usage for org.apache.mahout.vectorizer.collocations.llr CollocDriver generateAllGrams.

Prototype

public static void generateAllGrams(Path input, Path output, Configuration baseConf, int maxNGramSize,
        int minSupport, float minLLRValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException 

Document

Generate all ngrams for the org.apache.mahout.vectorizer.DictionaryVectorizer job
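A minimal sketch of invoking this prototype is shown below. The paths and tuning values (bigram generation, a minimum support of 2, an LLR threshold of 1.0, and a single reducer) are illustrative assumptions, not values taken from the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.vectorizer.collocations.llr.CollocDriver;

public class GenerateAllGramsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Tokenized documents in SequenceFile format (hypothetical paths).
        Path tokenizedDocs = new Path("tokenized-documents");
        Path ngramOutput = new Path("colloc-output");
        // Generate unigrams and bigrams, keep ngrams seen at least twice,
        // and prune ngrams whose log-likelihood ratio falls below 1.0.
        CollocDriver.generateAllGrams(tokenizedDocs, ngramOutput, conf, 2, 2, 1.0f, 1);
    }
}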

Usage

From source file: com.elex.dmp.vectorizer.DictionaryVectorizer.java

License: Apache License

/**
 * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
 * tries to fix the maximum memory used by the feature chunk per node, thereby splitting the process across
 * multiple map/reduce jobs.
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
 *          are generated
 * @param tfVectorsFolderName
 *          The name of the folder in which the final output vectors will be stored
 * @param baseConf
 *          job configuration
 * @param normPower
 *          L_p norm to be computed
 * @param logNormalize
 *          whether to use log normalization         
 * @param minSupport
 *          the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
 *          sparse vector
 * @param maxNGramSize
 *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
 * @param minLLRValue
 *          minimum value of the log-likelihood ratio used to prune ngrams
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
 *          stage. It is recommended that you calculate this based on the number of cores and the free
 *          memory available per node. For example, with 2 cores and around 1GB of spare memory, a chunk
 *          size of around 400-500MB lets two simultaneous reducers create partial vectors without
 *          thrashing the system due to increased swapping
 */
public static void createTermFrequencyVectors(Path input, Path output, String tfVectorsFolderName,
        Configuration baseConf, int minSupport, int maxNGramSize, float minLLRValue, float normPower,
        boolean logNormalize, int numReducers, int chunkSizeInMegabytes, boolean sequentialAccess,
        boolean namedVectors) throws IOException, InterruptedException, ClassNotFoundException {
    Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING || normPower >= 0,
            "If specified normPower must be nonnegative", normPower);
    Preconditions.checkArgument(
            normPower == PartialVectorMerger.NO_NORMALIZING || (normPower > 1 && !Double.isInfinite(normPower))
                    || !logNormalize,
            "normPower must be > 1 and not infinite if log normalization is chosen", normPower);
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }
    if (minSupport < 0) {
        minSupport = DEFAULT_MIN_SUPPORT;
    }

    Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);

    int[] maxTermDimension = new int[1];
    List<Path> dictionaryChunks;
    if (maxNGramSize == 1) {
        startWordCounting(input, dictionaryJobPath, baseConf, minSupport);
        dictionaryChunks = createDictionaryChunks(dictionaryJobPath, output, baseConf, chunkSizeInMegabytes,
                maxTermDimension);
    } else {
        CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport, minLLRValue,
                numReducers);
        dictionaryChunks = createDictionaryChunks(
                new Path(new Path(output, DICTIONARY_JOB_FOLDER), CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
                baseConf, chunkSizeInMegabytes, maxTermDimension);
    }

    int partialVectorIndex = 0;
    Collection<Path> partialVectorPaths = Lists.newArrayList();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, baseConf, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
                maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
    }

    Configuration conf = new Configuration(baseConf);

    Path outputDir = new Path(output, tfVectorsFolderName);
    PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, conf, normPower, logNormalize,
            maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
    HadoopUtil.delete(conf, partialVectorPaths);
}
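A hedged sketch of driving the method above end to end follows; the paths, folder name, and tuning values are illustrative assumptions rather than values from this source file, and the imports assume the stock Mahout PartialVectorMerger is used for the NO_NORMALIZING constant.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;

import com.elex.dmp.vectorizer.DictionaryVectorizer;

public class CreateTfVectorsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical locations of the tokenized input and the vectorizer output.
        Path tokenizedDocs = new Path("tokenized-documents");
        Path output = new Path("vectors");
        DictionaryVectorizer.createTermFrequencyVectors(
                tokenizedDocs, output,
                "tf-vectors",                       // tfVectorsFolderName (assumed name)
                conf,
                2,                                  // minSupport
                2,                                  // maxNGramSize: unigrams and bigrams
                1.0f,                               // minLLRValue
                PartialVectorMerger.NO_NORMALIZING, // normPower: skip normalization
                false,                              // logNormalize
                1,                                  // numReducers
                200,                                // chunkSizeInMegabytes
                false,                              // sequentialAccess
                false);                             // namedVectors
    }
}

Because maxNGramSize is greater than 1 here, this call exercises the CollocDriver.generateAllGrams branch to build the ngram dictionary before the partial vectors are created and merged.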

From source file: com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License: Apache License

/**
 * Create Term Frequency (Tf) Vectors from the input set of documents in {@link SequenceFile} format. This
 * tries to fix the maximum memory used by the feature chunk per node, thereby splitting the process across
 * multiple map/reduce jobs.
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param output
 *          output directory where {@link org.apache.mahout.math.RandomAccessSparseVector}'s of the document
 *          are generated
 * @param tfVectorsFolderName
 *          The name of the folder in which the final output vectors will be stored
 * @param baseConf
 *          job configuration
 * @param normPower
 *          L_p norm to be computed
 * @param logNormalize
 *          whether to use log normalization         
 * @param minSupport
 *          the minimum frequency of the feature in the entire corpus to be considered for inclusion in the
 *          sparse vector
 * @param maxNGramSize
 *          1 = unigram, 2 = unigram and bigram, 3 = unigram, bigram and trigram
 * @param minLLRValue
 *          minimum value of the log-likelihood ratio used to prune ngrams
 * @param chunkSizeInMegabytes
 *          the size in MB of the feature => id chunk to be kept in memory at each node during Map/Reduce
 *          stage. It is recommended that you calculate this based on the number of cores and the free
 *          memory available per node. For example, with 2 cores and around 1GB of spare memory, a chunk
 *          size of around 400-500MB lets two simultaneous reducers create partial vectors without
 *          thrashing the system due to increased swapping
 */
public static void createTermFrequencyVectors(Path input, Path output, String tfVectorsFolderName,
        Configuration baseConf, int minSupport, int maxNGramSize, float minLLRValue, float normPower,
        boolean logNormalize, int numReducers, int chunkSizeInMegabytes, boolean sequentialAccess,
        boolean namedVectors) throws IOException, InterruptedException, ClassNotFoundException {
    Preconditions.checkArgument(normPower == PartialVectorMerger.NO_NORMALIZING || normPower >= 0,
            "If specified normPower must be nonnegative", normPower);
    Preconditions.checkArgument(
            normPower == PartialVectorMerger.NO_NORMALIZING || (normPower > 1 && !Double.isInfinite(normPower))
                    || !logNormalize,
            "normPower must be > 1 and not infinite if log normalization is chosen", normPower);
    if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
        chunkSizeInMegabytes = MIN_CHUNKSIZE;
    } else if (chunkSizeInMegabytes > MAX_CHUNKSIZE) { // 10GB
        chunkSizeInMegabytes = MAX_CHUNKSIZE;
    }
    if (minSupport < 0) {
        minSupport = DEFAULT_MIN_SUPPORT;
    }

    // Use the pre-built dictionary stored under the backup directory
    Path dictFilePath = new Path(PropertiesUtil.getBackUpDir(), DICTIONARY_FILE_FOLDER);

    Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);

    int[] maxTermDimension = new int[1];
    List<Path> dictionaryChunks;
    if (maxNGramSize == 1) {
        startWordCounting(input, dictionaryJobPath, baseConf, minSupport);
        dictionaryChunks = createDictionaryChunks(dictFilePath, output, baseConf, chunkSizeInMegabytes,
                maxTermDimension);
    } else {
        CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport, minLLRValue,
                numReducers);
        // Build the dictionary chunks from the fixed dictionary rather than from the generated ngram output
        dictionaryChunks = createDictionaryChunks(dictFilePath, output, baseConf, chunkSizeInMegabytes,
                maxTermDimension);
    }

    int partialVectorIndex = 0;
    Collection<Path> partialVectorPaths = Lists.newArrayList();
    for (Path dictionaryChunk : dictionaryChunks) {
        Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
        partialVectorPaths.add(partialVectorOutputPath);
        makePartialVectors(input, baseConf, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
                maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
    }

    Configuration conf = new Configuration(baseConf);

    Path outputDir = new Path(output, tfVectorsFolderName);
    PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, conf, normPower, logNormalize,
            maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
    HadoopUtil.delete(conf, partialVectorPaths);
}
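Compared with the DictionaryVectorizer variant above, this FixDictionaryVectorizer version differs only in where the dictionary chunks come from: in both the unigram and the ngram branch it reads a pre-built dictionary from DICTIONARY_FILE_FOLDER under PropertiesUtil.getBackUpDir(), so CollocDriver.generateAllGrams still runs, but its ngram output is not used to build the dictionary chunks.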