Example usage for org.apache.mahout.vectorizer DictionaryVectorizer MAX_NGRAMS

List of usage examples for org.apache.mahout.vectorizer DictionaryVectorizer MAX_NGRAMS

Introduction

On this page you can find usage examples for org.apache.mahout.vectorizer DictionaryVectorizer MAX_NGRAMS.

Prototype

String MAX_NGRAMS

To view the source code for org.apache.mahout.vectorizer DictionaryVectorizer MAX_NGRAMS, click Source Link.

Usage

From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java

License:Apache License

/**
 * Prepares this reducer: reads its settings from the job configuration and fills
 * {@code dictionary} from the first file listed in the DistributedCache.
 *
 * <p>The precondition check rejects a missing or empty cache-file list before any
 * setting is read.
 *
 * @param context reducer context that supplies the job configuration
 * @throws IOException declared by the overridden {@code setup}
 * @throws InterruptedException declared by the overridden {@code setup}
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration jobConf = context.getConfiguration();

    URI[] cachedUris = DistributedCache.getCacheFiles(jobConf);
    boolean hasCachedFile = cachedUris != null && cachedUris.length >= 1;
    Preconditions.checkArgument(hasCachedFile, "missing paths from the DistributedCache");

    // Vector settings; the current maxNGramSize value is kept when the key is absent.
    dimension = jobConf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = jobConf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = jobConf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
    maxNGramSize = jobConf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);

    // Only the path component of the first cached URI is used to locate the dictionary.
    String dictionaryLocation = cachedUris[0].getPath();
    Path dictionaryPath = new Path(dictionaryLocation);
    SequenceFileIterable<Writable, IntWritable> entries =
            new SequenceFileIterable<Writable, IntWritable>(dictionaryPath, true, jobConf);

    // Each record maps a word (key) to its id (value).
    for (Pair<Writable, IntWritable> entry : entries) {
        String word = entry.getFirst().toString();
        int wordId = entry.getSecond().get();
        dictionary.put(word, wordId);
    }
}

From source file:edu.rosehulman.TFPartialVectorReducer.java

License:Apache License

/**
 * Prepares this reducer: reads its settings from the job configuration and fills
 * {@code dictionary} from the single cached file returned by
 * {@code HadoopUtil.getSingleCachedFile}.
 *
 * @param context reducer context that supplies the job configuration
 * @throws IOException declared by the overridden {@code setup}
 * @throws InterruptedException declared by the overridden {@code setup}
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration jobConf = context.getConfiguration();

    // Vector settings; the current maxNGramSize value is kept when the key is absent.
    dimension = jobConf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = jobConf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = jobConf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
    maxNGramSize = jobConf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);

    // Dictionary lookup goes through HadoopUtil (see MAHOUT-1247).
    Path dictionaryPath = HadoopUtil.getSingleCachedFile(jobConf);
    SequenceFileIterable<Writable, IntWritable> entries =
            new SequenceFileIterable<Writable, IntWritable>(dictionaryPath, true, jobConf);

    // Each record maps a word (key) to its id (value).
    for (Pair<Writable, IntWritable> entry : entries) {
        String word = entry.getFirst().toString();
        int wordId = entry.getSecond().get();
        dictionary.put(word, wordId);
    }
}