Example usage for org.apache.mahout.vectorizer.common PartialVectorMerger NAMED_VECTOR

List of usage examples for org.apache.mahout.vectorizer.common PartialVectorMerger NAMED_VECTOR

Introduction

In this page you can find the example usage for org.apache.mahout.vectorizer.common PartialVectorMerger NAMED_VECTOR.

Prototype

String NAMED_VECTOR

To view the source code for org.apache.mahout.vectorizer.common PartialVectorMerger NAMED_VECTOR.

Click Source Link

Usage

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format//from   www . j a v a 2 s  .c  o m
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format//from w w w.  ja  v  a2s . com
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    URI[] localFiles = DistributedCache.getCacheFiles(conf);
    Preconditions.checkArgument(localFiles != null && localFiles.length >= 1,
            "missing paths from the DistributedCache");

    dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
    maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);

    Path dictionaryFile = new Path(localFiles[0].getPath());
    // key is word value is id
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile,
            true, conf)) {/*from w  w  w.j a v a 2s  .  c  om*/
        dictionary.put(record.getFirst().toString(), record.getSecond().get());
    }
}

From source file:edu.rosehulman.TFPartialVectorReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();

    dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
    maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);

    //MAHOUT-1247
    Path dictionaryFile = HadoopUtil.getSingleCachedFile(conf);
    // key is word value is id
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile,
            true, conf)) {// w w w. j  a v  a  2  s.  c om
        dictionary.put(record.getFirst().toString(), record.getSecond().get());
    }
}