Example usage for org.apache.hadoop.mapreduce Job setOutputKeyClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setOutputKeyClass.

Prototype

public void setOutputKeyClass(Class<?> theClass) throws IllegalStateException

Source Link

Document

Set the key class for the job output data.

Usage

From source file:com.dlyle.testh20.H2OTest.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "H2O Test");
    job.setJarByClass(H2OTest.class);
    job.setMapperClass(H2OMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.elephantscale.hbase.book.chapter1.SimpleMR.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: SimpleMR <in> <out>");
        return;/* w w  w  .  j a v  a2  s.c  o  m*/
    }
    Job job = new Job(conf, "SimpleMR");
    job.setJarByClass(SimpleMR.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "Calculating perplexity for " + modelPath;
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CachingCVB0PerplexityMapper.class);
    job.setMapperClass(CachingCVB0PerplexityMapper.class);
    job.setCombinerClass(DualDoubleSumReducer.class);
    job.setReducerClass(DualDoubleSumReducer.class);
    job.setNumReduceTasks(1);/*from  w  w  w. j  av  a 2s .c o  m*/
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, corpusPath);
    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
    FileOutputFormat.setOutputPath(job, outputPath);
    setModelPaths(job, modelPath);
    HadoopUtil.delete(conf, outputPath);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelPath.getParent(), iteration);
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static Job writeTopicModel(Configuration conf, Path modelInput, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CVB0Driver.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class);
    job.setNumReduceTasks(0);/*from  w w  w.  java  2  s. c om*/
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, modelInput);
    FileOutputFormat.setOutputPath(job, output);
    job.submit();
    return job;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setMapperClass(CVB0DocInferenceMapper.class);
    job.setNumReduceTasks(0);/* w  ww.j a  va2  s. c  o  m*/
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
    if (modelInput != null && fs.exists(modelInput)) {
        FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
        URI[] modelUris = new URI[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelUris[i] = statuses[i].getPath().toUri();
        }
        DistributedCache.setCacheFiles(modelUris, conf);
    }
    setModelPaths(job, modelInput);//bug:mahout-1147
    FileInputFormat.addInputPath(job, corpus);
    FileOutputFormat.setOutputPath(job, output);
    job.setJarByClass(CVB0Driver.class);
    job.submit();
    return job;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
        int iterationNumber, int maxIterations, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations,
            modelInput);/* w w  w.ja  v a  2s.com*/
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CVB0Driver.class);
    job.setMapperClass(CachingCVB0Mapper.class);
    job.setCombinerClass(VectorSumReducer.class);
    job.setReducerClass(VectorSumReducer.class);
    job.setNumReduceTasks(numReduceTasks);
    job.setOutputKeyClass(Text.class);//0.7IntWritable
    job.setOutputValueClass(VectorWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, corpusInput);
    FileOutputFormat.setOutputPath(job, modelOutput);
    setModelPaths(job, modelInput);
    HadoopUtil.delete(conf, modelOutput);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException(
                String.format("Failed to complete iteration %d stage 1", iterationNumber));
    }
}

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format//from  www. j a  va2s. c o  m
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format/* w  w  w.java  2  s .  c om*/
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);

    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format//from   w w w. j  a  va2  s  . c om
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format//from w  w w  . j  a v a 2  s  .co  m
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);

    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}