Example usage for org.apache.hadoop.mapreduce Job setOutputKeyClass

List of usage examples for org.apache.hadoop.mapreduce Job setOutputKeyClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setOutputKeyClass.

Prototype

public void setOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Source Link

Document

Set the key class for the job output data.

Usage

From source file:com.dlyle.testh20.H2OTest.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, "H2O Test");
    job.setJarByClass(H2OTest.class);
    job.setMapperClass(H2OMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.elephantscale.hbase.book.chapter1.SimpleMR.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: SimpleMR <in> <out>");
        return;/* w w  w  .  j a v  a2  s.c  o  m*/
    }
    Job job = new Job(conf, "SimpleMR");
    job.setJarByClass(SimpleMR.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    job.waitForCompletion(true);
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = "Calculating perplexity for " + modelPath;
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CachingCVB0PerplexityMapper.class);
    job.setMapperClass(CachingCVB0PerplexityMapper.class);
    job.setCombinerClass(DualDoubleSumReducer.class);
    job.setReducerClass(DualDoubleSumReducer.class);
    job.setNumReduceTasks(1);/*from  w  w  w. j  av  a 2s .c o  m*/
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, corpusPath);
    Path outputPath = perplexityPath(modelPath.getParent(), iteration);
    FileOutputFormat.setOutputPath(job, outputPath);
    setModelPaths(job, modelPath);
    HadoopUtil.delete(conf, outputPath);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
    }
    return readPerplexity(conf, modelPath.getParent(), iteration);
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static Job writeTopicModel(Configuration conf, Path modelInput, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    String jobName = String.format("Writing final topic/term distributions from %s to %s", modelInput, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CVB0Driver.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CVB0TopicTermVectorNormalizerMapper.class);
    job.setNumReduceTasks(0);/*from  w w  w.  java  2  s. c om*/
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, modelInput);
    FileOutputFormat.setOutputPath(job, output);
    job.submit();
    return job;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setMapperClass(CVB0DocInferenceMapper.class);
    job.setNumReduceTasks(0);/* w  ww.j a  va2  s. c  o  m*/
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
    if (modelInput != null && fs.exists(modelInput)) {
        FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
        URI[] modelUris = new URI[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelUris[i] = statuses[i].getPath().toUri();
        }
        DistributedCache.setCacheFiles(modelUris, conf);
    }
    setModelPaths(job, modelInput);//bug:mahout-1147
    FileInputFormat.addInputPath(job, corpus);
    FileOutputFormat.setOutputPath(job, output);
    job.setJarByClass(CVB0Driver.class);
    job.submit();
    return job;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

public static void runIteration(Configuration conf, Path corpusInput, Path modelInput, Path modelOutput,
        int iterationNumber, int maxIterations, int numReduceTasks)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Iteration %d of %d, input path: %s", iterationNumber, maxIterations,
            modelInput);/* w w  w.ja  v a  2s.com*/
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setJarByClass(CVB0Driver.class);
    job.setMapperClass(CachingCVB0Mapper.class);
    job.setCombinerClass(VectorSumReducer.class);
    job.setReducerClass(VectorSumReducer.class);
    job.setNumReduceTasks(numReduceTasks);
    job.setOutputKeyClass(Text.class);//0.7IntWritable
    job.setOutputValueClass(VectorWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.addInputPath(job, corpusInput);
    FileOutputFormat.setOutputPath(job, modelOutput);
    setModelPaths(job, modelInput);
    HadoopUtil.delete(conf, modelOutput);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException(
                String.format("Failed to complete iteration %d stage 1", iterationNumber));
    }
}

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format//from  www. j a  va2s. c o  m
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format/* w  w  w.java  2  s .  c om*/
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);

    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents has to be
 * in the {@link SequenceFile} format//from   w w w. j  a  va2  s  . c om
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory were the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format//from w  w w  . j  a v a 2  s  .co  m
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);

    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}