Example usage for org.apache.hadoop.mapreduce Job setOutputValueClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.setOutputValueClass.

Prototype

public void setOutputValueClass(Class<?> theClass) throws IllegalStateException

Source Link

Document

Set the value class for job outputs.

Usage

From source file:com.github.milind.NumberAdditionPerLine.java

/**
 * Configures and runs the "Addition of Numbers Per Line" job, then exits the
 * JVM with status 0 when the job succeeds and 1 otherwise.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @throws Exception if the job cannot be configured or executed
 */
public static void main(String[] args) throws Exception {
    final Job additionJob = Job.getInstance(new Configuration(), "Addition of Numbers Per Line");

    // Only a mapper is registered; the number of reduce tasks is set to zero.
    additionJob.setNumReduceTasks(0);
    additionJob.setMapperClass(NumberAdditionPerLineMapper.class);
    additionJob.setJarByClass(NumberAdditionPerLine.class);

    // Key/value classes of the job output.
    additionJob.setOutputValueClass(IntWritable.class);
    additionJob.setOutputKeyClass(Text.class);

    // Input and output locations come straight from the command line.
    FileInputFormat.addInputPath(additionJob, new Path(args[0]));
    FileOutputFormat.setOutputPath(additionJob, new Path(args[1]));

    final boolean succeeded = additionJob.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}

From source file:com.github.sakserv.minicluster.mapreduce.Driver.java

License:Apache License

/**
 * Runs the word/sum job on the given input and output paths.
 *
 * <p>Prints a usage message and exits with status -1 unless exactly two
 * arguments are supplied. The job's completion status is not inspected.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @throws Exception if the job cannot be configured or executed
 */
public static void main(String[] args) throws Exception {

    final boolean wrongArgCount = args.length != 2;
    if (wrongArgCount) {
        System.out.println("usage: [input] [output]");
        System.exit(-1);
    }

    // Create a default Configuration only when none has been assigned yet.
    if (configuration == null) {
        configuration = new Configuration();
    }

    final Job wordJob = Job.getInstance(configuration);
    wordJob.setJarByClass(Driver.class);

    // Processing classes.
    wordJob.setMapperClass(WordMapper.class);
    wordJob.setReducerClass(SumReducer.class);

    // Input side.
    wordJob.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(wordJob, new Path(args[0]));

    // Output side.
    wordJob.setOutputFormatClass(TextOutputFormat.class);
    wordJob.setOutputKeyClass(Text.class);
    wordJob.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(wordJob, new Path(args[1]));

    // Blocks until the job finishes; the boolean result is intentionally unused here.
    wordJob.waitForCompletion(true);
}

From source file:com.github.sample.mapreduce.WordCount.java

License:Apache License

/**
 * Runs the "word count" job against the HDFS instance at
 * {@code hdfs://localhost:9000}.
 *
 * <p>Generic Hadoop options are stripped from {@code args} first; the remaining
 * arguments are one or more input paths followed by a single output path. The
 * JVM exits with status 2 on a usage error, 0 when the job succeeds and 1 when
 * it fails.
 *
 * @param args command-line arguments: {@code <in> [<in>...] <out>}
 * @throws Exception if the job cannot be configured or executed
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    System.out.println("coder");

    // "fs.default.name" is deprecated; "fs.defaultFS" is its replacement key.
    // It is set before the generic options are parsed, so command-line
    // options are applied to the configuration afterwards.
    conf.set("fs.defaultFS", "hdfs://localhost:9000");

    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // The same class is used as combiner and as reducer.
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Every argument except the last one is an input path; the last is the output path.
    int outputIndex = otherArgs.length - 1;
    for (int i = 0; i < outputIndex; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[outputIndex]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.github.sandgorgon.parmr.Main.java

License:Open Source License

/**
 * Configures and runs a job that reads a Parquet file through
 * {@code AvroParquetInputFormat} and writes {@code Text}/{@code Text} output.
 *
 * <p>The Parquet schema is read from the footer of the input file and
 * converted to an Avro schema, which is used as the Avro read schema. A record
 * filter and a requested projection (fields {@code name} and
 * {@code favorite_number}) are also registered on the input format.
 *
 * @param args {@code args[0]} is the input file, {@code args[1]} the output path
 * @return -1 when fewer than two arguments are given, 0 when the job succeeds,
 *         1 when it fails
 * @throws Exception if the footer cannot be read or the job cannot be run
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: parmr <input file> <output path>");
        return -1;
    }

    // The queue name must be set before the Job is created from this configuration.
    Configuration conf = super.getConf();
    conf.set("mapreduce.job.queuename", "prod");

    Job job = Job.getInstance(conf);
    job.setJobName(jobName);
    job.setJarByClass(Main.class);

    // Parquet schema: read from the input file itself the schema that we will be assuming.
    Path infile = new Path(args[0]);
    List<Footer> footers = ParquetFileReader.readFooters(conf, infile.getFileSystem(conf).getFileStatus(infile),
            true);
    // Only the first footer is consulted.
    MessageType schema = footers.get(0).getParquetMetadata().getFileMetaData().getSchema();

    // Avro schema: convert the Parquet schema to an Avro schema.
    AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter();
    Schema avroSchema = avroSchemaConverter.convert(schema);

    // Mapper
    job.setMapperClass(UserMapper.class);

    // This works for predicate pushdown on record assembly read.
    AvroParquetInputFormat.setUnboundRecordFilter(job, UserRecordFilter.class);

    // Reuse the Path built above instead of constructing it a second time.
    AvroParquetInputFormat.addInputPath(job, infile);
    AvroParquetInputFormat.setAvroReadSchema(job, avroSchema);
    job.setInputFormatClass(AvroParquetInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // If you needed to return an avro object from the mapper, refer to this...
    //job.setMapOutputValueClass(AvroValue.class);
    //AvroJob.setMapOutputValueSchema(job, avroSchema);

    // Reducer
    job.setReducerClass(UserReducer.class);

    // Output
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // If we need to return an avro class again, refer to this...
    //job.setOutputFormatClass(AvroParquetOutputFormat.class);
    //AvroParquetOutputFormat.setOutputPath(job, new Path(args[1]));
    //AvroParquetOutputFormat.setSchema(job, avroSchema);
    //job.setOutputKeyClass(Void.class);
    //job.setOutputValueClass(GenericRecord.class);

    // Rough way of testing the projection side of things.
    // Schema.Parser replaces the deprecated static Schema.parse(String).
    String projectionJson = "{\"namespace\": \"com.github.sandgorgon.parmr.avro\",\n"
            + " \"type\": \"record\",\n"
            + " \"name\": \"User\",\n"
            + " \"fields\": [\n"
            + "     {\"name\": \"name\", \"type\": \"string\"},\n"
            + "     {\"name\": \"favorite_number\",  \"type\": [\"int\", \"null\"]}\n"
            //                "     {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n" +
            + " ]\n"
            + "}\n";
    AvroParquetInputFormat.setRequestedProjection(job, new Schema.Parser().parse(projectionJson));

    // Do the deed!
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.github.ygf.pagerank.InLinks.java

License:Apache License

/**
 * Runs the "InLinks:Computation" job, which computes the number of in-links
 * for every page. The implementation is very similar to the classic word
 * count example.
 *
 * @param conf      configuration the job is created from
 * @param linksFile path added as the job's input
 * @param outputDir parent directory; results go to its "inlinks" subdirectory
 * @throws Exception if the job cannot be configured or executed
 */
private void computeInLinks(Configuration conf, Path linksFile, Path outputDir) throws Exception {
    final Job inLinksJob = Job.getInstance(conf, "InLinks:Computation");
    inLinksJob.setJarByClass(InLinks.class);

    // Input: text lines from the links file.
    inLinksJob.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(inLinksJob, linksFile);

    // Processing: the reducer class doubles as the combiner.
    inLinksJob.setMapperClass(InLinksMapper.class);
    inLinksJob.setCombinerClass(InLinksReducer.class);
    inLinksJob.setReducerClass(InLinksReducer.class);

    // Output: IntWritable/IntWritable pairs in a sequence file.
    final Path inLinksDir = new Path(outputDir, "inlinks");
    inLinksJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    inLinksJob.setOutputKeyClass(IntWritable.class);
    inLinksJob.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(inLinksJob, inLinksDir);

    // Blocks until the job finishes; the success flag is not inspected.
    inLinksJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.InLinks.java

License:Apache License

/**
 * Runs the "InLinks:TopN" job over the "inlinks" subdirectory of
 * {@code outputDir} and writes text output to "inlinks-top" followed by the
 * configured result count.
 *
 * @param conf      configuration the job is created from; must define the
 *                  integer property {@code inlinks.top_results}
 * @param outputDir directory holding the "inlinks" input and receiving the output
 * @throws Exception if the property is not a valid integer or the job cannot be run
 */
private void summarizeResults(Configuration conf, Path outputDir) throws Exception {
    final String topResultsSetting = conf.get("inlinks.top_results");
    final int topResults = Integer.parseInt(topResultsSetting);

    final Job topNJob = Job.getInstance(conf, "InLinks:TopN");
    topNJob.setJarByClass(InLinks.class);

    // A single reduce task is used for this job.
    topNJob.setNumReduceTasks(1);

    // Input: the sequence files found under "inlinks".
    topNJob.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.addInputPath(topNJob, new Path(outputDir, "inlinks"));

    // Map stage.
    topNJob.setMapperClass(InLinksTopNMapper.class);
    topNJob.setMapOutputKeyClass(IntWritable.class);
    topNJob.setMapOutputValueClass(IntWritable.class);

    // Reduce stage and final output.
    topNJob.setReducerClass(InLinksTopNReducer.class);
    topNJob.setOutputFormatClass(TextOutputFormat.class);
    topNJob.setOutputKeyClass(IntWritable.class);
    topNJob.setOutputValueClass(IntWritable.class);
    FileOutputFormat.setOutputPath(topNJob, new Path(outputDir, "inlinks-top" + topResults));

    // Blocks until the job finishes; the success flag is not inspected.
    topNJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

/**
 * Runs the "PageRank:Matrix" job.
 *
 * <p>The job reads the links-simple-sorted.txt input file and generates the
 * corresponding transition matrix. The matrix is divided into square blocks
 * and each block is represented by its nonzero entries. See Section 5.2 (and
 * 5.2.3 in particular) of Mining of Massive Datasets
 * (http://infolab.stanford.edu/~ullman/mmds.html) for details. The output is
 * written to the "M" subdirectory of the output directory.
 *
 * @param conf      configuration the job is created from
 * @param linksFile path added as the job's input
 * @param outputDir parent directory of the "M" output subdirectory
 * @throws Exception if the job cannot be configured or executed
 */
private void createTransitionMatrix(Configuration conf, Path linksFile, Path outputDir) throws Exception {
    final Job matrixJob = Job.getInstance(conf, "PageRank:Matrix");
    matrixJob.setJarByClass(PageRank.class);

    // Input: text lines from the links file.
    matrixJob.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(matrixJob, linksFile);

    // Map stage, with map output compression switched on in the job configuration.
    matrixJob.setMapperClass(PageRankMatrixMapper.class);
    matrixJob.setMapOutputKeyClass(ShortArrayWritable.class);
    matrixJob.setMapOutputValueClass(ShortArrayWritable.class);
    final Configuration jobConf = matrixJob.getConfiguration();
    jobConf.setBoolean("mapreduce.map.output.compress", true);
    jobConf.setClass("mapreduce.map.output.compress.codec", DefaultCodec.class, CompressionCodec.class);

    // Reduce stage.
    matrixJob.setReducerClass(PageRankMatrixReducer.class);

    // Output: block-compressed sequence files under "M".
    matrixJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    matrixJob.setOutputKeyClass(ShortArrayWritable.class);
    matrixJob.setOutputValueClass(MatrixBlockWritable.class);
    SequenceFileOutputFormat.setCompressOutput(matrixJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(matrixJob, CompressionType.BLOCK);
    SequenceFileOutputFormat.setOutputCompressorClass(matrixJob, DefaultCodec.class);
    FileOutputFormat.setOutputPath(matrixJob, new Path(outputDir, "M"));

    // Blocks until the job finishes; the success flag is not inspected.
    matrixJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

/**
 * Runs the "PageRank:Iteration" job for iteration {@code iter}.
 *
 * <p>The job performs one iteration of the power iteration method to compute
 * PageRank. The map task processes each block M_{i,j}, loads the
 * corresponding stripe j of the vector v_{k-1} and produces the partial
 * result of the stripe i of the vector v_k. The reduce task sums all the
 * partial results of v_k and adds the teleportation factor (the combiner only
 * sums the partial results). See Section 5.2 (and 5.2.3 in particular) of
 * Mining of Massive Datasets (http://infolab.stanford.edu/~ullman/mmds.html)
 * for details. The output is written to a "vk" subdirectory of the output
 * directory, where k is the iteration number. MapFileOutputFormat is used to
 * keep an array of the stripes of v.
 *
 * @param iter      iteration number, used to name the output subdirectory
 * @param conf      configuration the job is created from
 * @param outputDir directory holding the "M" input and receiving the output
 * @throws Exception if the job cannot be configured or executed
 */
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {
    final Job iterationJob = Job.getInstance(conf, "PageRank:Iteration");
    iterationJob.setJarByClass(PageRank.class);

    // Input: the sequence files under "M".
    iterationJob.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.addInputPath(iterationJob, new Path(outputDir, "M"));

    // Map stage.
    iterationJob.setMapperClass(PageRankIterationMapper.class);
    iterationJob.setMapOutputKeyClass(ShortWritable.class);
    iterationJob.setMapOutputValueClass(FloatArrayWritable.class);

    // Combine and reduce stages.
    iterationJob.setCombinerClass(PageRankIterationCombiner.class);
    iterationJob.setReducerClass(PageRankIterationReducer.class);

    // Output: a MapFile under "v<iter>".
    final Path iterationDir = new Path(outputDir, "v" + iter);
    iterationJob.setOutputFormatClass(MapFileOutputFormat.class);
    iterationJob.setOutputKeyClass(ShortWritable.class);
    iterationJob.setOutputValueClass(FloatArrayWritable.class);
    FileOutputFormat.setOutputPath(iterationJob, iterationDir);

    // Blocks until the job finishes; the success flag is not inspected.
    iterationJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.PageRank.java

License:Apache License

/**
 * Runs the "PageRank:TopN" job for iteration {@code iter}.
 *
 * <p>The job creates a plain text file with the top N PageRanks and the
 * titles of the pages. Each map task emits the top N PageRanks it receives,
 * and the reduce task merges the partial results into the global top N
 * PageRanks. A single reducer is used so that it has access to all the
 * individual top N PageRanks from the mappers. The reducer looks up the
 * titles in the index built by TitleIndex. The job was designed considering
 * that N is small.
 *
 * @param iter      iteration number selecting the "v&lt;iter&gt;" input subdirectory
 * @param conf      configuration the job is created from; must define the
 *                  integer property {@code pagerank.top_results}
 * @param outputDir directory holding the input and receiving the output
 * @throws Exception if the property is not a valid integer or the job cannot be run
 */
private void summarizeResults(int iter, Configuration conf, Path outputDir) throws Exception {
    final String topResultsSetting = conf.get("pagerank.top_results");
    final int topResults = Integer.parseInt(topResultsSetting);

    final Job topNJob = Job.getInstance(conf, "PageRank:TopN");
    topNJob.setJarByClass(PageRank.class);

    // Exactly one reduce task merges the per-mapper results.
    topNJob.setNumReduceTasks(1);

    // Input: the "v<iter>" directory.
    topNJob.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.addInputPath(topNJob, new Path(outputDir, "v" + iter));

    // Map stage.
    topNJob.setMapperClass(PageRankTopNMapper.class);
    topNJob.setMapOutputKeyClass(FloatWritable.class);
    topNJob.setMapOutputValueClass(IntWritable.class);

    // Reduce stage and final text output.
    topNJob.setReducerClass(PageRankTopNReducer.class);
    topNJob.setOutputFormatClass(TextOutputFormat.class);
    topNJob.setOutputKeyClass(FloatWritable.class);
    topNJob.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(topNJob, new Path(outputDir, "v" + iter + "-top" + topResults));

    // Blocks until the job finishes; the success flag is not inspected.
    topNJob.waitForCompletion(true);
}

From source file:com.github.ygf.pagerank.TitleIndex.java

License:Apache License

/**
 * Runs the "TitleIndex" job, which creates a MapFile of the page titles
 * indexed by the page id.
 *
 * @param args {@code args[0]} is the titles file, {@code args[1]} the output directory
 * @return 2 when the argument count is wrong, 0 when the job succeeds,
 *         1 when the job fails
 * @throws Exception if the job cannot be configured or executed
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Usage: TitleIndex <titles-sorted.txt> <output-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path titlesFile = new Path(args[0]);
    Path outputDir = new Path(args[1]);

    Configuration conf = getConf();

    // Do not create _SUCCESS files. MapFileOutputFormat.getReaders calls
    // try to read the _SUCCESS as another MapFile dir.
    conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false");

    // This job creates a MapFile of the titles indexed by the page id.
    // UnsplittableTextInputFormat is used to ensure that the same map task
    // gets all the lines in the titlesFile and it can count the line
    // numbers. The number of reduce tasks is set to 0.

    Job job = Job.getInstance(conf, "TitleIndex");

    // NOTE(review): the jar is located via InLinks.class rather than this
    // tool's own class; presumably both ship in the same jar - confirm.
    job.setJarByClass(InLinks.class);
    job.setInputFormatClass(UnsplittableTextInputFormat.class);
    job.setMapperClass(TitleIndexMapper.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, titlesFile);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setNumReduceTasks(0);

    // Propagate job failure to the caller instead of always reporting success.
    return job.waitForCompletion(true) ? 0 : 1;
}