List of usage examples for org.apache.hadoop.mapreduce Job setOutputValueClass
public void setOutputValueClass(Class<?> theClass) throws IllegalStateException
From source file:com.github.milind.NumberAdditionPerLine.java
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "Addition of Numbers Per Line"); job.setJarByClass(NumberAdditionPerLine.class); job.setMapperClass(NumberAdditionPerLineMapper.class); job.setNumReduceTasks(0);//from w w w .j a va 2s.c o m job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.github.sakserv.minicluster.mapreduce.Driver.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length != 2) { System.out.println("usage: [input] [output]"); System.exit(-1);// ww w .j a v a 2s .c om } if (null == configuration) { configuration = new Configuration(); } Job job = Job.getInstance(configuration); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(WordMapper.class); job.setReducerClass(SumReducer.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJarByClass(Driver.class); job.waitForCompletion(true); }
From source file:com.github.sample.mapreduce.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); System.out.println("coder"); //conf.addResource("etc/hadoop/hadoop-local.xml"); //conf.setBoolean("mapreduce.output.fileoutputformat.compress", true); //conf.setClass("mapreduce.output.fileoutputformat.compress.codec", GzipCodec.class, CompressionCodec.class); conf.set("fs.default.name", "hdfs://localhost:9000"); /*// ww w .java 2s.c om conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName() ); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName() ); */ String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length < 2) { System.err.println("Usage: wordcount <in> [<in>...] <out>"); System.exit(2); } Job job = Job.getInstance(conf, "word count"); job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); for (int i = 0; i < otherArgs.length - 1; ++i) { FileInputFormat.addInputPath(job, new Path(otherArgs[i])); } FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.github.sandgorgon.parmr.Main.java
License:Open Source License
@Override public int run(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: parmr <input file> <output path>"); return -1; }//from www . j av a 2s . co m Configuration conf = super.getConf(); conf.set("mapreduce.job.queuename", "prod"); Job job = Job.getInstance(conf); job.setJobName(jobName); job.setJarByClass(Main.class); // Parquet Schema // Read from the input file itself the schema that we will be assuming Path infile = new Path(args[0]); List<Footer> footers = ParquetFileReader.readFooters(conf, infile.getFileSystem(conf).getFileStatus(infile), true); MessageType schema = footers.get(0).getParquetMetadata().getFileMetaData().getSchema(); // Avro Schema // Convert the Parquet schema to an Avro schema AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(); Schema avroSchema = avroSchemaConverter.convert(schema); // Set the Mapper job.setMapperClass(UserMapper.class); // This works for predicate pushdown on record assembly read. AvroParquetInputFormat.setUnboundRecordFilter(job, UserRecordFilter.class); AvroParquetInputFormat.addInputPath(job, new Path(args[0])); AvroParquetInputFormat.setAvroReadSchema(job, avroSchema); job.setInputFormatClass(AvroParquetInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // If you needed to return an avro object from the mapper, refer to this... //job.setMapOutputValueClass(AvroValue.class); //AvroJob.setMapOutputValueSchema(job, avroSchema); // Reducer job.setReducerClass(UserReducer.class); // Output job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); // If we need to return an avro class again, refer to this... //job.setOutputFormatClass(AvroParquetOutputFormat.class); //AvroParquetOutputFormat.setOutputPath(job, new Path(args[1])); //AvroParquetOutputFormat.setSchema(job, avroSchema); //job.setOutputKeyClass(Void.class); //job.setOutputValueClass(GenericRecord.class); // Rough way of testing the projection side of things. AvroParquetInputFormat.setRequestedProjection(job, Schema.parse("{\"namespace\": \"com.github.sandgorgon.parmr.avro\",\n" + " \"type\": \"record\",\n" + " \"name\": \"User\",\n" + " \"fields\": [\n" + " {\"name\": \"name\", \"type\": \"string\"},\n" + " {\"name\": \"favorite_number\", \"type\": [\"int\", \"null\"]}\n" + // " {\"name\": \"favorite_color\", \"type\": [\"string\", \"null\"]}\n" + " ]\n" + "}\n" + "")); // Do the deed! int completion = job.waitForCompletion(true) ? 0 : 1; return completion; }
From source file:com.github.ygf.pagerank.InLinks.java
License:Apache License
private void computeInLinks(Configuration conf, Path linksFile, Path outputDir) throws Exception { // This job computes the number of in-links for every page. The // implementation is very similar to the classic word count example. Job job = Job.getInstance(conf, "InLinks:Computation"); job.setJarByClass(InLinks.class); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(InLinksMapper.class); job.setCombinerClass(InLinksReducer.class); job.setReducerClass(InLinksReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, linksFile); FileOutputFormat.setOutputPath(job, new Path(outputDir, "inlinks")); job.waitForCompletion(true);/*from w w w .j a v a2 s. co m*/ }
From source file:com.github.ygf.pagerank.InLinks.java
License:Apache License
private void summarizeResults(Configuration conf, Path outputDir) throws Exception { int topResults = Integer.parseInt(conf.get("inlinks.top_results")); Job job = Job.getInstance(conf, "InLinks:TopN"); job.setJarByClass(InLinks.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(InLinksTopNMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(InLinksTopNReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(outputDir, "inlinks")); FileOutputFormat.setOutputPath(job, new Path(outputDir, "inlinks-top" + topResults)); job.setNumReduceTasks(1);//from ww w . j a v a 2 s.c o m job.waitForCompletion(true); }
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void createTransitionMatrix(Configuration conf, Path linksFile, Path outputDir) throws Exception { // This job reads the links-simple-sorted.txt input file and generates // the corresponding transition matrix. The matrix is divided into // square blocks and each block is represented by the nonzero entries. // See Section 5.2 (and 5.2.3 in particular) of Mining of Massive Datasets // (http://infolab.stanford.edu/~ullman/mmds.html) for details. // The output is written to the "M" subdir in the output dir. Job job = Job.getInstance(conf, "PageRank:Matrix"); job.setJarByClass(PageRank.class); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(PageRankMatrixMapper.class); job.getConfiguration().setBoolean("mapreduce.map.output.compress", true); job.getConfiguration().setClass("mapreduce.map.output.compress.codec", DefaultCodec.class, CompressionCodec.class); job.setMapOutputKeyClass(ShortArrayWritable.class); job.setMapOutputValueClass(ShortArrayWritable.class); job.setReducerClass(PageRankMatrixReducer.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(ShortArrayWritable.class); job.setOutputValueClass(MatrixBlockWritable.class); FileInputFormat.addInputPath(job, linksFile); FileOutputFormat.setOutputPath(job, new Path(outputDir, "M")); job.waitForCompletion(true);//from w ww. j a v a 2 s.com }
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception { // This job performs an iteration of the power iteration method to // compute PageRank. The map task processes each block M_{i,j}, loads // the corresponding stripe j of the vector v_{k-1} and produces the // partial result of the stripe i of the vector v_k. The reduce task // sums all the partial results of v_k and adds the teleportation factor // (the combiner only sums all the partial results). See Section 5.2 // (and 5.2.3 in particular) of Mining of Massive Datasets // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The // output is written in a "vk" subdir of the output dir, where k is the // iteration number. MapFileOutputFormat is used to keep an array of the // stripes of v. Job job = Job.getInstance(conf, "PageRank:Iteration"); job.setJarByClass(PageRank.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(PageRankIterationMapper.class); job.setMapOutputKeyClass(ShortWritable.class); job.setMapOutputValueClass(FloatArrayWritable.class); job.setCombinerClass(PageRankIterationCombiner.class); job.setReducerClass(PageRankIterationReducer.class); job.setOutputFormatClass(MapFileOutputFormat.class); job.setOutputKeyClass(ShortWritable.class); job.setOutputValueClass(FloatArrayWritable.class); FileInputFormat.addInputPath(job, new Path(outputDir, "M")); FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter)); job.waitForCompletion(true);/*from w ww . j a v a2 s . c o m*/ }
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void summarizeResults(int iter, Configuration conf, Path outputDir) throws Exception { // This job creates a plain text file with the top N PageRanks and the // titles of the pages. Each map task emits the top N PageRanks it // receives, and the reduce task merges the partial results into the // global top N PageRanks. A single reducer is used in the job in order // to have access to all the individual top N PageRanks from the // mappers. The reducer looks up the titles in the index built by // TitleIndex. This job was designed considering that N is small. int topResults = Integer.parseInt(conf.get("pagerank.top_results")); Job job = Job.getInstance(conf, "PageRank:TopN"); job.setJarByClass(PageRank.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(PageRankTopNMapper.class); job.setMapOutputKeyClass(FloatWritable.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(PageRankTopNReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(outputDir, "v" + iter)); FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter + "-top" + topResults)); job.setNumReduceTasks(1);//from w ww . ja va2 s . c om job.waitForCompletion(true); }
From source file:com.github.ygf.pagerank.TitleIndex.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 2) { System.out.println("Usage: TitleIndex <titles-sorted.txt> <output-dir>"); ToolRunner.printGenericCommandUsage(System.out); return 2; }// w ww. ja va 2s. co m Path titlesFile = new Path(args[0]); Path outputDir = new Path(args[1]); Configuration conf = getConf(); // Do not create _SUCCESS files. MapFileOutputFormat.getReaders calls // try to read the _SUCCESS as another MapFile dir. conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"); // This job creates a MapFile of the titles indexed by the page id. // UnsplittableTextInputFormat is used to ensure that the same map task // gets all the lines in the titlesFile and it can count the line // numbers. The number of reduce tasks is set to 0. Job job = Job.getInstance(conf, "TitleIndex"); job.setJarByClass(InLinks.class); job.setInputFormatClass(UnsplittableTextInputFormat.class); job.setMapperClass(TitleIndexMapper.class); job.setOutputFormatClass(MapFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, titlesFile); FileOutputFormat.setOutputPath(job, outputDir); job.setNumReduceTasks(0); job.waitForCompletion(true); return 0; }