List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void createTransitionMatrix(Configuration conf, Path linksFile, Path outputDir) throws Exception { // This job reads the links-simple-sorted.txt input file and generates // the corresponding transition matrix. The matrix is divided into // square blocks and each block is represented by the nonzero entries. // See Section 5.2 (and 5.2.3 in particular) of Mining of Massive Datasets // (http://infolab.stanford.edu/~ullman/mmds.html) for details. // The output is written to the "M" subdir in the output dir. Job job = Job.getInstance(conf, "PageRank:Matrix"); job.setJarByClass(PageRank.class); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(PageRankMatrixMapper.class); job.getConfiguration().setBoolean("mapreduce.map.output.compress", true); job.getConfiguration().setClass("mapreduce.map.output.compress.codec", DefaultCodec.class, CompressionCodec.class); job.setMapOutputKeyClass(ShortArrayWritable.class); job.setMapOutputValueClass(ShortArrayWritable.class); job.setReducerClass(PageRankMatrixReducer.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(ShortArrayWritable.class); job.setOutputValueClass(MatrixBlockWritable.class); FileInputFormat.addInputPath(job, linksFile); FileOutputFormat.setOutputPath(job, new Path(outputDir, "M")); job.waitForCompletion(true);/*from ww w . j a v a2 s.c om*/ }
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception { // This job performs an iteration of the power iteration method to // compute PageRank. The map task processes each block M_{i,j}, loads // the corresponding stripe j of the vector v_{k-1} and produces the // partial result of the stripe i of the vector v_k. The reduce task // sums all the partial results of v_k and adds the teleportation factor // (the combiner only sums all the partial results). See Section 5.2 // (and 5.2.3 in particular) of Mining of Massive Datasets // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The // output is written in a "vk" subdir of the output dir, where k is the // iteration number. MapFileOutputFormat is used to keep an array of the // stripes of v. Job job = Job.getInstance(conf, "PageRank:Iteration"); job.setJarByClass(PageRank.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(PageRankIterationMapper.class); job.setMapOutputKeyClass(ShortWritable.class); job.setMapOutputValueClass(FloatArrayWritable.class); job.setCombinerClass(PageRankIterationCombiner.class); job.setReducerClass(PageRankIterationReducer.class); job.setOutputFormatClass(MapFileOutputFormat.class); job.setOutputKeyClass(ShortWritable.class); job.setOutputValueClass(FloatArrayWritable.class); FileInputFormat.addInputPath(job, new Path(outputDir, "M")); FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter)); job.waitForCompletion(true);/*w w w . j a v a2 s . co m*/ }
From source file:com.github.ygf.pagerank.PageRank.java
License:Apache License
private void summarizeResults(int iter, Configuration conf, Path outputDir) throws Exception { // This job creates a plain text file with the top N PageRanks and the // titles of the pages. Each map task emits the top N PageRanks it // receives, and the reduce task merges the partial results into the // global top N PageRanks. A single reducer is used in the job in order // to have access to all the individual top N PageRanks from the // mappers. The reducer looks up the titles in the index built by // TitleIndex. This job was designed considering that N is small. int topResults = Integer.parseInt(conf.get("pagerank.top_results")); Job job = Job.getInstance(conf, "PageRank:TopN"); job.setJarByClass(PageRank.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(PageRankTopNMapper.class); job.setMapOutputKeyClass(FloatWritable.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(PageRankTopNReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(outputDir, "v" + iter)); FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter + "-top" + topResults)); job.setNumReduceTasks(1);//from www. j a v a 2 s.c om job.waitForCompletion(true); }
From source file:com.google.cloud.bigtable.mapreduce.Import.java
License:Open Source License
/** * Sets up the actual job./*ww w. j a v a 2s .c o m*/ * @param conf The current configuration. * @param args The command line parameters. * @return The newly created job. * @throws IOException When setting up the job fails. */ public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException { TableName tableName = TableName.valueOf(args[0]); conf.set(TABLE_NAME, tableName.getNameAsString()); Path inputDir = new Path(args[1]); Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName)); job.setJarByClass(Importer.class); FileInputFormat.setInputPaths(job, inputDir); job.setInputFormatClass(SequenceFileInputFormat.class); String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY); // make sure we get the filter in the jars try { Class<? extends Filter> filter = conf.getClass(FILTER_CLASS_CONF_KEY, null, Filter.class); if (filter != null) { TableMapReduceUtil.addDependencyJars(conf, filter); } } catch (Exception e) { throw new IOException(e); } if (hfileOutPath != null) { job.setMapperClass(KeyValueImporter.class); try (Connection conn = ConnectionFactory.createConnection(conf); Table table = conn.getTable(tableName); RegionLocator regionLocator = conn.getRegionLocator(tableName)) { job.setReducerClass(KeyValueSortReducer.class); Path outputDir = new Path(hfileOutPath); FileOutputFormat.setOutputPath(job, outputDir); job.setMapOutputKeyClass(ImmutableBytesWritable.class); job.setMapOutputValueClass(KeyValue.class); HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator); TableMapReduceUtil.addDependencyJars(job.getConfiguration(), com.google.common.base.Preconditions.class); } } else { // No reducers. Just write straight to table. Call initTableReducerJob // because it sets up the TableOutputFormat. job.setMapperClass(Importer.class); TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job); job.setNumReduceTasks(0); } return job; }
From source file:com.gsinnovations.howdah.AbstractJob.java
License:Apache License
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat, Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer, Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue, Class<? extends OutputFormat> outputFormat) throws IOException { Job job = new Job(new Configuration(getConf())); Configuration jobConf = job.getConfiguration(); if (reducer.equals(Reducer.class)) { if (mapper.equals(Mapper.class)) { throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer"); }//w ww. j a v a 2s . com job.setJarByClass(mapper); } else { job.setJarByClass(reducer); } job.setInputFormatClass(inputFormat); jobConf.set("mapred.input.dir", inputPath.toString()); job.setMapperClass(mapper); job.setMapOutputKeyClass(mapperKey); job.setMapOutputValueClass(mapperValue); jobConf.setBoolean("mapred.compress.map.output", true); job.setReducerClass(reducer); job.setOutputKeyClass(reducerKey); job.setOutputValueClass(reducerValue); job.setJobName(getCustomJobName(job, mapper, reducer)); job.setOutputFormatClass(outputFormat); jobConf.set("mapred.output.dir", outputPath.toString()); return job; }
From source file:com.gsinnovations.howdah.Driver.java
License:Apache License
public static void job(Path input, Path output, int numReduceTasks) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration(); Job job = new Job(conf); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapperClass(TikaMapper.class); //job.setCombinerClass(KMeansCombiner.class); //job.setReducerClass(KMeansReducer.class); job.setNumReduceTasks(numReduceTasks); FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(Driver.class); HadoopUtil.overwriteOutput(output);//from w w w .j a va2 s. c o m job.waitForCompletion(true); }
From source file:com.gsvic.csmr.CSMRBase.java
License:Apache License
/**
 * Runs the "CSMR Pairs Job", reading sequence files from {@code in} and
 * writing sequence files to the "CSMRPairs" directory under {@code out}.
 *
 * <p>Side effect: stores {@code out} in the {@code path} field, which
 * {@code StartCSMR} reads to locate this job's output.
 *
 * @param in  input location
 * @param out base output location
 * @throws IOException            declared by the job setup and run calls
 * @throws InterruptedException   declared by {@code waitForCompletion}
 * @throws ClassNotFoundException declared by {@code waitForCompletion}
 */
public static void generatePairs(String in, String out)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    path = out;

    Path pairsInput = new Path(in);
    Path pairsOutput = new Path(path + "/CSMRPairs");

    Job pairsJob = new Job(conf);
    pairsJob.setJobName("CSMR Pairs Job");
    pairsJob.setJarByClass(CSMRBase.class);

    FileInputFormat.addInputPath(pairsJob, pairsInput);
    FileOutputFormat.setOutputPath(pairsJob, pairsOutput);

    pairsJob.setMapperClass(CSMRMapper.class);
    pairsJob.setReducerClass(CSMRReducer.class);

    pairsJob.setInputFormatClass(SequenceFileInputFormat.class);
    pairsJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Intermediate pairs are (IntWritable, DocumentWritable); final records
    // are (Text, VectorArrayWritable).
    pairsJob.setMapOutputKeyClass(IntWritable.class);
    pairsJob.setMapOutputValueClass(DocumentWritable.class);
    pairsJob.setOutputKeyClass(Text.class);
    pairsJob.setOutputValueClass(VectorArrayWritable.class);

    // NOTE(review): the success flag returned here is not inspected.
    pairsJob.waitForCompletion(true);
}
From source file:com.gsvic.csmr.CSMRBase.java
License:Apache License
/**
 * Runs the "CSMR Cosine Similarity Job" and terminates the JVM with the
 * job's status.
 *
 * <p>Input is the "CSMRPairs/part-r-00000" file under the {@code path} field
 * and output goes to "Results" under the same location. The base
 * {@code Mapper} class is used as the mapper, with
 * {@code CosineSimilarityReducer} as the reducer.
 *
 * <p>This method does not return: it calls {@link System#exit(int)} with 0
 * when the job succeeds and 1 when it fails.
 *
 * @throws IOException            declared by the job setup and run calls
 * @throws InterruptedException   declared by {@code waitForCompletion}
 * @throws ClassNotFoundException declared by {@code waitForCompletion}
 */
public static void StartCSMR() throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    Job job = new Job(conf);
    job.setJobName("CSMR Cosine Similarity Job");
    job.setJarByClass(CSMRBase.class);

    // NOTE(review): presumably path was set by a prior generatePairs call;
    // only the first reducer partition is read -- confirm this is intended.
    FileInputFormat.addInputPath(job, new Path(path + "/CSMRPairs/part-r-00000"));
    FileOutputFormat.setOutputPath(job, new Path(path + "/Results"));

    job.setMapperClass(Mapper.class);
    job.setReducerClass(CosineSimilarityReducer.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorArrayWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    // Exit status follows the usual convention: 0 = success, non-zero =
    // failure (the previous code had the two swapped).
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.hadoop.examples.secondSort.SecondarySort.java
License:Apache License
public static void main(String[] args) throws Exception { // ?hadoop?/* w w w . jav a2s. c o m*/ Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: secondarysort <in> <out>"); System.exit(2); } // ? Job job = new Job(conf, "secondary sort"); job.setJarByClass(SecondarySort.class); // Mapper job.setMapperClass(MapClass.class); // ???CombinerCombiner<Text, IntWritable>Reduce<IntPair, IntWritable>? //job.setCombinerClass(Reduce.class); // Reducer job.setReducerClass(Reduce.class); // * // *group and partition by the first int in the pair job.setPartitionerClass(FirstPartitioner.class); //setSortComparatorClass()hadoopkey?(?2.Hadoopkey?) //IntPair?compareTo() //job.setSortComparatorClass(cls); // * job.setGroupingComparatorClass(FirstGroupingComparator.class); // map Key // the map output is IntPair, IntWritable job.setMapOutputKeyClass(IntPair.class); // mapValue job.setMapOutputValueClass(IntWritable.class); // rduceKeyTextOutputFormatClassTextOutputFormat // the reduce output is Text, IntWritable job.setOutputKeyClass(Text.class); // rduceValue job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // ??job System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.hadoop.secondarysort.SecondarySortDESC.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); // if (otherArgs.length != 2) { // System.err.println("Usage: secondarysrot <in> <out>"); // System.exit(2); // }/*from w ww .j a va2 s. c om*/ // JobConf jobConf = new JobConf(); Job job = new Job(conf, "secondary sort"); job.setJarByClass(SecondarySortDESC.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); // group and partition by the first int in the pair job.setPartitionerClass(FirstPartitioner.class); job.setGroupingComparatorClass(FirstGroupingComparator.class); // conf.setClass("mapred.output.key.comparator.class", // KeyComparator.class, RawComparator.class); // job.setSortComparatorClass(SecondGroupingComparator.class); // the map output is IntPair, IntWritable job.setMapOutputKeyClass(IntPair.class); job.setMapOutputValueClass(IntWritable.class); // the reduce output is Text, IntWritable job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(inPath)); FileOutputFormat.setOutputPath(job, new Path(outPath)); FileSystem fileSystem = FileSystem.get(conf); if (fileSystem.exists(new Path(outPath))) { fileSystem.delete(new Path(outPath)); } System.exit(job.waitForCompletion(true) ? 0 : 1); }