List of usage examples for org.apache.hadoop.mapreduce.Job#setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
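setPartitionerClass accepts any subclass of org.apache.hadoop.mapreduce.Partitioner and throws IllegalStateException if the job has already been submitted. Before the examples, here is a minimal sketch of such a class; the IntWritable/Text key and value types and the hash scheme are illustrative assumptions, not taken from any source file below.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes each record to a reducer by hashing its key.
public class ExamplePartitioner extends Partitioner<IntWritable, Text> {
    @Override
    public int getPartition(IntWritable key, Text value, int numPartitions) {
        // Mask the sign bit so the index is always in [0, numPartitions).
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

A job would install it with job.setPartitionerClass(ExamplePartitioner.class), as the examples below do with their own partitioners.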
From source file:com.neu.cs6240.Xml2csvComments.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Set up the XML tag delimiters for the record splitter
    conf.set("xmlinput.start", "<row ");
    conf.set("xmlinput.end", " />");
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: Xml2csvComments <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Converts Comments.xml to .csv");
    job.setJarByClass(Xml2csvComments.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setMapperClass(CommentsMapper.class);
    job.setReducerClass(CommentsReducer.class);
    job.setPartitionerClass(PostsPartitioner.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    // Tune the reducer count to your input size
    job.setNumReduceTasks(10);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.neu.cs6240.Xml2csvPosts.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Set up the XML tag delimiters for the record splitter
    conf.set("xmlinput.start", "<row ");
    conf.set("xmlinput.end", " />");
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: Xml2csvPosts <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Converts Posts.xml to .csv");
    job.setJarByClass(Xml2csvPosts.class);
    job.setInputFormatClass(XmlInputFormat.class);
    job.setMapperClass(PostsMapper.class);
    job.setReducerClass(PostsReducer.class);
    job.setPartitionerClass(PostsPartitioner.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    // Tune the reducer count to your input size
    job.setNumReduceTasks(15);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.phantom.hadoop.examples.SecondarySort.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);
    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);
    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);
    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
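The FirstPartitioner above routes records by only the first int of the composite IntPair key, so every record sharing that value lands on the same reducer, while the grouping comparator controls which records share one reduce call. A sketch of that idea, assuming IntPair exposes a getFirst() accessor as in the stock Hadoop SecondarySort example:

public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
    @Override
    public int getPartition(IntPair key, IntWritable value, int numPartitions) {
        // Partition on the first element only; ordering by the second
        // element is left to the sort comparator.
        return Math.abs(key.getFirst() * 127) % numPartitions;
    }
}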
From source file:com.phantom.hadoop.examples.terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception {
    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
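TeraSort writes its own sampled partition file via TeraInputFormat and ships it through the distributed cache. Outside TeraSort, the general-purpose route to TotalOrderPartitioner pairs it with InputSampler; a minimal sketch, assuming Text keys on an already-configured job, with an illustrative sample rate and partition-file path:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {
    // Samples the job's input, writes the cut points, and installs the partitioner.
    static void configureTotalOrder(Job job) throws Exception {
        Path partitionFile = new Path("/tmp/_partitions.lst"); // hypothetical location
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        // Sample 0.1% of keys, capped at 10,000 samples (illustrative numbers).
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.001, 10000));
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
}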
From source file:com.placeiq.piqconnect.Runner.java
License:Apache License
private Job buildJob1(Path input1, Path input2, Path output) throws Exception {
    Configuration conf = getConf();
    conf.setInt(Constants.PROP_BLOCK_SIZE, blockSize);
    conf.set("mapred.output.compression.type", "BLOCK");
    Job job = new Job(conf, "data-piqid.piqconnect.IterationStage1");
    job.setJarByClass(Runner.class);
    job.setMapperClass(IterationStage1._Mapper.class);
    job.setReducerClass(IterationStage1._Reducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numberOfReducers);
    job.setMapOutputKeyClass(IterationStage1.JoinKey.class);
    job.setMapOutputValueClass(BlockWritable.class);
    job.setOutputKeyClass(VLongWritable.class);
    job.setOutputValueClass(BlockWritable.class);
    job.setGroupingComparatorClass(IterationStage1.IndexComparator.class);
    job.setPartitionerClass(IterationStage1.IndexPartitioner.class);
    job.setSortComparatorClass(IterationStage1.SortComparator.class);
    FileInputFormat.setInputPaths(job, input1, input2);
    SequenceFileOutputFormat.setOutputPath(job, output);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    setCompression(job);
    return job;
}
From source file:com.rockstor.compact.GenGarbageIndexTool.java
License:Apache License
private Job createSubmittableJob(Configuration conf) throws IOException {
    Job job = new Job(conf, NAME);
    job.setJarByClass(GenGarbageIndexTool.class);
    Scan scan = new Scan();
    TableMapReduceUtil.initTableMapperJob(GarbageChunkDB.TAB_NAME, scan, GarbageChunkMapper.class,
            ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
    TableMapReduceUtil.setScannerCaching(job, batchSize);
    job.setReducerClass(GarbageChunkReduce.class);
    job.setPartitionerClass(GarbageChunkPartition.class);
    job.setCombinerClass(GarbageChunkCombine.class);
    job.setNumReduceTasks(Compactor.getInstance().getReduceNum());
    job.setOutputFormatClass(NullOutputFormat.class);
    LOG.info("init job " + NAME + " finished!");
    return job;
}
From source file:com.sanjay.mapreduce.SiCombiner.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(SiCombiner.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setPartitionerClass(WordPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(5);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.sirius.hadoop.job.onlinetime.OnlineTimeJob.java
License:Apache License
public Job build() throws Exception {
    // init
    Job job = Job.getInstance(getConf(), "onlinetime");
    job.setJarByClass(OnlineTimeJob.class);
    // map
    job.setMapperClass(StatusMapper.class);
    job.setMapOutputKeyClass(StatusKey.class);
    job.setMapOutputValueClass(OnlineRecord.class);
    // custom partition
    job.setPartitionerClass(StatusKeyPartitioner.class);
    // reduce
    job.setGroupingComparatorClass(StatusKeyGroupComparator.class);
    job.setReducerClass(StatusReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    // input
    FileInputFormat.setInputPaths(job, new Path("/subscriber_status/subscriber_status.json"));
    // output
    FileOutputFormat.setOutputPath(job, out);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, Lz4Codec.class);
    return job;
}
From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java
License:Apache License
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output Splice table name; the format should be Schema.tableName.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust. Make sure the passed job is
 *     carrying all necessary configuration.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 *     the default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 *     output to the cluster that is designated in <code>hbase-site.xml</code>.
 *     Set this String to the zookeeper ensemble of an alternate remote cluster
 *     when you would have the reduce write to a cluster other than the
 *     default; e.g. when copying tables between clusters, the source would be
 *     designated by <code>hbase-site.xml</code> and this param would have the
 *     ensemble address of the remote cluster. The format to pass is particular.
 *     Pass <code><hbase.zookeeper.quorum>:<hbase.zookeeper.client.port>:<zookeeper.znode.parent></code>
 *     such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.client
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *     job classes via the distributed cache (tmpjars).
 * @param outputformatClass The output format class to use.
 * @throws IOException When determining the region count fails.
 * @throws SQLException
 */
public static void initTableReducerJob(String table, Class<? extends Reducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars, Class<? extends OutputFormat> outputformatClass) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setOutputFormatClass(outputformatClass);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(MRConstants.SPLICE_OUTPUT_TABLE_NAME, table);
    if (sqlUtil == null)
        sqlUtil = SMSQLUtil.getInstance(conf.get(MRConstants.SPLICE_JDBC_STR));
    String hbaseTableID = null;
    try {
        hbaseTableID = sqlUtil.getConglomID(table);
    } catch (SQLException e) {
        e.printStackTrace();
        throw new IOException(e);
    }
    conf.set(MRConstants.HBASE_OUTPUT_TABLE_NAME, table);
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
        // Calling this will validate the format
        HBasePlatformUtils.validateClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Object.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        // TODO Where are the keys?
        int regions = getReduceNumberOfRegions(hbaseTableID);
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(regions);
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
    if (addDependencyJars) {
        addDependencyJars(job);
    }
    //initCredentials(job);
}
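A hypothetical invocation of this helper; the table name, MyReducer, and the output format are placeholders, and the nulls fall back to the default partitioner and the cluster settings from hbase-site.xml:

Job job = Job.getInstance(conf, "write-to-splice");
SpliceTableMapReduceUtil.initTableReducerJob("SOMESCHEMA.SOMETABLE", MyReducer.class, job,
        null,       // partitioner: null -> default partitioner
        null,       // quorumAddress: null -> cluster from hbase-site.xml
        null, null, // serverClass / serverImpl: keep region server defaults
        true,       // ship dependency jars via the distributed cache
        TableOutputFormat.class);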
From source file:com.sreejith.loganalyzer.mapreduce.LogDriver.java
License:Apache License
public static void main(String[] args) throws Exception {
    Job job = new Job();
    job.setJarByClass(LogDriver.class);
    job.setJobName("Log Analyzer");
    job.setMapperClass(LogMapper.class);
    job.setPartitionerClass(LogPartitioner.class);
    job.setCombinerClass(LogReducer.class);
    job.setReducerClass(LogReducer.class);
    job.setNumReduceTasks(2);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);
}