List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
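setCombinerClass registers a Reducer that Hadoop may run on map-side output before the shuffle, pre-aggregating values to cut the data sent to the reducers. Because the framework may invoke the combiner zero, one, or many times, its input and output key/value types must both match the map output types, and the operation it performs should be associative and commutative; that is why, in most of the examples below, the reducer class doubles as the combiner. The following is a minimal sketch of the typical pattern, assuming a word-count-style job; the class names WordCountDriver, TokenMapper, and SumReducer and the command-line paths are illustrative only and do not come from any of the projects listed below.

    // Minimal sketch: the same summing Reducer is used as both combiner and reducer.
    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class WordCountDriver {

        // Emits (word, 1) for every whitespace-separated token in a line of input.
        public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
            private static final IntWritable ONE = new IntWritable(1);
            private final Text word = new Text();

            @Override
            protected void map(LongWritable key, Text value, Context context)
                    throws IOException, InterruptedException {
                for (String token : value.toString().split("\\s+")) {
                    if (!token.isEmpty()) {
                        word.set(token);
                        context.write(word, ONE);
                    }
                }
            }
        }

        // Sums counts; summation is associative and commutative, so this class can
        // safely serve as both the combiner and the reducer.
        public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable v : values) {
                    sum += v.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public static void main(String[] args) throws Exception {
            Job job = Job.getInstance(new Configuration(), "word count");
            job.setJarByClass(WordCountDriver.class);
            job.setMapperClass(TokenMapper.class);
            job.setCombinerClass(SumReducer.class);   // map-side pre-aggregation
            job.setReducerClass(SumReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileInputFormat.addInputPath(job, new Path(args[0]));   // placeholder input path
            FileOutputFormat.setOutputPath(job, new Path(args[1])); // placeholder output path
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }

Reusing the reducer as the combiner only works when the reduce function's output value type equals its input value type; otherwise a dedicated combiner class is needed, as in the SnmpStatisticWithCombiner and Druid DeterminePartitionsJob examples below.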
From source file:com.marklogic.mapreduce.examples.LinkCount.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 2) { System.err.println("Usage: LinkCount configFile outputDir"); System.exit(2);//from w ww .j a va2 s .c o m } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count"); job.setJarByClass(LinkCount.class); job.setInputFormatClass(ValueInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.LinkCountCooccurrences.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 2) { System.err.println("Usage: LinkCountCooccurrences configFile outputDir"); System.exit(2);//w ww . j a v a 2 s . c om } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count cooccurrences"); job.setJarByClass(LinkCountCooccurrences.class); job.setInputFormatClass(KeyValueInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); conf.setClass(MarkLogicConstants.INPUT_KEY_CLASS, Text.class, Writable.class); conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class); conf.setClass(MarkLogicConstants.INPUT_LEXICON_FUNCTION_CLASS, HrefTitleMap.class, ElemAttrValueCooccurrences.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.marklogic.mapreduce.examples.LinkCountValue.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); if (args.length < 2) { System.err.println("Usage: LinkCountValue configFile outputDir"); System.exit(2);//w w w .ja va 2s . co m } String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Job job = Job.getInstance(conf, "link count value"); job.setJarByClass(LinkCountValue.class); job.setInputFormatClass(ValueInputFormat.class); job.setMapperClass(RefMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(args[1])); conf = job.getConfiguration(); conf.addResource(otherArgs[0]); conf.setClass(MarkLogicConstants.INPUT_VALUE_CLASS, Text.class, Writable.class); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.mb.saas.bi.job.WordCountJob.java
License:Apache License
public static boolean runHadoopMapReduceJob() throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");
    File jarFile = UploadResource.createTempJar("bin");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());
    boolean isMapReduceJarSetted = false;
    String hadoopMapReduceJar = "F:/henry_projects/mbHiveAnalyzer/t.jar";
    File file = new File(hadoopMapReduceJar);
    if (file.exists()) {
        ((JobConf) job.getConfiguration()).setJar(hadoopMapReduceJar);
        isMapReduceJarSetted = true;
    }
    if (!isMapReduceJarSetted && jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());
    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path("/input/wordcount.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
    return true;
}
From source file:com.mb.saas.bi.job.WordCountJob.java
License:Apache License
public static void main(String[] args) throws Exception { System.setProperty("HADOOP_USER_NAME", "hadoop"); File jarFile = UploadResource.createTempJar("bin"); System.setProperty("hadoop.home.dir", "F:/hadoop"); ClassLoader classLoader = UploadResource.getClassLoader(); Thread.currentThread().setContextClassLoader(classLoader); Configuration conf = new Configuration(); // conf.set("fs.defaultFS", "hdfs://slave1:4001"); // conf.set("mapreduce.framework.name", "yarn"); // conf.set("yarn.resourcemanager.address", "master:8032"); // conf.set("yarn.resourcemanager.scheduler.address", "master:8030"); conf.set("fs.defaultFS", "hdfs://mbcluster/"); conf.set("dfs.nameservices", "mbcluster"); conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2"); conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001"); conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001"); conf.set("dfs.client.failover.proxy.provider.mbcluster", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"); conf.set("mapred.remote.os", "Linux"); System.out.println(conf.get("mapred.remote.os")); // conf.set("mapreduce.job.reduces", "2"); // conf.set("mapreduce.tasktracker.map.tasks.maximum", "8"); // conf.set("mapreduce.input.fileinputformat.split.maxsize","123"); Job job = new Job(conf, "word count"); job.setJarByClass(WordCountJob.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); if (jarFile != null) ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath()); // job.setMaxMapAttempts(2); job.setNumReduceTasks(1);//from w w w .j a va 2s . c o m FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt")); // FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt")); FileOutputFormat.setOutputPath(job, new Path("/output/001002")); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.metamx.druid.indexer.DeterminePartitionsJob.java
License:Open Source License
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));
            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);
            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());
            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));
        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
        injectSystemProperties(dimSelectionJob);
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }
        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);
        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());
        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].",
                config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }
                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file:com.michaeljones.hellohadoopworldmaven.HelloMapReduce.java
public static Job RunJobAsync(Path inputPath, Path outputPath, Configuration conf) throws Exception {
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(HelloMapReduce.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    return job;
}
From source file:com.michaeljones.hellohadoopworldmaven.HelloMapReduce.java
public static Job RunJobAnalysisAsync(Path inputPath, Path outputPath, Configuration conf) throws Exception {
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(HelloMapReduce.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumCombinerAnalyser.class);
    job.setReducerClass(IntSumReducerAnalyser.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    return job;
}
From source file:com.mongodb.hadoop.examples.snmp.SnmpStatisticWithCombiner.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    final com.mongodb.MongoURI outputUri = MongoConfigUtil.getOutputURI(conf);
    if (outputUri == null)
        throw new IllegalStateException("output uri is not set");
    if (MongoConfigUtil.getInputURI(conf) == null)
        throw new IllegalStateException("input uri is not set");
    final String outputCollectionName = outputUri.getCollection();
    final Job job = new Job(conf, "snmp analysis " + outputCollectionName);
    job.setJarByClass(SnmpStatisticWithCombiner.class);
    job.setMapperClass(MapHostUploadOnEachAPPerDay.class);
    job.setCombinerClass(CombineHostUploadOnEachAPPerDay.class);
    job.setReducerClass(ReduceHostUploadOnEachAPPerDay.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);
    try {
        boolean result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.out.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
    return 0;
}
From source file:com.mongodb.hadoop.examples.snmp.SnmpStatisticWithCombiner.java
License:Apache License
public static void main(String[] args) throws Exception {
    boolean use_shards = true;
    boolean use_chunks = false;
    final Configuration Conf = new Configuration();
    MongoConfigUtil.setInputURI(Conf, "mongodb://localhost:30000/test.snmp");
    Conf.setBoolean(MongoConfigUtil.SPLITS_USE_SHARDS, use_shards);
    Conf.setBoolean(MongoConfigUtil.SPLITS_USE_CHUNKS, use_chunks);
    String output_table = null;
    if (use_chunks) {
        if (use_shards)
            output_table = "snmp_with_shards_and_chunks";
        else
            output_table = "snmp_with_chunks";
    } else {
        if (use_shards)
            output_table = "snmpWithShards";
        else
            output_table = "snmp_no_splits";
    }
    MongoConfigUtil.setOutputURI(Conf, "mongodb://localhost:30000/test." + output_table);
    final Job snmpJob = new Job(Conf, "snmp analysis " + output_table);
    snmpJob.setJarByClass(SnmpStatisticWithCombiner.class);
    snmpJob.setMapperClass(MapHostUploadOnEachAPPerDay.class);
    snmpJob.setCombinerClass(CombineHostUploadOnEachAPPerDay.class);
    snmpJob.setReducerClass(ReduceHostUploadOnEachAPPerDay.class);
    snmpJob.setOutputKeyClass(Text.class);
    snmpJob.setOutputValueClass(LongWritable.class);
    snmpJob.setInputFormatClass(MongoInputFormat.class);
    snmpJob.setOutputFormatClass(MongoOutputFormat.class);
    try {
        boolean result = snmpJob.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.out.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
}