List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException
From source file:cn.edu.hfut.dmic.webcollector.crawldb.Merge.java
public static void merge(Path crawlPath, Path[] mergePaths, Configuration conf, String jobName) throws Exception { Job job = new Job(conf); job.setJobName(jobName + " " + crawlPath.toString()); job.setJarByClass(Merge.class); // job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar"); Path crawldbPath = new Path(crawlPath, "crawldb"); Path newdb = new Path(crawldbPath, "new"); Path currentdb = new Path(crawldbPath, "current"); FileSystem fs = FileSystem.get(conf); if (fs.exists(currentdb)) { FileInputFormat.addInputPath(job, currentdb); }//from w w w . ja v a2 s. c o m if (fs.exists(newdb)) { fs.delete(newdb); } for (Path mergePath : mergePaths) { FileInputFormat.addInputPath(job, mergePath); } FileOutputFormat.setOutputPath(job, newdb); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setMapperClass(MergeMap.class); job.setReducerClass(MergeReduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(CrawlDatum.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.waitForCompletion(true); }
From source file:cn.edu.hfut.dmic.webcollector.fetcher.Fetcher.java
/**
 * Runs the fetch job for one segment of a crawl.
 *
 * <p>The job reads sequence files from {@code <crawlPath>/segments/<segmentName>/generate}
 * and writes through {@link FetcherOutputFormat} into the segment directory itself. No
 * mapper class is configured here; only {@link FetcherReducer} is set, with {@link Text}
 * keys and {@link CrawlDatum} values as the map output types.
 *
 * @param crawlPath   root directory of the crawl
 * @param segmentName name of the segment directory under {@code segments}
 * @param conf        Hadoop configuration used to create the job
 * @throws IllegalStateException if the job runs to completion but reports failure
 * @throws Exception             if the job cannot be configured, submitted or monitored
 */
public static void fetch(Path crawlPath, String segmentName, Configuration conf) throws Exception {
    Path segmentPath = new Path(crawlPath, "segments/" + segmentName);
    Path generatePath = new Path(segmentPath, "generate");

    Job job = new Job(conf);
    job.setJobName("fetch " + crawlPath.toString());
    job.setJarByClass(Fetcher.class);

    job.setReducerClass(FetcherReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(FetcherOutputFormat.class);
    FileInputFormat.addInputPath(job, generatePath);
    FileOutputFormat.setOutputPath(job, segmentPath);

    // waitForCompletion returns false on job failure; do not let that pass silently.
    if (!job.waitForCompletion(true)) {
        throw new IllegalStateException("Fetch job failed: " + job.getJobName());
    }
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Merge.java
public static Job createJob(Configuration conf, Path crawldb) throws IOException { Job job = new Job(conf); //job.setJarByClass(Merge.class); job.getConfiguration().set("mapred", "/home/hu/mygit/WebCollector2/WebCollectorCluster/target/WebCollectorCluster-2.0.jar"); Path newdb = new Path(crawldb, "new"); Path currentdb = new Path(crawldb, "current"); FileSystem fs = crawldb.getFileSystem(CrawlerConfiguration.create()); if (fs.exists(currentdb)) { FileInputFormat.addInputPath(job, currentdb); }//from w w w . jav a 2 s . co m if (fs.exists(newdb)) { fs.delete(newdb); } FileOutputFormat.setOutputPath(job, newdb); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setMapperClass(MergeMap.class); job.setReducerClass(MergeReduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(CrawlDatum.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); return job; }
From source file:cn.itcast.hadoop.mr.wordcount.DBCountPageView.java
License:Apache License
@Override //Usage DBCountPageView [driverClass dburl] public int run(String[] args) throws Exception { //?MySql/*from w ww .ja v a 2 s .c om*/ String driverClassName = DRIVER_CLASS; String url = DB_URL; //?? //???? if (args.length > 1) { driverClassName = args[0]; url = args[1]; } //driverClassNameurl?? initialize(driverClassName, url); //hdfs? Configuration conf = getConf(); //?? DBConfiguration.configureDB(conf, driverClassName, url); //??? //job Job job = Job.getInstance(conf); //job?? job.setJobName("Count Pageviews of URLs"); //job job.setJarByClass(DBCountPageView.class); //Map job.setMapperClass(PageviewMapper.class); //Combiner job.setCombinerClass(LongSumReducer.class); //reduce job.setReducerClass(PageviewReducer.class); //DB? // setInput(Job job, Class<? extends DBWritable> inputClass, String tableName, String conditions, String orderBy, String... fieldNames) DBInputFormat.setInput(job, AccessRecord.class, "HAccess", null, "url", AccessFieldNames); //? //FileOutputFormat.setoutput ? DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);// //Mapkey? job.setMapOutputKeyClass(Text.class); //MapValue? job.setMapOutputValueClass(LongWritable.class); //Reducekey? job.setOutputKeyClass(PageviewRecord.class); //Reducevalue? job.setOutputValueClass(NullWritable.class); int ret;//job????? try { ret = job.waitForCompletion(true) ? 0 : 1; boolean correct = verify(); if (!correct) { throw new RuntimeException("Evaluation was not correct!"); } } finally { shutdown(); } return ret; }
From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java
License:Apache License
/** Create and setup a job */ @SuppressWarnings("deprecation") private static Job createJob(String name, Configuration conf) throws IOException { final Job job = new Job(conf, NAME + "_" + name); final Configuration jobconf = job.getConfiguration(); job.setJarByClass(BaileyBorweinPlouffe.class); // setup mapper job.setMapperClass(BbpMapper.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(BytesWritable.class); // setup reducer job.setReducerClass(BbpReducer.class); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(BytesWritable.class); job.setNumReduceTasks(1);//from w ww . jav a 2s . co m // setup input job.setInputFormatClass(BbpInputFormat.class); // disable task timeout jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0); // do not use speculative execution jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false); jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false); return job; }
From source file:co.cask.cdap.etl.batch.mapreduce.ETLMapReduce.java
License:Apache License
/**
 * Prepares the Hadoop job for one ETL pipeline phase before it is submitted.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>optionally starts stage logging, depending on a specification property;</li>
 *   <li>deserializes the {@code BatchPhaseSpec} from the specification properties;</li>
 *   <li>instantiates the single source and every sink of the phase, calls
 *       {@code prepareRun} on each, and records their runtime arguments;</li>
 *   <li>registers a time-partitioned output for every stage that declares an error
 *       dataset;</li>
 *   <li>configures the mapper and, when the phase contains an aggregator, the reducer and
 *       the map output key/value classes; otherwise the job is made map-only;</li>
 *   <li>stores the sink outputs and the runtime arguments as JSON in the job
 *       configuration.</li>
 * </ol>
 *
 * @param context the MapReduce context of the program being submitted
 * @throws IllegalArgumentException if the aggregator's group key class is not a
 *         {@code WritableComparable} or its group value class is not a {@code Writable}
 *         (after applying any known writable conversion)
 * @throws Exception if preparing any plugin fails
 */
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
  if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
    LogStageInjector.start();
  }
  // collects one finisher per prepared plugin
  CompositeFinisher.Builder finishers = CompositeFinisher.builder();

  Job job = context.getHadoopJob();
  Configuration hConf = job.getConfiguration();

  // plugin name -> runtime args for that plugin
  Map<String, Map<String, String>> runtimeArgs = new HashMap<>();

  // the phase specification is stored as JSON in the program's properties
  Map<String, String> properties = context.getSpecification().getProperties();
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  PipelinePhase phase = phaseSpec.getPhase();
  PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

  // we checked at configure time that there is exactly one source
  String sourceName = phaseSpec.getPhase().getSources().iterator().next();

  BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
  // wrap the plugin in its logging decorator before preparing it
  batchSource = new LoggedBatchConfigurable<>(sourceName, batchSource);
  BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics,
                                                                new DatasetContextLookupProvider(context),
                                                                sourceName, context.getRuntimeArguments());
  batchSource.prepareRun(sourceContext);
  runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
  finishers.add(batchSource, sourceContext);

  // sink name -> output names and error dataset of that sink
  Map<String, SinkOutput> sinkOutputs = new HashMap<>();

  // candidates are connector stages and batch sink stages; only those that the phase
  // reports as sinks are prepared
  for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
                                        phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
    String sinkName = stageInfo.getName();
    // todo: add a better way to get info for all sinks
    if (!phase.getSinks().contains(sinkName)) {
      continue;
    }

    BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);
    batchSink = new LoggedBatchConfigurable<>(sinkName, batchSink);
    MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
                                                                new DatasetContextLookupProvider(context),
                                                                sinkName, context.getRuntimeArguments());
    batchSink.prepareRun(sinkContext);
    runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
    finishers.add(batchSink, sinkContext);

    sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
  }
  // NOTE(review): the finisher is built here, but the aggregator is added to the same
  // builder further down. Whether the aggregator ends up in `finisher` depends on whether
  // build() copies the builder's state - confirm against CompositeFinisher.Builder.
  finisher = finishers.build();
  hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

  // setup time partition for each error dataset
  for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
                                        phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
    if (stageInfo.getErrorDatasetName() != null) {
      Map<String, String> args = new HashMap<>();
      args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
               Constants.ERROR_SCHEMA.toString());
      // the partition time is the logical start time of this run
      TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
      context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
    }
  }

  job.setMapperClass(ETLMapper.class);
  Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
  if (!aggregators.isEmpty()) {
    job.setReducerClass(ETLReducer.class);
    // only the first aggregator returned by the iterator is used
    String aggregatorName = aggregators.iterator().next().getName();
    BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
    MapReduceAggregatorContext aggregatorContext =
      new MapReduceAggregatorContext(context, mrMetrics,
                                     new DatasetContextLookupProvider(context),
                                     aggregatorName, context.getRuntimeArguments());
    aggregator.prepareRun(aggregatorContext);
    finishers.add(aggregator, aggregatorContext);

    // the number of reducers is overridden only when the aggregator asked for one
    if (aggregatorContext.getNumPartitions() != null) {
      job.setNumReduceTasks(aggregatorContext.getNumPartitions());
    }
    // if the plugin sets the output key and value class directly, trust them
    Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
    Class<?> outputValClass = aggregatorContext.getGroupValueClass();
    // otherwise, derive it from the plugin's parameters
    if (outputKeyClass == null) {
      outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
    }
    if (outputValClass == null) {
      outputValClass = TypeChecker.getGroupValueClass(aggregator);
    }
    // the group classes are recorded in the configuration before any writable conversion
    hConf.set(GROUP_KEY_CLASS, outputKeyClass.getName());
    hConf.set(GROUP_VAL_CLASS, outputValClass.getName());
    // in case the classes are not a WritableComparable, but is some common type we support
    // for example, a String or a StructuredRecord
    WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
    // if the conversion is null, it means the user is using their own object.
    if (writableConversion != null) {
      outputKeyClass = writableConversion.getWritableClass();
    }
    writableConversion = WritableConversions.getConversion(outputValClass.getName());
    if (writableConversion != null) {
      outputValClass = writableConversion.getWritableClass();
    }
    // check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
    if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
      throw new IllegalArgumentException(String.format(
        "Invalid aggregator %s. The group key class %s must implement Hadoop's WritableComparable.",
        aggregatorName, outputKeyClass));
    }
    if (!Writable.class.isAssignableFrom(outputValClass)) {
      throw new IllegalArgumentException(String.format(
        "Invalid aggregator %s. The group value class %s must implement Hadoop's Writable.",
        aggregatorName, outputValClass));
    }

    job.setMapOutputKeyClass(outputKeyClass);
    job.setMapOutputValueClass(outputValClass);
  } else {
    // no aggregator in this phase: run as a map-only job
    job.setNumReduceTasks(0);
  }

  hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}
From source file:co.cask.cdap.examples.streamconversion.StreamConversionMapReduce.java
License:Apache License
@Override public void beforeSubmit(MapReduceContext context) throws Exception { Job job = context.getHadoopJob(); job.setMapperClass(StreamConversionMapper.class); job.setNumReduceTasks(0);// w w w. j a va2 s. c o m job.setMapOutputKeyClass(AvroKey.class); job.setMapOutputValueClass(NullWritable.class); AvroJob.setOutputKeySchema(job, SCHEMA); // read 5 minutes of events from the stream, ending at the logical start time of this run long logicalTime = context.getLogicalStartTime(); context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime)); // each run writes its output to a partition with the logical start time. TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime); context.addOutput(Output.ofDataset("converted", dsArguments)); TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments); LOG.info("Output location for new partition is: {}", partitionedFileSet.getEmbeddedFileSet().getOutputLocation()); }
From source file:co.cask.cdap.internal.app.runtime.batch.AggregateMetricsByTag.java
License:Apache License
/**
 * Installs this class's mapper and reducer on the given job and declares the map output
 * as {@link BytesWritable} keys with {@link LongWritable} values.
 *
 * @param job the job to configure
 * @throws IOException declared for callers; not thrown by the calls visible here
 */
static void configureJob(Job job) throws IOException {
  // reduce side
  job.setReducerClass(Reduce.class);

  // map side and its intermediate types
  job.setMapperClass(Map.class);
  job.setMapOutputValueClass(LongWritable.class);
  job.setMapOutputKeyClass(BytesWritable.class);
}
From source file:co.cask.cdap.internal.app.runtime.batch.MapReduceRuntimeService.java
License:Apache License
/** * Sets the map output key and value classes in the job configuration by inspecting the {@link Mapper} * if it is not set by the user.//from w w w. j a v a 2 s .c o m * * @param job the MapReduce job * @param mapperTypeToken TypeToken of a configured mapper (may not be configured on the job). Has already been * resolved from the job's mapper class. */ private void setMapOutputClassesIfNeeded(Job job, @Nullable TypeToken<?> mapperTypeToken) { Configuration conf = job.getConfiguration(); TypeToken<?> type = mapperTypeToken; int keyIdx = 2; int valueIdx = 3; if (type == null) { // Reducer only job. Use the Reducer input types as the key/value classes. type = resolveClass(conf, MRJobConfig.REDUCE_CLASS_ATTR, Reducer.class); keyIdx = 0; valueIdx = 1; } // If not able to detect type, nothing to set. if (type == null || !(type.getType() instanceof ParameterizedType)) { return; } Type[] typeArgs = ((ParameterizedType) type.getType()).getActualTypeArguments(); // Set it only if the user didn't set it in beforeSubmit // The key and value type are in the 3rd and 4th type parameters if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS)) { Class<?> cls = TypeToken.of(typeArgs[keyIdx]).getRawType(); LOG.debug("Set map output key class to {}", cls); job.setMapOutputKeyClass(cls); } if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS)) { Class<?> cls = TypeToken.of(typeArgs[valueIdx]).getRawType(); LOG.debug("Set map output value class to {}", cls); job.setMapOutputValueClass(cls); } }
From source file:co.nubetech.hiho.dedup.DedupJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); populateConfiguration(args);//w w w .j av a 2 s .c o m try { checkMandatoryConfs(); } catch (HIHOException e1) { e1.printStackTrace(); throw new Exception(e1); } Job job = new Job(conf); job.setJobName("Dedup job"); job.setJarByClass(DedupJob.class); Class inputFormatClass = Class.forName(inputFormat); Class outputFormatClass = Class.forName(outputFormat); Class inputKeyClass = Class.forName(inputKeyClassName); Class inputValueClass = Class.forName(inputValueClassName); if (dedupBy.equals("key")) { job.setMapperClass(DedupKeyMapper.class); job.setReducerClass(DedupKeyReducer.class); job.setMapOutputValueClass(inputValueClass); } else if (dedupBy.equals("value")) { job.setMapperClass(DedupValueMapper.class); job.setReducerClass(DedupValueReducer.class); job.setMapOutputValueClass(inputKeyClass); } job.setInputFormatClass(inputFormatClass); if (inputFormat.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) { DelimitedTextInputFormat.setProperties(job, delimiter, column); } job.setMapOutputKeyClass(HihoTuple.class); job.setOutputKeyClass(inputKeyClass); job.setOutputValueClass(inputValueClass); job.setPartitionerClass(HihoHashPartitioner.class); FileInputFormat.setInputPaths(job, inputPath); job.setOutputFormatClass(outputFormatClass); FileOutputFormat.setOutputPath(job, new Path(outputPath)); try { logger.debug("Output format class is " + job.getOutputFormatClass()); logger.debug("Class is " + ReflectionUtils .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName()); job.waitForCompletion(false); if (job.isComplete()) { Counters counters = job.getCounters(); totalRecordsRead = counters.findCounter(DedupRecordCounter.TOTAL_RECORDS_READ).getValue(); badRecords = counters.findCounter(DedupRecordCounter.BAD_RECORD).getValue(); output = counters.findCounter(DedupRecordCounter.OUTPUT).getValue(); duplicateRecords = totalRecordsRead - output; 
logger.info("Total records read are: " + totalRecordsRead); logger.info("Bad Records are: " + badRecords); logger.info("Output records are: " + output); logger.info("Duplicate records are: " + duplicateRecords); } } catch (Exception e) { e.printStackTrace(); } return 0; }