List of usage examples for org.apache.hadoop.mapreduce.Job#setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
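Before the project examples, here is a minimal, self-contained sketch of the situation setMapOutputValueClass exists for: the mapper emits IntWritable values while the reducer writes LongWritable values, so the intermediate (map output) value class must be declared separately from the final output value class. This sketch is not taken from any project listed below; class names such as WordCountDriver, TokenMapper, and SumReducer are illustrative.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountDriver {

    // emits (word, 1) pairs: the map output value type is IntWritable
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // sums counts into a LongWritable: the final output value type differs from the map output value type
    public static class SumReducer extends Reducer<Text, IntWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // intermediate (map output) types: set explicitly because they differ
        // from the final output types declared below
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // final (reduce output) types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}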
From source file:cn.itcast.hadoop.mr.wordcount.DBCountPageView.java
License:Apache License
@Override
// Usage: DBCountPageView [driverClass dburl]
public int run(String[] args) throws Exception {
    // default to the built-in MySQL driver class and URL
    String driverClassName = DRIVER_CLASS;
    String url = DB_URL;

    // override the driver class and URL from the command line, if provided
    if (args.length > 1) {
        driverClassName = args[0];
        url = args[1];
    }

    // initialize the database (create and populate the table) using driverClassName and url
    initialize(driverClassName, url);

    // get the Hadoop configuration
    Configuration conf = getConf();

    // store the JDBC driver class and connection URL in the configuration
    DBConfiguration.configureDB(conf, driverClassName, url);

    // create the job
    Job job = Job.getInstance(conf);
    // job name
    job.setJobName("Count Pageviews of URLs");
    // job jar
    job.setJarByClass(DBCountPageView.class);

    // Mapper
    job.setMapperClass(PageviewMapper.class);
    // Combiner
    job.setCombinerClass(LongSumReducer.class);
    // Reducer
    job.setReducerClass(PageviewReducer.class);

    // DB input:
    // setInput(Job job, Class<? extends DBWritable> inputClass, String tableName,
    //          String conditions, String orderBy, String... fieldNames)
    DBInputFormat.setInput(job, AccessRecord.class, "HAccess", null, "url", AccessFieldNames);

    // DB output (analogous to FileOutputFormat.setOutputPath)
    DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

    // map output key type
    job.setMapOutputKeyClass(Text.class);
    // map output value type
    job.setMapOutputValueClass(LongWritable.class);

    // reduce output key type
    job.setOutputKeyClass(PageviewRecord.class);
    // reduce output value type
    job.setOutputValueClass(NullWritable.class);

    int ret; // exit status of the job
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
        boolean correct = verify();
        if (!correct) {
            throw new RuntimeException("Evaluation was not correct!");
        }
    } finally {
        shutdown();
    }
    return ret;
}
From source file:cn.jpush.hdfs.mr.example.BaileyBorweinPlouffe.java
License:Apache License
/** Create and setup a job */
@SuppressWarnings("deprecation")
private static Job createJob(String name, Configuration conf) throws IOException {
    final Job job = new Job(conf, NAME + "_" + name);
    final Configuration jobconf = job.getConfiguration();
    job.setJarByClass(BaileyBorweinPlouffe.class);

    // setup mapper
    job.setMapperClass(BbpMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // setup reducer
    job.setReducerClass(BbpReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BytesWritable.class);
    job.setNumReduceTasks(1);

    // setup input
    job.setInputFormatClass(BbpInputFormat.class);

    // disable task timeout
    jobconf.setLong(MRJobConfig.TASK_TIMEOUT, 0);

    // do not use speculative execution
    jobconf.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
    jobconf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, false);
    return job;
}
From source file:co.cask.cdap.data.stream.StreamInputFormatTest.java
License:Apache License
private void runMR(File inputDir, File outputDir, long startTime, long endTime, long splitSize, long ttl)
        throws Exception {
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    StreamInputFormat.setTTL(conf, ttl);
    StreamInputFormat.setStreamPath(conf, inputDir.toURI());
    StreamInputFormat.setTimeRange(conf, startTime, endTime);
    StreamInputFormat.setMaxSplitSize(conf, splitSize);
    job.setInputFormatClass(TestStreamInputFormat.class);

    TextOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setJarByClass(StreamInputFormatTest.class);
    job.setMapperClass(TokenizeMapper.class);
    job.setReducerClass(AggregateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.waitForCompletion(true);
}
From source file:co.cask.cdap.etl.batch.mapreduce.ETLMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();

    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();

    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();

    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, phaseSpec);

    // we checked at configure time that there is exactly one source
    String sourceName = phaseSpec.getPhase().getSources().iterator().next();

    BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName);
    batchSource = new LoggedBatchConfigurable<>(sourceName, batchSource);

    BatchSourceContext sourceContext = new MapReduceSourceContext(context, mrMetrics,
        new DatasetContextLookupProvider(context), sourceName, context.getRuntimeArguments());
    batchSource.prepareRun(sourceContext);
    runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
    finishers.add(batchSource, sourceContext);

    Map<String, SinkOutput> sinkOutputs = new HashMap<>();

    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
                                          phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }

        BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName);
        batchSink = new LoggedBatchConfigurable<>(sinkName, batchSink);
        MapReduceSinkContext sinkContext = new MapReduceSinkContext(context, mrMetrics,
            new DatasetContextLookupProvider(context), sinkName, context.getRuntimeArguments());
        batchSink.prepareRun(sinkContext);
        runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
        finishers.add(batchSink, sinkContext);

        sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    }
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
                                          phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key",
                     Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }

    job.setMapperClass(ETLMapper.class);

    Set<StageInfo> aggregators = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE);
    if (!aggregators.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        String aggregatorName = aggregators.iterator().next().getName();
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(aggregatorName);
        MapReduceAggregatorContext aggregatorContext = new MapReduceAggregatorContext(context, mrMetrics,
            new DatasetContextLookupProvider(context), aggregatorName, context.getRuntimeArguments());
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);

        if (aggregatorContext.getNumPartitions() != null) {
            job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }

        // if the plugin sets the output key and value class directly, trust them
        Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
        Class<?> outputValClass = aggregatorContext.getGroupValueClass();
        // otherwise, derive them from the plugin's parameters
        if (outputKeyClass == null) {
            outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
            outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        hConf.set(GROUP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(GROUP_VAL_CLASS, outputValClass.getName());

        // in case the classes are not WritableComparable but are some common type we support,
        // for example a String or a StructuredRecord
        WritableConversion writableConversion = WritableConversions.getConversion(outputKeyClass.getName());
        // if the conversion is null, it means the user is using their own object
        if (writableConversion != null) {
            outputKeyClass = writableConversion.getWritableClass();
        }
        writableConversion = WritableConversions.getConversion(outputValClass.getName());
        if (writableConversion != null) {
            outputValClass = writableConversion.getWritableClass();
        }

        // check classes here instead of letting mapreduce do it, since mapreduce throws a cryptic error
        if (!WritableComparable.class.isAssignableFrom(outputKeyClass)) {
            throw new IllegalArgumentException(String.format(
                "Invalid aggregator %s. The group key class %s must implement Hadoop's WritableComparable.",
                aggregatorName, outputKeyClass));
        }
        if (!Writable.class.isAssignableFrom(outputValClass)) {
            throw new IllegalArgumentException(String.format(
                "Invalid aggregator %s. The group value class %s must implement Hadoop's Writable.",
                aggregatorName, outputValClass));
        }

        job.setMapOutputKeyClass(outputKeyClass);
        job.setMapOutputValueClass(outputValClass);
    } else {
        job.setNumReduceTasks(0);
    }

    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}
From source file:co.cask.cdap.examples.streamconversion.StreamConversionMapReduce.java
License:Apache License
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
    Job job = context.getHadoopJob();
    job.setMapperClass(StreamConversionMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, SCHEMA);

    // read 5 minutes of events from the stream, ending at the logical start time of this run
    long logicalTime = context.getLogicalStartTime();
    context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));

    // each run writes its output to a partition with the logical start time.
    TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
    context.addOutput(Output.ofDataset("converted", dsArguments));

    TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
    LOG.info("Output location for new partition is: {}",
             partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
}
From source file:co.cask.cdap.internal.app.runtime.batch.AggregateMetricsByTag.java
License:Apache License
static void configureJob(Job job) throws IOException {
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setReducerClass(Reduce.class);
}
From source file:co.cask.cdap.internal.app.runtime.batch.MapReduceRuntimeService.java
License:Apache License
/**
 * Sets the map output key and value classes in the job configuration by inspecting the {@link Mapper}
 * if it is not set by the user.
 *
 * @param job the MapReduce job
 * @param mapperTypeToken TypeToken of a configured mapper (may not be configured on the job). Has already been
 *                        resolved from the job's mapper class.
 */
private void setMapOutputClassesIfNeeded(Job job, @Nullable TypeToken<?> mapperTypeToken) {
    Configuration conf = job.getConfiguration();

    TypeToken<?> type = mapperTypeToken;
    int keyIdx = 2;
    int valueIdx = 3;

    if (type == null) {
        // Reducer only job. Use the Reducer input types as the key/value classes.
        type = resolveClass(conf, MRJobConfig.REDUCE_CLASS_ATTR, Reducer.class);
        keyIdx = 0;
        valueIdx = 1;
    }

    // If not able to detect type, nothing to set.
    if (type == null || !(type.getType() instanceof ParameterizedType)) {
        return;
    }

    Type[] typeArgs = ((ParameterizedType) type.getType()).getActualTypeArguments();

    // Set it only if the user didn't set it in beforeSubmit.
    // The key and value type are in the 3rd and 4th type parameters.
    if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS)) {
        Class<?> cls = TypeToken.of(typeArgs[keyIdx]).getRawType();
        LOG.debug("Set map output key class to {}", cls);
        job.setMapOutputKeyClass(cls);
    }
    if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS)) {
        Class<?> cls = TypeToken.of(typeArgs[valueIdx]).getRawType();
        LOG.debug("Set map output value class to {}", cls);
        job.setMapOutputValueClass(cls);
    }
}
From source file:co.nubetech.hiho.dedup.DedupJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Job job = new Job(conf);
    job.setJobName("Dedup job");
    job.setJarByClass(DedupJob.class);

    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);

    if (dedupBy.equals("key")) {
        job.setMapperClass(DedupKeyMapper.class);
        job.setReducerClass(DedupKeyReducer.class);
        job.setMapOutputValueClass(inputValueClass);
    } else if (dedupBy.equals("value")) {
        job.setMapperClass(DedupValueMapper.class);
        job.setReducerClass(DedupValueReducer.class);
        job.setMapOutputValueClass(inputKeyClass);
    }

    job.setInputFormatClass(inputFormatClass);
    if (inputFormat.equals("co.nubetech.hiho.dedup.DelimitedTextInputFormat")) {
        DelimitedTextInputFormat.setProperties(job, delimiter, column);
    }

    job.setMapOutputKeyClass(HihoTuple.class);

    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);

    job.setPartitionerClass(HihoHashPartitioner.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
            .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsRead = counters.findCounter(DedupRecordCounter.TOTAL_RECORDS_READ).getValue();
            badRecords = counters.findCounter(DedupRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(DedupRecordCounter.OUTPUT).getValue();
            duplicateRecords = totalRecordsRead - output;
            logger.info("Total records read are: " + totalRecordsRead);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
            logger.info("Duplicate records are: " + duplicateRecords);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return 0;
}
From source file:co.nubetech.hiho.job.DBQueryInputJob.java
License:Apache License
public void runJobs(Configuration conf, int jobCounter) throws IOException {

    try {
        checkMandatoryConfs(conf);
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new IOException(e1);
    }

    Job job = new Job(conf);
    for (Entry<String, String> entry : conf) {
        logger.warn("key, value " + entry.getKey() + "=" + entry.getValue());
    }

    // logger.debug("Number of maps " + conf.getInt("mapred.map.tasks", 1));
    // conf.setInt(JobContext.NUM_MAPS, conf.getInt("mapreduce.job.maps", 1));
    // job.getConfiguration().setInt("mapred.map.tasks", 4);
    job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1));
    logger.warn("Number of maps " + conf.getInt(MRJobConfig.NUM_MAPS, 1));

    job.setJobName("Import job");
    job.setJarByClass(DBQueryInputJob.class);

    String strategy = conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY);
    OutputStrategyEnum os = OutputStrategyEnum.value(strategy);
    if (os == null) {
        throw new IllegalArgumentException("Wrong value of output strategy. Please correct");
    }
    if (os != OutputStrategyEnum.AVRO) {
        switch (os) {

        case DUMP: {
            // job.setMapperClass(DBImportMapper.class);
            break;
        }
        /*
         * case AVRO: {
         *     job.setMapperClass(DBInputAvroMapper.class);
         *     // need avro in cp
         *     // job.setJarByClass(Schema.class);
         *     // need jackson which is needed by avro - ugly!
         *     // job.setJarByClass(ObjectMapper.class);
         *     job.setMapOutputKeyClass(NullWritable.class);
         *     job.setMapOutputValueClass(AvroValue.class);
         *     job.setOutputKeyClass(NullWritable.class);
         *     job.setOutputValueClass(AvroValue.class);
         *     job.setOutputFormatClass(AvroOutputFormat.class);
         *
         *     AvroOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
         *     break;
         * }
         */
        case DELIMITED: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);
            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            // (no break in the original: execution falls through to the JSON case, which is a no-op)
        }
        case JSON: {
            // job.setMapperClass(DBImportJsonMapper.class);
            // job.setJarByClass(ObjectMapper.class);
            break;
        }
        default: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);
            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            break;
        }
        }

        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        DBQueryInputFormat.setInput(job, inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        job.setNumReduceTasks(0);

        try {
            // job.setJarByClass(Class.forName(conf.get(
            //     org.apache.hadoop.mapred.lib.db.DBConfiguration.DRIVER_CLASS_PROPERTY)));
            logger.debug("OUTPUT format class is " + job.getOutputFormatClass());

            /*
             * org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
             *     ReflectionUtils.newInstance(job.getOutputFormatClass(), job.getConfiguration());
             * output.checkOutputSpecs(job);
             */
            logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
            job.waitForCompletion(false);
            if (conf.get(HIHOConf.INPUT_OUTPUT_LOADTO) != null) {
                generateHiveScript(conf, job, jobCounter);
                generatePigScript(conf, job);
            }
        }
        /*
         * catch (HIHOException h) { h.printStackTrace(); }
         */
        catch (Exception e) {
            // also covers HIHOException, which the original listed in an unreachable second catch block
            e.printStackTrace();
        }
    }
    // avro to be handled differently, thanks to all the incompatibilities
    // in the apis.
    else {
        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        // co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(job,
        //     inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        JobConf jobConf = new JobConf(conf);

        try {
            GenericDBWritable queryWritable = getDBWritable(jobConf);
            Schema pair = DBMapper.getPairSchema(queryWritable.getColumns());

            AvroJob.setMapOutputSchema(jobConf, pair);
            GenericRecordAvroOutputFormat.setOutputPath(jobConf,
                new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));

            co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(jobConf, inputQuery,
                inputBoundingQuery, params);
            jobConf.setInputFormat(co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.class);
            jobConf.setMapperClass(DBInputAvroMapper.class);
            jobConf.setMapOutputKeyClass(NullWritable.class);
            jobConf.setMapOutputValueClass(AvroValue.class);
            jobConf.setOutputKeyClass(NullWritable.class);
            jobConf.setOutputValueClass(Text.class);
            jobConf.setOutputFormat(GenericRecordAvroOutputFormat.class);
            jobConf.setJarByClass(DBQueryInputJob.class);
            jobConf.setStrings("io.serializations",
                "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization,"
                + "org.apache.avro.mapred.AvroSerialization");
            jobConf.setNumReduceTasks(0);

            /*
             * jobConf.setOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat.class);
             * org.apache.hadoop.mapred.SequenceFileOutputFormat.setOutputPath(jobConf,
             *     new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
             */
            JobClient.runJob(jobConf);
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}
From source file:co.nubetech.hiho.job.ExportDelimitedToDB.java
License:Apache License
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    Job job = new Job(conf);
    job.setJobName("MySQLBulkLoading");
    job.setMapperClass(DelimitedLoadMapper.class);
    job.setJarByClass(DelimitedLoadMapper.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(args[0]));

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputFormatClass(DBOutputFormat.class);

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}