List of usage examples for org.apache.hadoop.mapred JobConf setUseNewReducer
public void setUseNewReducer(boolean flag)
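Before the full example below, a minimal sketch of what the flag does: setUseNewReducer(true) tells the old-style, JobConf-based submission path to run a reducer written against the new org.apache.hadoop.mapreduce API rather than org.apache.hadoop.mapred. The class name and job name in this sketch are placeholders and are not part of the source file that follows.

import org.apache.hadoop.mapred.JobConf;

// Minimal, self-contained sketch (not taken from the source file below).
public class NewReducerFlagExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.setJobName("new-api-reducer-example");

        // setUseNewMapper / setUseNewReducer make the JobConf-driven job run mapper and
        // reducer classes written against the new org.apache.hadoop.mapreduce API.
        conf.setUseNewMapper(true);
        conf.setUseNewReducer(true);

        // The actual new-API classes are then registered under the mapreduce.* keys,
        // e.g. via JobContext.MAP_CLASS_ATTR / JobContext.REDUCE_CLASS_ATTR,
        // as the example below does.
    }
}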
From source file: org.slc.sli.aggregation.mapreduce.map.ConfigurableMapReduceJob.java
License: Apache License
@SuppressWarnings("rawtypes") protected static JobConf finalizeConfig(JobConf jobConf, ConfigSections s) throws IOException { Class<? extends Mapper> mapperClass = JobConfiguration.mapper.getMapClass(mapper); Class<? extends Reducer> reducerClass = JobConfiguration.function.getReduceClass(reduceFunction); Map<String, String> idFields = s.getMapper().getMapIdFields(); // validate we have enough to continue boolean valid = true; if (mapperClass == null) { log.severe("Invalid map/reduce configuration detected : no mapper class specified."); valid = false;//from w w w.j av a 2 s .c o m } if (idFields == null) { idFields = new HashMap<String, String>(); log.severe("Invalid map/reduce configuration detected : no map id fields specified."); valid = false; } if (mapCollection == null) { log.severe("Invalid map/reduce configuration detected : no map collection specified."); valid = false; } if (mapQuery == null) { log.severe("Invalid map/reduce configuration detected : no map query specified."); valid = false; } if (mapFields == null) { log.severe("Invalid map/reduce configuration detected : no map input fields specified."); valid = false; } if (reducerClass == null) { log.severe("Invalid map/reduce configuration detected : no reducer class specified."); valid = false; } if (reduceCollection == null) { log.severe("Invalid map/reduce configuration detected : no reduce collection specified."); valid = false; } if (reduceField == null) { log.severe("Invalid map/reduce configuration detected : no reduce field specified."); valid = false; } if (!valid) { throw new IllegalArgumentException("Invalid mapper specified. Check log for details."); } jobConf.set("mapred.output.dir", String.format("%s-%s-%d", s.getMapper().getMapper(), s.getMetadata().getFunction(), System.currentTimeMillis())); jobConf.setJobName(s.getMetadata().getDescription() == null ? "M/R Job" : s.getMetadata().getDescription()); // enable speculative execution. Multiple mapper tasks are created for the same split. // First one to finish wins; the remaining tasks are terminated. jobConf.setSpeculativeExecution(true); jobConf.setUseNewMapper(true); jobConf.setUseNewReducer(true); /** * TODO -- decide if this is required. 
String id = conf.get("@ID@"); String tenantId = conf.get("@TENANT_ID@"); for (Map.Entry<String, Object> entry : query.entrySet()) { Object value = entry.getValue(); if (value instanceof String) { String s = (String) value; if (s.indexOf("@ID@") >= 0 && id != null) { s = s.replace("@ID@", id); query.put(entry.getKey(), s); } if (s.indexOf("@TENANT_ID@") >= 0 && tenantId != null) { s = s.replace("@TENANT_ID@", tenantId); query.put(entry.getKey(), s); } } } if (updateField.indexOf("@ID@") >= 0 && id != null) { updateField = updateField.replace("@ID@", id); } if (updateField.indexOf("@TENANT_ID@") >= 0 && tenantId != null) { updateField = updateField.replace("@TENANT_ID@", tenantId); } */ MongoConfigUtil.setQuery(jobConf, new BasicDBObject(mapQuery)); Map<String, Object> fullFields = new HashMap<String, Object>(); for (String f : idFields.values()) { fullFields.put(f, 1); } fullFields.putAll(mapFields); MongoConfigUtil.setFields(jobConf, new BasicDBObject(fullFields)); MongoConfigUtil.setInputKey(jobConf, idFields.get("id")); MongoConfigUtil.setInputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + mapCollection); MongoConfigUtil.setMapperOutputKey(jobConf, TenantAndIdEmittableKey.class); MongoConfigUtil.setMapperOutputValue(jobConf, BSONWritable.class); MongoConfigUtil.setOutputKey(jobConf, TenantAndIdEmittableKey.class); MongoConfigUtil.setOutputValue(jobConf, BSONWritable.class); // TODO - this probably should be configurable MongoConfigUtil.setReadSplitsFromSecondary(jobConf, true); MongoConfigUtil.setSplitSize(jobConf, 32); jobConf.setClass("mapred.input.key.class", TenantAndIdEmittableKey.class, EmittableKey.class); jobConf.setClass("mapred.input.value.class", BSONWritable.class, Object.class); jobConf.setClass("mapred.output.key.class", TenantAndIdEmittableKey.class, EmittableKey.class); jobConf.setClass("mapred.output.value.class", BSONWritable.class, Object.class); jobConf.setClass("mapreduce.inputformat.class", MongoTenantAndIdInputFormat.class, MongoInputFormat.class); jobConf.setClass("mapreduce.outputformat.class", MongoAggFormatter.class, MongoOutputFormat.class); MongoConfigUtil.setInputFormat(jobConf, MongoTenantAndIdInputFormat.class); MongoConfigUtil.setOutputFormat(jobConf, MongoAggFormatter.class); /** * Configure how hadoop calculates splits. * * We enable input splits to avoid having the entire job executed on a single hadoop node. * * We enable shard chunk splitting to allow mongo to specify how to split the input. * * We disable read splits from shards because we want hadoop connecting to mongos, not * mongod directly. This avoids incorrect results in situations where data is in the process * of migration at the same time hadoop is trying to read it. * * TODO - determine if we also need to set the input split key pattern. This depends * on how well data is distributed by _id. Setting the key pattern gives finer grained * control over how splits are calculated. 
*/ MongoConfigUtil.setCreateInputSplits(jobConf, true); MongoConfigUtil.setShardChunkSplittingEnabled(jobConf, true); MongoConfigUtil.setReadSplitsFromShards(jobConf, false); MongoConfigUtil.setOutputURI(jobConf, "mongodb://" + MONGO_HOST + "/" + reduceCollection); jobConf.setJarByClass(JobConfiguration.class); MongoConfigUtil.setMapper(jobConf, mapperClass); jobConf.setClass(JobContext.MAP_CLASS_ATTR, mapperClass, Mapper.class); MongoConfigUtil.setReducer(jobConf, reducerClass); jobConf.setClass(JobContext.REDUCE_CLASS_ATTR, reducerClass, Reducer.class); // Set this relatively high to keep the total map execution time low. // Formula: 1.75 * (# nodes * max tasks) // TODO : replace this hardcoded value with one calculated from configuration information. jobConf.setNumReduceTasks(52); // Add the configuration itself to the JobConf. JobConfiguration.toHadoopConfiguration(s, jobConf); return jobConf; }
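A JobConf produced by finalizeConfig is then handed to a job-submission API. One plausible way to run it is via the old-API JobClient, which honors the setUseNewMapper/setUseNewReducer flags; the wrapper class and method below are assumptions for illustration and are not part of the source file above.

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

// Hypothetical caller: submit the configured JobConf and wait for completion.
public class SubmitExample {
    public static void runConfiguredJob(JobConf jobConf) throws Exception {
        RunningJob job = JobClient.runJob(jobConf); // blocks until the job finishes
        if (!job.isSuccessful()) {
            throw new IllegalStateException("Map/reduce job failed: " + job.getJobName());
        }
    }
}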