List of usage examples for org.apache.hadoop.mapred JobConf get
public String get(String name)

Parameter: name - the property name.
Returns: the property value, or null if no such property exists.
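Before the real-world examples below, here is a minimal, self-contained sketch of the lookup pattern they all rely on. The property name example.threshold, the default value "0.5", and the ExampleConfiguredMapper class are hypothetical and only illustrate get(String) returning null for an unset key versus the two-argument overload (inherited from Configuration) that supplies a default:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class ExampleConfiguredMapper extends MapReduceBase {

    private String threshold;

    @Override
    public void configure(JobConf job) {
        // get(name) returns null when "example.threshold" was never set on the job
        String raw = job.get("example.threshold");
        if (raw == null) {
            raw = "0.5"; // fall back to a default value
        }
        threshold = raw;

        // equivalent one-liner using the default-value overload inherited from Configuration
        threshold = job.get("example.threshold", "0.5");
    }
}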
From source file:com.hyperiongray.ccmr.s3wordcount.WordCountOnlyMapper.java
License:Apache License
public void configure(JobConf job) {
    String keywordsfileContent = job.get("keywordsFileContent");
    contentMatcher = new ContentMatcher(keywordsfileContent);
    sampleSize = job.getInt("sampleSize", 100);
    logger.info("Running with sampleSize of:" + sampleSize);
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParWorkerReducer.java
License:Open Source License
@Override
public void configure(JobConf job) {
    //Step 1: configure data partitioning information
    _rlen = (int) MRJobConfiguration.getPartitioningNumRows(job);
    _clen = (int) MRJobConfiguration.getPartitioningNumCols(job);
    _brlen = MRJobConfiguration.getPartitioningBlockNumRows(job);
    _bclen = MRJobConfiguration.getPartitioningBlockNumCols(job);
    _iterVar = MRJobConfiguration.getPartitioningItervar(job);
    _inputVar = MRJobConfiguration.getPartitioningMatrixvar(job);
    _dpf = MRJobConfiguration.getPartitioningFormat(job);
    switch (_dpf) { //create matrix partition for reuse
        case ROW_WISE:
            _rlen = 1;
            break;
        case COLUMN_WISE:
            _clen = 1;
            break;
        default:
            throw new RuntimeException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }
    _info = MRJobConfiguration.getPartitioningOutputInfo(job);
    _tSparseCol = MRJobConfiguration.getPartitioningTransposedCol(job);
    if (_tSparseCol)
        _partition = new MatrixBlock((int) _clen, _rlen, true);
    else
        _partition = new MatrixBlock((int) _rlen, _clen, false);

    //Step 2: configure parworker
    String taskID = job.get("mapred.tip.id");
    LOG.trace("configure RemoteDPParWorkerReducer " + taskID);

    try {
        _stringID = taskID;
        _workerID = IDHandler.extractIntID(_stringID); //int task ID

        //use the given job configuration as source for all new job confs
        //NOTE: this is required because on HDP 2.3, the classpath of mr tasks contained hadoop-common.jar
        //which includes a core-default.xml configuration which hides the actual default cluster configuration
        //in the context of mr jobs (for example this config points to local fs instead of hdfs by default).
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }

        //create local runtime program
        String in = MRJobConfiguration.getProgramBlocks(job);
        ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
        _childBlocks = body.getChildBlocks();
        _ec = body.getEc();
        _resultVars = body.getResultVarNames();

        //init local cache manager
        if (!CacheableData.isCachingActive()) {
            String uuid = IDHandler.createDistributedUniqueID();
            LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
            CacheableData.initCaching(uuid); //incl activation, cache dir creation (each map task gets its own dir for simplified cleanup)
        }
        if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) { //account for local mode
            CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
        }

        //ensure that resultvar files are not removed
        super.pinResultVariables();

        //enable/disable caching (if required)
        boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
        if (!cpCaching)
            CacheableData.disableCaching();

        _numTasks = 0;
        _numIters = 0;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }

    //disable parfor stat monitoring, reporting execution times via counters not useful
    StatisticMonitor.disableStatMonitoring();

    //always reset stats because counters per map task (for case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job)) {
        CacheStatistics.reset();
        Statistics.reset();
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForMR.java
License:Open Source License
/**
 * @param pfid
 * @param program
 * @param taskFile
 * @param resultFile
 * @param _enableCPCaching
 * @param mode
 * @param numMappers
 * @param replication
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException
{
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the mapper
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mappers, reducers, combiners
        job.setMapperClass(RemoteParWorkerMapper.class); //map-only

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else //default case
        {
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.priority", 0); //highest
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", numMappers);
        }

        //set jvm memory size (if require)
        String memKey = "mapred.child.java.opts";
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
        job.setInt("io.sort.mb", 8); //8MB

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParWorkerMapper.java
License:Open Source License
@Override
public void configure(JobConf job) {
    boolean requiresConfigure = true;
    String jobID = job.get("mapred.job.id");

    //probe cache for existing worker (parfor body, symbol table, etc)
    if (ParForProgramBlock.ALLOW_REUSE_MR_PAR_WORKER) {
        synchronized (_sCache) //for multiple jobs in local mode
        {
            if (_sCache.containsKey(jobID)) {
                RemoteParWorkerMapper tmp = _sCache.get(jobID);

                _stringID = tmp._stringID;
                _workerID = tmp._workerID;
                _childBlocks = tmp._childBlocks;
                _resultVars = tmp._resultVars;
                _ec = tmp._ec;
                _numIters = tmp._numIters;
                _numTasks = tmp._numTasks;
                _rvarFnames = tmp._rvarFnames;

                requiresConfigure = false;
            }
        }
    }

    if (requiresConfigure) {
        LOG.trace("configure RemoteParWorkerMapper " + job.get("mapred.tip.id"));

        try {
            _stringID = job.get("mapred.tip.id"); //task ID
            _workerID = IDHandler.extractIntID(_stringID); //int task ID

            //use the given job configuration as source for all new job confs
            //NOTE: this is required because on HDP 2.3, the classpath of mr tasks contained hadoop-common.jar
            //which includes a core-default.xml configuration which hides the actual default cluster configuration
            //in the context of mr jobs (for example this config points to local fs instead of hdfs by default).
            if (!InfrastructureAnalyzer.isLocalMode(job)) {
                ConfigurationManager.setCachedJobConf(job);
            }

            //create local runtime program
            String in = MRJobConfiguration.getProgramBlocks(job);
            ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
            _childBlocks = body.getChildBlocks();
            _ec = body.getEc();
            _resultVars = body.getResultVarNames();

            //init local cache manager
            if (!CacheableData.isCachingActive()) {
                String uuid = IDHandler.createDistributedUniqueID();
                LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
                CacheableData.initCaching(uuid); //incl activation, cache dir creation (each map task gets its own dir for simplified cleanup)
            }
            if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) { //account for local mode
                CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
            }

            //ensure that resultvar files are not removed
            super.pinResultVariables();

            //enable/disable caching (if required)
            boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
            if (!cpCaching)
                CacheableData.disableCaching();

            _numTasks = 0;
            _numIters = 0;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        //disable stat monitoring, reporting execution times via counters not useful
        StatisticMonitor.disableStatMonitoring();

        //put into cache if required
        if (ParForProgramBlock.ALLOW_REUSE_MR_PAR_WORKER)
            synchronized (_sCache) { //for multiple jobs in local mode
                _sCache.put(jobID, this);
            }
    } else {
        LOG.trace("reuse configured RemoteParWorkerMapper " + _stringID);
    }

    //always reset stats because counters per map task (for case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job)) {
        CacheStatistics.reset();
        Statistics.reset();
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMapper.java
License:Open Source License
public void configure(JobConf job) {
    InputInfo ii = MRJobConfiguration.getResultMergeInputInfo(job);
    long[] tmp = MRJobConfiguration.getResultMergeMatrixCharacteristics(job);
    String compareFname = MRJobConfiguration.getResultMergeInfoCompareFilename(job);
    String currentFname = job.get("map.input.file");

    byte tag = 0;
    //startsWith comparison in order to account for part names in currentFname
    if (currentFname.startsWith(compareFname))
        tag = ResultMergeRemoteMR.COMPARE_TAG;
    else
        tag = ResultMergeRemoteMR.DATA_TAG;

    if (ii == InputInfo.TextCellInputInfo)
        _mapper = new ResultMergeMapperTextCell(tag);
    else if (ii == InputInfo.BinaryCellInputInfo)
        _mapper = new ResultMergeMapperBinaryCell(tag);
    else if (ii == InputInfo.BinaryBlockInputInfo)
        _mapper = new ResultMergeMapperBinaryBlock(tag, tmp[0], tmp[1], tmp[2], tmp[3]);
    else
        throw new RuntimeException("Unable to configure mapper with unknown input info: " + ii.toString());
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer.java
License:Open Source License
/**
 * @param job
 * @param key
 * @param bytes
 */
public static void setMaxMemoryOpt(JobConf job, String key, long bytes) {
    String javaOptsOld = job.get(key);
    String javaOptsNew = null;

    //StringTokenizer st = new StringTokenizer( javaOptsOld, " " );
    String[] tokens = javaOptsOld.split(" "); //account also for no ' '
    StringBuilder sb = new StringBuilder();
    for (String arg : tokens) {
        if (arg.startsWith("-Xmx")) //search for max mem
        {
            sb.append("-Xmx");
            sb.append((bytes / (1024 * 1024)));
            sb.append("M");
        } else
            sb.append(arg);
        sb.append(" ");
    }

    javaOptsNew = sb.toString().trim();
    job.set(key, javaOptsNew);
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer.java
License:Open Source License
/**
 * Analyzes only properties of hadoop configuration in order to prevent
 * expensive call to cluster status.
 */
private static void analyzeHadoopConfiguration() {
    JobConf job = ConfigurationManager.getCachedJobConf();

    _remoteMRSortMem = (1024 * 1024) * job.getLong("io.sort.mb", 100); //1MB

    //handle jvm max mem (map mem budget is relevant for map-side distcache and parfor)
    //(for robustness we probe both: child and map configuration parameters)
    String javaOpts1 = job.get("mapred.child.java.opts"); //internally mapred/mapreduce synonym
    String javaOpts2 = job.get("mapreduce.map.java.opts", null); //internally mapred/mapreduce synonym
    String javaOpts3 = job.get("mapreduce.reduce.java.opts", null); //internally mapred/mapreduce synonym
    if (javaOpts2 != null) //specific value overrides generic
        _remoteJVMMaxMemMap = extractMaxMemoryOpt(javaOpts2);
    else
        _remoteJVMMaxMemMap = extractMaxMemoryOpt(javaOpts1);
    if (javaOpts3 != null) //specific value overrides generic
        _remoteJVMMaxMemReduce = extractMaxMemoryOpt(javaOpts3);
    else
        _remoteJVMMaxMemReduce = extractMaxMemoryOpt(javaOpts1);

    //HDFS blocksize
    String blocksize = job.get(MRConfigurationNames.DFS_BLOCK_SIZE, "134217728");
    _blocksize = Long.parseLong(blocksize);

    //is yarn enabled
    String framework = job.get("mapreduce.framework.name");
    _yarnEnabled = (framework != null && framework.equals("yarn"));

    //analyze if local mode (internally requires yarn_enabled)
    _localJT = analyzeLocalMode(job);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} to the list of
 * inputs for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass) {
    String inputFormatMapping = path.toString() + ";" + inputFormatClass.getName();
    String inputFormats = conf.get("mapred.input.dir.formats");
    conf.set("mapred.input.dir.formats",
            inputFormats == null ? inputFormatMapping : inputFormats + "," + inputFormatMapping);

    conf.setInputFormat(DelegatingInputFormat.class);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass) {

    addInputPath(conf, path, inputFormatClass);

    String mapperMapping = path.toString() + ";" + mapperClass.getName();
    String mappers = conf.get("mapred.input.dir.mappers");
    conf.set("mapred.input.dir.mappers",
            mappers == null ? mapperMapping : mappers + "," + mapperMapping);

    conf.setMapperClass(DelegatingMapper.class);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Retrieves a map of {@link Path}s to the {@link InputFormat} class
 * that should be used for them.
 *
 * @param conf The configuration of the job
 * @see #addInputPath(JobConf, Path, Class)
 * @return A map of paths to inputformats for the job
 */
static Map<Path, InputFormat> getInputFormatMap(JobConf conf) {
    Map<Path, InputFormat> m = new HashMap<Path, InputFormat>();
    String[] pathMappings = conf.get("mapred.input.dir.formats").split(",");
    for (String pathMapping : pathMappings) {
        String[] split = pathMapping.split(";");
        InputFormat inputFormat;
        try {
            inputFormat = (InputFormat) ReflectionUtils.newInstance(conf.getClassByName(split[1]), conf);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
        m.put(new Path(split[0]), inputFormat);
    }
    return m;
}