List of usage examples for org.apache.hadoop.mapred JobConf get
public String get(String name)

Parameter: name - the property name.
Returns: the property value, or null if no such property exists.
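Before the real-world examples below, here is a minimal, self-contained sketch of the lookup pattern they all rely on. The property name example.threshold, the default value "0.5", and the ExampleConfiguredMapper class are hypothetical and only illustrate get(String) returning null for an unset key versus the two-argument overload (inherited from Configuration) that supplies a default:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

public class ExampleConfiguredMapper extends MapReduceBase {

    private String threshold;

    @Override
    public void configure(JobConf job) {
        // get(name) returns null when "example.threshold" was never set on the job
        String raw = job.get("example.threshold");
        if (raw == null) {
            raw = "0.5"; // fall back to a default value
        }
        threshold = raw;

        // equivalent one-liner using the default-value overload inherited from Configuration
        threshold = job.get("example.threshold", "0.5");
    }
}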
From source file:com.hyperiongray.ccmr.s3wordcount.WordCountOnlyMapper.java
License:Apache License
public void configure(JobConf job) {
    String keywordsfileContent = job.get("keywordsFileContent");
    contentMatcher = new ContentMatcher(keywordsfileContent);
    sampleSize = job.getInt("sampleSize", 100);
    logger.info("Running with sampleSize of:" + sampleSize);
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParWorkerReducer.java
License:Open Source License
@Override
public void configure(JobConf job) {
    //Step 1: configure data partitioning information
    _rlen = (int) MRJobConfiguration.getPartitioningNumRows(job);
    _clen = (int) MRJobConfiguration.getPartitioningNumCols(job);
    _brlen = MRJobConfiguration.getPartitioningBlockNumRows(job);
    _bclen = MRJobConfiguration.getPartitioningBlockNumCols(job);
    _iterVar = MRJobConfiguration.getPartitioningItervar(job);
    _inputVar = MRJobConfiguration.getPartitioningMatrixvar(job);
    _dpf = MRJobConfiguration.getPartitioningFormat(job);
    switch (_dpf) { //create matrix partition for reuse
        case ROW_WISE:
            _rlen = 1;
            break;
        case COLUMN_WISE:
            _clen = 1;
            break;
        default:
            throw new RuntimeException("Partition format not yet supported in fused partition-execute: " + _dpf);
    }
    _info = MRJobConfiguration.getPartitioningOutputInfo(job);
    _tSparseCol = MRJobConfiguration.getPartitioningTransposedCol(job);
    if (_tSparseCol)
        _partition = new MatrixBlock((int) _clen, _rlen, true);
    else
        _partition = new MatrixBlock((int) _rlen, _clen, false);

    //Step 2: configure parworker
    String taskID = job.get("mapred.tip.id");
    LOG.trace("configure RemoteDPParWorkerReducer " + taskID);

    try {
        _stringID = taskID;
        _workerID = IDHandler.extractIntID(_stringID); //int task ID

        //use the given job configuration as source for all new job confs
        //NOTE: this is required because on HDP 2.3, the classpath of mr tasks contained hadoop-common.jar
        //which includes a core-default.xml configuration which hides the actual default cluster configuration
        //in the context of mr jobs (for example this config points to local fs instead of hdfs by default).
        if (!InfrastructureAnalyzer.isLocalMode(job)) {
            ConfigurationManager.setCachedJobConf(job);
        }

        //create local runtime program
        String in = MRJobConfiguration.getProgramBlocks(job);
        ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
        _childBlocks = body.getChildBlocks();
        _ec = body.getEc();
        _resultVars = body.getResultVarNames();

        //init local cache manager
        if (!CacheableData.isCachingActive()) {
            String uuid = IDHandler.createDistributedUniqueID();
            LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
            CacheableData.initCaching(uuid); //incl activation, cache dir creation (each map task gets its own dir for simplified cleanup)
        }
        if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) { //account for local mode
            CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
        }

        //ensure that resultvar files are not removed
        super.pinResultVariables();

        //enable/disable caching (if required)
        boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
        if (!cpCaching)
            CacheableData.disableCaching();

        _numTasks = 0;
        _numIters = 0;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }

    //disable parfor stat monitoring, reporting execution times via counters not useful
    StatisticMonitor.disableStatMonitoring();

    //always reset stats because counters per map task (for case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job)) {
        CacheStatistics.reset();
        Statistics.reset();
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForMR.java
License:Open Source License
/**
 * @param pfid
 * @param program
 * @param taskFile
 * @param resultFile
 * @param _enableCPCaching
 * @param mode
 * @param numMappers
 * @param replication
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException
{
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the mapper
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mappers, reducers, combiners
        job.setMapperClass(RemoteParWorkerMapper.class); //map-only

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else //default case
        {
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.priority", 0); //highest
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", numMappers);
        }

        //set jvm memory size (if require)
        String memKey = "mapred.child.java.opts";
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
        job.setInt("io.sort.mb", 8); //8MB

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParWorkerMapper.java
License:Open Source License
@Override
public void configure(JobConf job) {
    boolean requiresConfigure = true;
    String jobID = job.get("mapred.job.id");

    //probe cache for existing worker (parfor body, symbol table, etc)
    if (ParForProgramBlock.ALLOW_REUSE_MR_PAR_WORKER) {
        synchronized (_sCache) //for multiple jobs in local mode
        {
            if (_sCache.containsKey(jobID)) {
                RemoteParWorkerMapper tmp = _sCache.get(jobID);

                _stringID = tmp._stringID;
                _workerID = tmp._workerID;
                _childBlocks = tmp._childBlocks;
                _resultVars = tmp._resultVars;
                _ec = tmp._ec;
                _numIters = tmp._numIters;
                _numTasks = tmp._numTasks;
                _rvarFnames = tmp._rvarFnames;

                requiresConfigure = false;
            }
        }
    }

    if (requiresConfigure) {
        LOG.trace("configure RemoteParWorkerMapper " + job.get("mapred.tip.id"));

        try {
            _stringID = job.get("mapred.tip.id"); //task ID
            _workerID = IDHandler.extractIntID(_stringID); //int task ID

            //use the given job configuration as source for all new job confs
            //NOTE: this is required because on HDP 2.3, the classpath of mr tasks contained hadoop-common.jar
            //which includes a core-default.xml configuration which hides the actual default cluster configuration
            //in the context of mr jobs (for example this config points to local fs instead of hdfs by default).
            if (!InfrastructureAnalyzer.isLocalMode(job)) {
                ConfigurationManager.setCachedJobConf(job);
            }

            //create local runtime program
            String in = MRJobConfiguration.getProgramBlocks(job);
            ParForBody body = ProgramConverter.parseParForBody(in, (int) _workerID);
            _childBlocks = body.getChildBlocks();
            _ec = body.getEc();
            _resultVars = body.getResultVarNames();

            //init local cache manager
            if (!CacheableData.isCachingActive()) {
                String uuid = IDHandler.createDistributedUniqueID();
                LocalFileUtils.createWorkingDirectoryWithUUID(uuid);
                CacheableData.initCaching(uuid); //incl activation, cache dir creation (each map task gets its own dir for simplified cleanup)
            }
            if (!CacheableData.cacheEvictionLocalFilePrefix.contains("_")) { //account for local mode
                CacheableData.cacheEvictionLocalFilePrefix = CacheableData.cacheEvictionLocalFilePrefix + "_" + _workerID;
            }

            //ensure that resultvar files are not removed
            super.pinResultVariables();

            //enable/disable caching (if required)
            boolean cpCaching = MRJobConfiguration.getParforCachingConfig(job);
            if (!cpCaching)
                CacheableData.disableCaching();

            _numTasks = 0;
            _numIters = 0;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        //disable stat monitoring, reporting execution times via counters not useful
        StatisticMonitor.disableStatMonitoring();

        //put into cache if required
        if (ParForProgramBlock.ALLOW_REUSE_MR_PAR_WORKER)
            synchronized (_sCache) { //for multiple jobs in local mode
                _sCache.put(jobID, this);
            }
    } else {
        LOG.trace("reuse configured RemoteParWorkerMapper " + _stringID);
    }

    //always reset stats because counters per map task (for case of JVM reuse)
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job)) {
        CacheStatistics.reset();
        Statistics.reset();
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMapper.java
License:Open Source License
public void configure(JobConf job) {
    InputInfo ii = MRJobConfiguration.getResultMergeInputInfo(job);
    long[] tmp = MRJobConfiguration.getResultMergeMatrixCharacteristics(job);
    String compareFname = MRJobConfiguration.getResultMergeInfoCompareFilename(job);
    String currentFname = job.get("map.input.file");

    byte tag = 0;
    //startsWith comparison in order to account for part names in currentFname
    if (currentFname.startsWith(compareFname))
        tag = ResultMergeRemoteMR.COMPARE_TAG;
    else
        tag = ResultMergeRemoteMR.DATA_TAG;

    if (ii == InputInfo.TextCellInputInfo)
        _mapper = new ResultMergeMapperTextCell(tag);
    else if (ii == InputInfo.BinaryCellInputInfo)
        _mapper = new ResultMergeMapperBinaryCell(tag);
    else if (ii == InputInfo.BinaryBlockInputInfo)
        _mapper = new ResultMergeMapperBinaryBlock(tag, tmp[0], tmp[1], tmp[2], tmp[3]);
    else
        throw new RuntimeException("Unable to configure mapper with unknown input info: " + ii.toString());
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer.java
License:Open Source License
/**
 * @param job
 * @param key
 * @param bytes
 */
public static void setMaxMemoryOpt(JobConf job, String key, long bytes) {
    String javaOptsOld = job.get(key);
    String javaOptsNew = null;

    //StringTokenizer st = new StringTokenizer( javaOptsOld, " " );
    String[] tokens = javaOptsOld.split(" "); //account also for no ' '
    StringBuilder sb = new StringBuilder();
    for (String arg : tokens) {
        if (arg.startsWith("-Xmx")) //search for max mem
        {
            sb.append("-Xmx");
            sb.append((bytes / (1024 * 1024)));
            sb.append("M");
        } else
            sb.append(arg);
        sb.append(" ");
    }

    javaOptsNew = sb.toString().trim();
    job.set(key, javaOptsNew);
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer.java
License:Open Source License
/**
 * Analyzes only properties of hadoop configuration in order to prevent
 * expensive call to cluster status.
 */
private static void analyzeHadoopConfiguration() {
    JobConf job = ConfigurationManager.getCachedJobConf();

    _remoteMRSortMem = (1024 * 1024) * job.getLong("io.sort.mb", 100); //1MB

    //handle jvm max mem (map mem budget is relevant for map-side distcache and parfor)
    //(for robustness we probe both: child and map configuration parameters)
    String javaOpts1 = job.get("mapred.child.java.opts"); //internally mapred/mapreduce synonym
    String javaOpts2 = job.get("mapreduce.map.java.opts", null); //internally mapred/mapreduce synonym
    String javaOpts3 = job.get("mapreduce.reduce.java.opts", null); //internally mapred/mapreduce synonym
    if (javaOpts2 != null) //specific value overrides generic
        _remoteJVMMaxMemMap = extractMaxMemoryOpt(javaOpts2);
    else
        _remoteJVMMaxMemMap = extractMaxMemoryOpt(javaOpts1);
    if (javaOpts3 != null) //specific value overrides generic
        _remoteJVMMaxMemReduce = extractMaxMemoryOpt(javaOpts3);
    else
        _remoteJVMMaxMemReduce = extractMaxMemoryOpt(javaOpts1);

    //HDFS blocksize
    String blocksize = job.get(MRConfigurationNames.DFS_BLOCK_SIZE, "134217728");
    _blocksize = Long.parseLong(blocksize);

    //is yarn enabled
    String framework = job.get("mapreduce.framework.name");
    _yarnEnabled = (framework != null && framework.equals("yarn"));

    //analyze if local mode (internally requires yarn_enabled)
    _localJT = analyzeLocalMode(job);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} to the list of
 * inputs for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass) {
    String inputFormatMapping = path.toString() + ";" + inputFormatClass.getName();
    String inputFormats = conf.get("mapred.input.dir.formats");
    conf.set("mapred.input.dir.formats",
            inputFormats == null ? inputFormatMapping : inputFormats + "," + inputFormatMapping);

    conf.setInputFormat(DelegatingInputFormat.class);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass) {

    addInputPath(conf, path, inputFormatClass);

    String mapperMapping = path.toString() + ";" + mapperClass.getName();
    String mappers = conf.get("mapred.input.dir.mappers");
    conf.set("mapred.input.dir.mappers",
            mappers == null ? mapperMapping : mappers + "," + mapperMapping);

    conf.setMapperClass(DelegatingMapper.class);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Retrieves a map of {@link Path}s to the {@link InputFormat} class
 * that should be used for them.
 *
 * @param conf The configuration of the job
 * @see #addInputPath(JobConf, Path, Class)
 * @return A map of paths to inputformats for the job
 */
static Map<Path, InputFormat> getInputFormatMap(JobConf conf) {
    Map<Path, InputFormat> m = new HashMap<Path, InputFormat>();
    String[] pathMappings = conf.get("mapred.input.dir.formats").split(",");
    for (String pathMapping : pathMappings) {
        String[] split = pathMapping.split(";");
        InputFormat inputFormat;
        try {
            inputFormat = (InputFormat) ReflectionUtils.newInstance(conf.getClassByName(split[1]), conf);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
        m.put(new Path(split[0]), inputFormat);
    }
    return m;
}