Example usage for org.apache.hadoop.mapred JobConf setNumTasksToExecutePerJvm

List of usage examples for org.apache.hadoop.mapred JobConf setNumTasksToExecutePerJvm

Introduction

On this page you can find example usage of org.apache.hadoop.mapred JobConf setNumTasksToExecutePerJvm.

Prototype

public void setNumTasksToExecutePerJvm(int numTasks) 

Document

Sets the number of tasks that a spawned task JVM should run before it exits.
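
For orientation, here is a minimal, self-contained sketch (not taken from the examples below; the driver class name is hypothetical) showing the two settings that appear throughout this page: -1 enables unlimited JVM reuse, while 1, the Hadoop default, runs exactly one task per JVM.

import org.apache.hadoop.mapred.JobConf;

public class JvmReuseExample {
    public static void main(String[] args) {
        //any driver class works here; JvmReuseExample is just a placeholder
        JobConf job = new JobConf(JvmReuseExample.class);
        job.setJobName("jvm-reuse-example");

        //-1: each spawned task JVM may run an unlimited number of tasks,
        //amortizing JVM startup cost across many short tasks
        job.setNumTasksToExecutePerJvm(-1);

        //1 (the default): one task per JVM, isolating tasks at the cost
        //of repeated JVM startup
        //job.setNumTasksToExecutePerJvm(1);
    }
}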

Usage

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java

License:Open Source License

@Override
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen,
        long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-DPMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(DataPartitionerRemoteMR.class);
    if (_pfid >= 0) //use in parfor
        job.setJobName(jobname + _pfid);
    else //use for partition instruction
        job.setJobName("Partition-MR");

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        Path path = new Path(in.getFileName());

        /////
        //configure the MR job
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew,
                _keepIndexes);

        //set mappers, reducers, combiners
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(DataPartitionerRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            //binary cell intermediates for reduced IO 
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableBlock.class);

            //check Alignment
            if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0)
                    || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) {
                throw new DMLRuntimeException(
                        "Data partitioning format " + _format + " requires aligned blocks.");
            }
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, path);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        //FileOutputFormat.setOutputPath(job, pathNew);
        job.setOutputFormat(NullOutputFormat.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = -1;
        switch (_format) {
        case ROW_WISE:
            reducerGroups = rlen;
            break;
        case COLUMN_WISE:
            reducerGroups = clen;
            break;
        case ROW_BLOCK_WISE:
            reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE:
            reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1);
            break;
        case ROW_BLOCK_WISE_N:
            reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE_N:
            reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1);
            break;
        default:
            //do nothing
        }
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        /*if( ParForProgramBlock.USE_FLEX_SCHEDULER_CONF )
        {
           job.setInt("flex.map.min", 0);
           job.setInt("flex.map.max", _numMappers);
           job.setInt("flex.reduce.min", 0);
           job.setInt("flex.reduce.max", _numMappers);
        }*/

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS && _pfid >= 0) {
        long t1 = System.nanoTime(); //only for parfor 
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java

License:Open Source License

/**
 * @param pfid
 * @param itervar
 * @param matrixvar
 * @param program
 * @param resultFile
 * @param input
 * @param dpf
 * @param oi
 * @param tSparseCol
 * @param enableCPCaching
 * @param numReducers
 * @param replication
 * @param max_retry
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
        String resultFile, MatrixObject input, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
        boolean enableCPCaching, int numReducers, int replication, int max_retry) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo,
                oi, dpf, 1, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);

        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema

        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermediate output info: " + oi);
        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        job.setNumReduceTasks(numReducers);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //disable JVM reuse
        job.setNumTasksToExecutePerJvm(1); //-1 for unlimited 

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForMR.java

License:Open Source License

/**
 * @param pfid
 * @param program
 * @param taskFile
 * @param resultFile
 * @param colocatedDPMatrixObj
 * @param enableCPCaching
 * @param numMappers
 * @param replication
 * @param max_retry
 * @param minMem
 * @param jvmReuse
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the mapper
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mappers, reducers, combiners
        job.setMapperClass(RemoteParWorkerMapper.class); //map-only

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else //default case 
        {
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.priority", 0); //highest

            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", numMappers);
        }

        //set jvm memory size (if required)
        String memKey = "mapred.child.java.opts";
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
        job.setInt("io.sort.mb", 8); //8MB

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java

License:Open Source License

/**
 * @param fname    null if no comparison required
 * @param fnameNew
 * @param srcFnames
 * @param ii
 * @param oi
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi,
        long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare
            && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi)
                + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");

    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);

        /////
        //configure the MR job
        if (withCompare) {
            pathCompare = new Path(fname).makeQualified(FileSystem.get(job));
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen,
                    bclen);
        } else
            MRJobConfiguration.setResultMergeInfo(job, "null", ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen,
                    bclen);

        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting

            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path 
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);

        //set output format
        job.setOutputFormat(oi.outputFormatClass);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = _numReducers;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", _numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", _numMappers);
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   

        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}

From source file:com.ibm.bi.dml.runtime.matrix.GMR.java

License:Open Source License

/**
 * inBlockRepresentation: indicates whether to use block representation or cell representation
 * inputs: input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
 * inputInfos: the input format information for the input matrices
 * rlen: the number of rows for each matrix
 * clen: the number of columns for each matrix
 * brlen: the number of rows per block
 * bclen: the number of columns per block
 * instructionsInMapper: in Mapper, the set of unary operations that need to be performed on each input matrix
 * aggInstructionsInReducer: in Reducer, right after sorting, the set of aggregate operations that need
 *                      to be performed on each input matrix
 * otherInstructionsInReducer: the mixed operations that need to be performed on matrices after the aggregate operations
 * numReducers: the number of reducers
 * replication: the replication factor for the output
 * resultIndexes: the indexes of the result matrices that need to be output
 * outputs: the names for the output directories, one for each result index
 * outputInfos: output format information for the output matrices
 */

@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats,
        int[] psizes, String recordReaderInstruction, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs,
        OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");

    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction
                .parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job,
                (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass,
                inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job,
                (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata);

        if (ins.isValuePick) {
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2],
                    rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job,
                    (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, probs);

            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            for (byte i = 0; i < inputs.length; i++) {
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }

        } else {
            //PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
            PickFromCompactInputFormat.setRangePickPartFiles(job,
                    (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile(
                    (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }

    setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer, realinputs, realrlens,
            realclens);

    //set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper,
            aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens,
            distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

    //set up jvm reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes,
            mapoutputIndexes, false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    //MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos,
            inBlockRepresentation, true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }

    //set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }

    //configure reducer
    job.setReducerClass(GMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    //MatrixCharacteristics[] stats=new MatrixCharacteristics[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.matrix.ReblockMR.java

License:Open Source License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, long[] nnz, String instructionsInMapper,
        String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication,
        boolean jvmReuse, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(ReblockMR.class);
    job.setJobName("Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    //(internally used input converters: text2bc for text, identity for binary inputs)
    MRJobConfiguration.setUpMultipleInputsReblock(job, realIndexes, inputs, inputInfos, brlens, bclens);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens, nnz);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setReblockInstructions(job, reblockInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //disable automatic tasks timeouts and speculative task exec
    job.setInt("mapred.task.timeout", 0);
    job.setMapSpeculativeExecution(false);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //enable jvm reuse (based on SystemML configuration)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes,
            mapoutputIndexes, false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens, nnz,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    //setup in-memory reduce buffer budget (the reblock reducer doesn't need much memory)
    //job.set("mapred.job.reduce.input.buffer.percent", "0.70");

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ReblockMapper.class);
    job.setMapOutputKeyClass(MatrixIndexes.class); //represent key offsets for block
    job.setMapOutputValueClass(TaggedAdaptivePartialBlock.class); //binary cell/block

    //configure reducer
    job.setReducerClass(ReblockReducer.class);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.

    // at this point, both reblock_binary and reblock_text are similar
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        //   System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:hadoop.UIUCWikifierAppHadoop.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, UIUCWikifierAppHadoop.class);

    //      System.out.println("Run.. Envinronment Variables");
    //      java.util.Map<String,String> env = System.getenv();
    //      System.out.println("Printing environment variables");
    //      for(String k : env.keySet()){
    //         System.out.println(k + "\t" + env.get(k));
    //      }
    //      String jlpValue = System.getProperty("java.library.path");
    //      System.out.println("java.library.path=" + jlpValue);
    //      System.setProperty("java.library.path", jlpValue + ":" + "/home/jgilme1/bin/gurobi550/linux64/lib");

    //process command line options
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    //change current working directory to hdfs path..
    job.setJobName("entitylinker");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(DistributeInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setMapperClass(Map.class);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setNumReduceTasks(0);
    job.setNumMapTasks(Integer.parseInt(args[2]));
    job.set("mapreduce.input.fileinputformat.split.minsize", "0");
    job.set("mapred.child.java.opts", "-Xmx16g");
    job.setNumTasksToExecutePerJvm(-1);
    //job.setMemoryForMapTask(new Long(12288));
    //job.set(JobConf.MAPRED_MAP_TASK_ULIMIT, "12582912");

    String gurobiHomeVariable = "GUROBI_HOME";
    String gurobiHomeValue = "/home/jgilme1/bin/gurobi560/linux64";
    String pathVariable = "PATH";
    String newPathValue = gurobiHomeValue + "/bin";
    String ldLibraryPathVariable = "LD_LIBRARY_PATH";
    String ldLibraryPathValue = gurobiHomeValue + "/lib";
    String grbLicenseFileVariable = "GRB_LICENSE_FILE";
    String grbLicenseFileValue = "/scratch6/usr/jgilme1/gurobiLicense/gurobi.lic";

    StringBuilder newEnvironment = new StringBuilder();
    newEnvironment.append(gurobiHomeVariable);
    newEnvironment.append("=");
    newEnvironment.append(gurobiHomeValue);
    newEnvironment.append(",");
    newEnvironment.append(pathVariable);
    newEnvironment.append("=");
    newEnvironment.append("$" + pathVariable + ":");
    newEnvironment.append(newPathValue);
    newEnvironment.append(",");
    newEnvironment.append(ldLibraryPathVariable);
    newEnvironment.append("=$" + ldLibraryPathVariable + ":");
    newEnvironment.append(ldLibraryPathValue);
    newEnvironment.append(",");
    newEnvironment.append(grbLicenseFileVariable);
    newEnvironment.append("=");
    newEnvironment.append(grbLicenseFileValue);

    //System.out.println(newEnvironment.toString());
    job.set(JobConf.MAPRED_MAP_TASK_ENV, newEnvironment.toString());

    DistributedCache.addCacheArchive(new URI("/user/jgilme1/entitylinking/Wikifier2013.tar.gz"), job);

    JobClient.runJob(job);
    return 0;
}

From source file:map_reduce.MapReduce_OptimizedBrandesAdditions_DO_JUNG.java

License:Open Source License

@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage:\n");
        System.exit(1);
    }

    //       Job job = new Job(super.getConf());

    //      READ IN ALL COMMAND LINE ARGUMENTS
    //      EXAMPLE: 
    // hadoop jar MapReduce_OptimizedBrandesAdditions_DO_JUNG.jar
    // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar
    // -Dmapred.job.map.memory.mb=4096
    // -Dmapred.job.reduce.memory.mb=4096
    // -Dmapred.child.java.opts=-Xmx3500m
    // -Dmapreduce.task.timeout=60000000
    // -Dmapreduce.job.queuename=QUEUENAME
    // input_iterbrandes_additions_nocomb_10k_1 output_iterbrandes_additions_nocomb_10k_1
    // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/

    int m = -1;

    // input path to use on hdfs
    Path inputPath = new Path(args[++m]);

    // output path to use on hdfs
    Path outputPath = new Path(args[++m]);

    // number of Mappers to split the sources: e.g., 1, 10, 100 etc.
    // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number.
    int numOfMaps = Integer.parseInt(args[++m]);

    // number of Reducers to collect the output
    int numOfReduce = Integer.parseInt(args[++m]);

    // Number of vertices in graph
    int N = Integer.parseInt(args[++m]);

    // Number of edges in graph
    int M = Integer.parseInt(args[++m]);

    // Graph file (edge list, tab delimited) (full path)
    String graph = args[++m];

    // File with edges to be added (tab delimited) (full path)
    // Note: this version handles only edges between existing vertices in the graph.
    String random_edges = args[++m];

    // Number of random edges added
    int re = Integer.parseInt(args[++m]);

    // Experiment iteration (in case of multiple experiments)
    int iter = Integer.parseInt(args[++m]);

    // Use combiner or not (true/false)
    Boolean comb = Boolean.valueOf(args[++m]);

    // Output path for file with stats
    String statsoutputpath = args[++m];

    // Output path for file with final betweenness values
    String betoutputpath = args[++m];

    //      BEGIN INITIALIZATION

    JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesAdditions_DO_JUNG.class);
    FileSystem fs = FileSystem.get(conf);

    String setup = "_additions_edges" + re + "_maps" + numOfMaps + "_comb" + comb;
    conf.setJobName("OptimizedBrandesAdditionsDOJung_" + graph + setup + "_" + iter);
    conf.set("HDFS_GRAPH", graph + setup);
    conf.set("HDFS_Random_Edges", random_edges + setup);
    conf.set("output", outputPath.getName());
    conf.set("setup", setup);

    //      CREATE INPUT FILES FOR MAPPERS

    int numOfTasksperMap = (int) Math.ceil((double) N / numOfMaps); //cast to double so the ceiling applies to the real quotient
    //generate an input file for each map task
    for (int i = 0; i < numOfMaps - 1; i++) {
        Path file = new Path(inputPath, "part-r-" + i);
        IntWritable start = new IntWritable(i * numOfTasksperMap);
        IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1);

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class,
                IntWritable.class, CompressionType.NONE);
        try {
            writer.append(start, end);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end);
    }

    // last mapper takes what is left
    Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1));
    IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap);
    IntWritable end = new IntWritable(N - 1);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class,
            CompressionType.NONE);
    try {
        writer.append(start, end);
    } finally {
        writer.close();
    }
    System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end);

    //      COPY FILES TO MAPPERS
    System.out.println("Copying graph to cache");
    String LOCAL_GRAPH = graph;
    Path hdfsPath = new Path(graph + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    System.out.println("Copying random edges to cache");
    String LOCAL_Random_Edges = random_edges;
    hdfsPath = new Path(random_edges + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(IterBrandesMapper.class);
    conf.setNumMapTasks(numOfMaps);

    if (comb)
        conf.setCombinerClass(IterBrandesReducer.class);

    conf.setReducerClass(IterBrandesReducer.class);
    conf.setNumReduceTasks(numOfReduce);

    // turn off speculative execution, because DFS doesn't handle multiple writers to the same file.
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // conf.set("mapred.job.name", "APS-" + outputPath.getName());
    conf.setNumTasksToExecutePerJvm(-1); // JVM reuse

    System.out.println("Starting the execution...! Pray!! \n");
    long time1 = System.nanoTime();
    RunningJob rj = JobClient.runJob(conf);
    long time2 = System.nanoTime();

    //      READ OUTPUT FILES

    System.out.println("\nFinished and now reading/writing Betweenness Output...\n");

    // Assuming 1 reducer.
    Path inFile = new Path(outputPath, "part-00000");
    IntWritable id = new IntWritable();
    DoubleWritable betweenness = new DoubleWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);

    FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter));
    try {
        int i = 0;
        for (; i < (N + M + re); i++) {
            reader.next(id, betweenness);
            fw.write(id + "\t" + betweenness + "\n");
            fw.flush();
        }
    } finally {
        reader.close();
        fw.close();
    }

    System.out.println("\nWriting times Output...\n");

    fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter));

    fw.write("Total-time:\t" + (time2 - time1) + "\n");
    fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_MAPS") + "\n");
    fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_REDUCES") + "\n");
    fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("CPU_MILLISECONDS") + "\n");
    fw.write("total-gc-mr\t"
            + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter").getCounter("GC_TIME_MILLIS")
            + "\n");
    fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("PHYSICAL_MEMORY_BYTES") + "\n");
    fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("VIRTUAL_MEMORY_BYTES") + "\n");
    fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes")
            + "\n");
    fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n");
    fw.flush();

    try {
        Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator();
        while (counters.hasNext()) {
            Counter cc = counters.next();
            fw.write(cc.getName() + "\t" + cc.getCounter() + "\n");
            fw.flush();
        }
    } finally {
        fw.close();
    }

    return 0;
}

From source file:map_reduce.MapReduce_OptimizedBrandesDeletions_DO_JUNG.java

License:Open Source License

@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage:\n");
        System.exit(1);
    }

    //       Job job = new Job(super.getConf());

    //      READ IN ALL COMMAND LINE ARGUMENTS
    //      EXAMPLE: 
    // hadoop jar MapReduce_OptimizedBrandesDeletions_DO_JUNG.jar
    // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar
    // -Dmapred.job.map.memory.mb=4096
    // -Dmapred.job.reduce.memory.mb=4096
    // -Dmapred.child.java.opts=-Xmx3500m
    // -Dmapreduce.task.timeout=60000000
    // -Dmapreduce.job.queuename=QUEUENAME
    // input_iterbrandes_deletions_nocomb_10k_1 output_iterbrandes_deletions_nocomb_10k_1
    // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/

    int m = -1;

    // input path to use on hdfs
    Path inputPath = new Path(args[++m]);

    // output path to use on hdfs
    Path outputPath = new Path(args[++m]);

    // number of Mappers to split the sources: e.g., 1, 10, 100 etc.
    // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number.
    int numOfMaps = Integer.parseInt(args[++m]);

    // number of Reducers to collect the output
    int numOfReduce = Integer.parseInt(args[++m]);

    // Number of vertices in graph
    int N = Integer.parseInt(args[++m]);

    // Number of edges in graph
    int M = Integer.parseInt(args[++m]);

    // Graph file (edge list, tab delimited) (full path)
    String graph = args[++m];

    // File with edges to be deleted (tab delimited) (full path)
    // Note: this version handles only edges between existing vertices in the graph.
    String random_edges = args[++m];

    // Number of random edges deleted
    int re = Integer.parseInt(args[++m]);

    // Experiment iteration (in case of multiple experiments)
    int iter = Integer.parseInt(args[++m]);

    // Use combiner or not (true/false)
    Boolean comb = Boolean.valueOf(args[++m]);

    // Output path for file with stats
    String statsoutputpath = args[++m];

    // Output path for file with final betweenness values
    String betoutputpath = args[++m];

    //      BEGIN INITIALIZATION

    JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesDeletions_DO_JUNG.class);
    FileSystem fs = FileSystem.get(conf);

    String setup = "_deletions_edges" + re + "_maps" + numOfMaps + "_comb" + comb;
    conf.setJobName("OptimizedBrandesDeletionsDOJung_" + graph + setup + "_" + iter);
    conf.set("HDFS_GRAPH", graph + setup);
    conf.set("HDFS_Random_Edges", random_edges + setup);
    conf.set("output", outputPath.getName());
    conf.set("setup", setup);

    //      CREATE INPUT FILES FOR MAPPERS

    int numOfTasksperMap = (int) Math.ceil((double) N / numOfMaps); // cast to double so the ceiling is not lost to integer division
    //generate an input file for each map task
    for (int i = 0; i < numOfMaps - 1; i++) {
        Path file = new Path(inputPath, "part-r-" + i);
        IntWritable start = new IntWritable(i * numOfTasksperMap);
        IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1);

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class,
                IntWritable.class, CompressionType.NONE);
        try {
            writer.append(start, end);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end);
    }

    // last mapper takes what is left
    Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1));
    IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap);
    IntWritable end = new IntWritable(N - 1);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class,
            CompressionType.NONE);
    try {
        writer.append(start, end);
    } finally {
        writer.close();
    }
    System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end);

    //      COPY FILES TO MAPPERS
    System.out.println("Copying graph to cache");
    String LOCAL_GRAPH = graph;
    Path hdfsPath = new Path(graph + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    System.out.println("Copying random edges to cache");
    String LOCAL_Random_Edges = random_edges;
    hdfsPath = new Path(random_edges + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(IterBrandesMapper.class);
    conf.setNumMapTasks(numOfMaps);

    if (comb)
        conf.setCombinerClass(IterBrandesReducer.class);

    conf.setReducerClass(IterBrandesReducer.class);
    conf.setNumReduceTasks(numOfReduce);

    // turn off speculative execution, because DFS doesn't handle multiple writers to the same file.
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // conf.set("mapred.job.name", "APS-" + outputPath.getName());
    conf.setNumTasksToExecutePerJvm(-1); // -1 = unlimited tasks per spawned task JVM (JVM reuse)

    System.out.println("Starting the execution...! Pray!! \n");
    long time1 = System.nanoTime();
    RunningJob rj = JobClient.runJob(conf);
    long time2 = System.nanoTime();

    //      READ OUTPUT FILES

    System.out.println("\nFinished and now reading/writing Betweenness Output...\n");

    // Assuming 1 reducer.
    Path inFile = new Path(outputPath, "part-00000");
    IntWritable id = new IntWritable();
    DoubleWritable betweenness = new DoubleWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);

    FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter));
    try {
        int i = 0;
        for (; i < (N + (M - re)); i++) {
            reader.next(id, betweenness);
            fw.write(id + "\t" + betweenness + "\n");
            fw.flush();
        }
    } finally {
        reader.close();
        fw.close();
    }

    System.out.println("\nWriting times Output...\n");

    fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter));

    fw.write("Total-time:\t" + (time2 - time1) + "\n");
    fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_MAPS") + "\n");
    fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_REDUCES") + "\n");
    fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("CPU_MILLISECONDS") + "\n");
    fw.write("total-gc-mr\t"
            + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter").getCounter("GC_TIME_MILLIS")
            + "\n");
    fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("PHYSICAL_MEMORY_BYTES") + "\n");
    fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("VIRTUAL_MEMORY_BYTES") + "\n");
    fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes")
            + "\n");
    fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n");
    fw.flush();

    try {
        Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator();
        while (counters.hasNext()) {
            Counter cc = counters.next();
            fw.write(cc.getName() + "\t" + cc.getCounter() + "\n");
            fw.flush();
        }
    } finally {
        fw.close();
    }

    return 0;
}
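Both drivers read their results back under the comment "Assuming 1 reducer." and therefore open only part-00000. If numOfReduce were larger than one, a sketch of one way to scan every reducer output is shown below (assuming the same IntWritable/DoubleWritable SequenceFile output and the standard part-* file naming; ReadAllParts is an illustrative helper, not part of the example):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

public class ReadAllParts {
    // Reads every part-* SequenceFile under outputPath and prints id/betweenness pairs.
    public static void readParts(Configuration conf, Path outputPath) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        IntWritable id = new IntWritable();
        DoubleWritable betweenness = new DoubleWritable();
        for (FileStatus status : fs.listStatus(outputPath)) {
            if (!status.getPath().getName().startsWith("part-"))
                continue; // skip _SUCCESS and other side files
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, status.getPath(), conf);
            try {
                while (reader.next(id, betweenness)) {
                    System.out.println(id + "\t" + betweenness);
                }
            } finally {
                reader.close();
            }
        }
    }
}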

From source file:org.apache.sysml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java

License:Apache License

@Override
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen,
        long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-DPMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(DataPartitionerRemoteMR.class);
    if (_pfid >= 0) //use in parfor
        job.setJobName(jobname + _pfid);
    else //use for partition instruction
        job.setJobName("Partition-MR");

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        Path path = new Path(in.getFileName());

        /////
        //configure the MR job
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew,
                _keepIndexes);

        //set mappers, reducers, combiners
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(DataPartitionerRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            //binary cell intermediates for reduced IO 
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableBlock.class);

            //check Alignment
            if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0)
                    || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) {
                throw new DMLRuntimeException(
                        "Data partitioning format " + _format + " requires aligned blocks.");
            }
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, path);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        //FileOutputFormat.setOutputPath(job, pathNew);
        job.setOutputFormat(NullOutputFormat.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = -1;
        switch (_format) {
        case ROW_WISE:
            reducerGroups = rlen;
            break;
        case COLUMN_WISE:
            reducerGroups = clen;
            break;
        case ROW_BLOCK_WISE:
            reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE:
            reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1);
            break;
        case ROW_BLOCK_WISE_N:
            reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE_N:
            reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1);
            break;
        default:
            //do nothing
        }
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS, "true");
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, _replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS && _pfid >= 0) {
        long t1 = System.nanoTime(); //only for parfor 
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
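The partitioner above sizes its reduce phase with an integer ceiling division per block format and enables JVM reuse only behind a flag. A standalone sketch of that pattern against a plain JobConf (blockCeil, tune and the jvmReuse flag are illustrative names, not SysML API):

import org.apache.hadoop.mapred.JobConf;

public class PartitionJobTuning {
    // Integer ceiling division, e.g. blockCeil(rlen, brlen) reducer groups for row-block-wise partitioning.
    static long blockCeil(long len, long blockSize) {
        return (len / blockSize) + ((len % blockSize == 0) ? 0 : 1);
    }

    // Applies the same optimization parameters as the partitioner above:
    // bounded reducer count, no map speculation, and optional unlimited JVM reuse.
    static void tune(JobConf job, long reducerGroups, int numReducers, boolean jvmReuse) {
        job.setNumReduceTasks((int) Math.min(numReducers, reducerGroups));
        job.setMapSpeculativeExecution(false);
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); // unlimited tasks per spawned JVM
    }
}

For example, tune(job, blockCeil(rlen, brlen), numReducers, true) bounds the reducers by the number of row-block groups while turning on unlimited JVM reuse.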