Example usage for org.apache.hadoop.mapred RunningJob isSuccessful

Introduction

On this page you can find example usage for org.apache.hadoop.mapred RunningJob isSuccessful.

Prototype

public boolean isSuccessful() throws IOException;

Document

Check if the job completed successfully.
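Before the examples from real projects below, here is a minimal, self-contained sketch of the typical call pattern: configure a JobConf, submit it with JobClient.runJob (which blocks until the job finishes), and then query isSuccessful() on the returned RunningJob. The class name, job name, and input/output paths in this sketch are placeholders chosen for illustration; they are not taken from any of the source files listed on this page.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class IsSuccessfulExample {
    public static void main(String[] args) throws IOException {
        //configure a minimal job (placeholder paths, default identity mapper/reducer)
        JobConf job = new JobConf(IsSuccessfulExample.class);
        job.setJobName("example-job");
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("example_input"));
        FileOutputFormat.setOutputPath(job, new Path("example_output"));

        //submit the job and wait for it to complete
        RunningJob runjob = JobClient.runJob(job);

        //check whether the completed job finished successfully
        if (runjob.isSuccessful())
            System.out.println("Job " + runjob.getID() + " completed successfully.");
        else
            System.err.println("Job " + runjob.getID() + " failed.");
    }
}

In the SystemML examples below, the same pattern appears at the end of each runJob method: the boolean returned by runjob.isSuccessful() is typically passed into the JobReturn or RemoteParForJobReturn object that the method returns.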

Usage

From source file: org.apache.sysml.runtime.controlprogram.parfor.RemoteDPParForMR.java

License: Apache License

public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
        String resultFile, MatrixObject input, PartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
        boolean enableCPCaching, int numReducers, int replication) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo,
                oi, dpf._dpf, dpf._N, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);

        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema

        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermrediate output info: " + oi);
        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        job.setNumReduceTasks(numReducers);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //disable JVM reuse
        job.setNumTasksToExecutePerJvm(1); //-1 for unlimited 

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}

From source file: org.apache.sysml.runtime.controlprogram.parfor.RemoteParForMR.java

License: Apache License

public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the mapper
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mappers, reducers, combiners
        job.setMapperClass(RemoteParWorkerMapper.class); //map-only

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else //default case 
        {
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //set jvm memory size (if required)
        String memKey = MRConfigurationNames.MR_CHILD_JAVA_OPTS;
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
        job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8); //8MB

        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}

From source file: org.apache.sysml.runtime.matrix.CleanupMR.java

License: Apache License

public static boolean runJob(DMLConfig conf) throws Exception {
    boolean ret = false;

    try {
        JobConf job;
        job = new JobConf(CleanupMR.class);
        job.setJobName("Cleanup-MR");

        //set up SystemML local tmp dir
        String dir = conf.getTextValue(DMLConfig.LOCAL_TMP_DIR);
        MRJobConfiguration.setSystemMLLocalTmpDir(job, dir);

        //set mappers, reducers 
        int numNodes = InfrastructureAnalyzer.getRemoteParallelNodes();
        job.setMapperClass(CleanupMapper.class); //map-only
        job.setNumMapTasks(numNodes); //numMappers
        job.setNumReduceTasks(0);

        //set input/output format, input path
        String inFileName = conf.getTextValue(DMLConfig.SCRATCH_SPACE) + "/cleanup_tasks";
        job.setInputFormat(NLineInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);

        Path path = new Path(inFileName);
        FileInputFormat.setInputPaths(job, path);
        writeCleanupTasksToFile(path, numNodes);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        ret = runjob.isSuccessful();
    } catch (Exception ex) {
        //don't raise an exception, just gracefully log an error message.
        LOG.error("Failed to run cleanup MR job. ", ex);
    }

    return ret;
}

From source file: org.apache.sysml.runtime.matrix.CMCOVMR.java

License: Apache License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String cmNcomInstructions,
        int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(CMCOVMR.class);
    job.setJobName("CM-COV-MR");

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);

    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens,
            true, ConvertTarget.WEIGHTEDCELL);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCM_N_COMInstructions(job, cmNcomInstructions);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, null, cmNcomInstructions, resultIndexes);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, new byte[resultIndexes.length], outputs,
            outputInfos, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CMCOVMRMapper.class);

    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(CM_N_COVCell.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.TagPartitioner.class);

    //configure reducer
    job.setReducerClass(CMCOVMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);

    MatrixCharacteristics[] stats = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, null, null, cmNcomInstructions, resultIndexes, mapoutputIndexes, false).stats;

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, mapoutputIndexes.size(), numReducers);//each output tag is a group

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file: org.apache.sysml.runtime.matrix.CombineMR.java

License: Apache License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String combineInstructions, int numReducers, int replication,
        byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job;
    job = new JobConf(CombineMR.class);
    job.setJobName("Standalone-MR");

    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

    byte[] inputIndexes = new byte[inputs.length];
    for (byte b = 0; b < inputs.length; b++)
        inputIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, true,
            inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, "");

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, "");

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, "");

    MRJobConfiguration.setCombineInstructions(job, combineInstructions);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, inputIndexes, null,
            null, combineInstructions, resultIndexes);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, null, outputs, outputInfos,
            inBlockRepresentation);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);

    job.setMapOutputKeyClass(MatrixIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
    else
        job.setMapOutputValueClass(TaggedMatrixCell.class);

    //configure reducer
    job.setReducerClass(InnerReducer.class);
    //job.setReducerClass(PassThroughReducer.class);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, inputIndexes, null,
            null, null, combineInstructions, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    return new JobReturn(stats, runjob.isSuccessful());
}

From source file: org.apache.sysml.runtime.matrix.CSVReblockMR.java

License: Apache License

private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
        long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job;
    job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);

    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes,
            false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS),
            ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //   inst.printCompleteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        //   System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file: org.apache.sysml.runtime.matrix.DataGenMR.java

License: Apache License

/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);

    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];

    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;

    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];

        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;

        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();

        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);

        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());

            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));

                //for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();

                //seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i],
                        randInst.getSparsity());
                PrimitiveIterator.OfLong nnzIter = nnz.iterator();
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < clens[i]; c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));

                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(nnzIter.nextLong());
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            maxsparsity = 1.0; //always dense

            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;

            //handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);

            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");

            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");

            // Compute the number of rows in the sequence
            long numrows = UtilFunctions.getSeqLength(from, to, incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException(
                            "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: "
                                    + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }

            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException(
                        "Unexpected error while processing sequence instruction. Number of columns (" + clens[i]
                                + ") must be equal to 1.");
            else
                clens[i] = 1;

            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();

                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));

                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) 
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    temp = block_to + incr; // next block starts from here

                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);

                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    dataGenInsStr = dataGenInsStr.substring(1);//remove the first ","
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
                ConvertTarget.BLOCK);

        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

        //set up the block size
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);

        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

        //set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set up custom map/reduce configurations 
        MRJobConfiguration.setupCustomMRConfigurations(job, config);

        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmappers = Math
                .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmappers);

        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer,
                resultIndexes);

        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
                dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null,
                otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;

        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);

        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }

        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform")
                || instructionsInMapper.contains("groupedagg");

        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos,
                true, mayContainCtable);

        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }

        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);

        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);

        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        runjob = JobClient.runJob(job);

        /* Process different counters */

        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }

        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);

    } finally {
        for (String input : inputs)
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file: org.apache.sysml.runtime.matrix.GMR.java

License: Apache License

/**
 * Execute job.
 * 
 * @param inst MR job instruction
 * @param inputs input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
 * @param inputInfos the input format information for the input matrices
 * @param rlens array of number of rows
 * @param clens array of number of columns
 * @param brlens array of number of rows in block
 * @param bclens array of number of columns in block
 * @param partitioned boolean array of partitioned status
 * @param pformats array of data partition formats
 * @param psizes does nothing
 * @param recordReaderInstruction record reader instruction
 * @param instructionsInMapper in Mapper, the set of unary operations that need to be performed on each input matrix
 * @param aggInstructionsInReducer in Reducer, right after sorting, the set of aggregate operations
 * that need to be performed on each input matrix
 * @param otherInstructionsInReducer the mixed operations that need to be performed on matrices after the aggregate operations
 * @param numReducers the number of reducers
 * @param replication the replication factor for the output
 * @param jvmReuse if true, reuse JVM
 * @param resultIndexes the indexes of the result matrices that needs to be outputted
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs the names for the output directories, one for each result index
 * @param outputInfos output format information for the output matrices
 * @return job return object
 * @throws Exception if Exception occurs
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, boolean[] partitioned, PDataPartitionFormat[] pformats,
        int[] psizes, String recordReaderInstruction, String instructionsInMapper,
        String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication,
        boolean jvmReuse, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs,
        OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GMR.class);
    job.setJobName("G-MR");

    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    if (recordReaderInstruction != null && !recordReaderInstruction.isEmpty()) {
        assert (inputs.length <= 2);
        PickByCountInstruction ins = (PickByCountInstruction) PickByCountInstruction
                .parseInstruction(recordReaderInstruction);
        PickFromCompactInputFormat.setKeyValueClasses(job,
                (Class<? extends WritableComparable>) inputInfos[ins.input1].inputKeyClass,
                inputInfos[ins.input1].inputValueClass);
        job.setInputFormat(PickFromCompactInputFormat.class);
        PickFromCompactInputFormat.setZeroValues(job,
                (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata);

        if (ins.isValuePick) {
            double[] probs = MapReduceTool.readColumnVectorFromHDFS(inputs[ins.input2], inputInfos[ins.input2],
                    rlens[ins.input2], clens[ins.input2], brlens[ins.input2], bclens[ins.input2]);
            PickFromCompactInputFormat.setPickRecordsInEachPartFile(job,
                    (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, probs);

            realinputs = new String[inputs.length - 1];
            realinputInfos = new InputInfo[inputs.length - 1];
            realrlens = new long[inputs.length - 1];
            realclens = new long[inputs.length - 1];
            realbrlens = new int[inputs.length - 1];
            realbclens = new int[inputs.length - 1];
            realIndexes = new byte[inputs.length - 1];
            byte realIndex = 0;
            for (byte i = 0; i < inputs.length; i++) {
                if (i == ins.input2)
                    continue;
                realinputs[realIndex] = inputs[i];
                realinputInfos[realIndex] = inputInfos[i];
                if (i == ins.input1) {
                    realrlens[realIndex] = rlens[ins.input2];
                    realclens[realIndex] = clens[ins.input2];
                    realbrlens[realIndex] = 1;
                    realbclens[realIndex] = 1;
                    realIndexes[realIndex] = ins.output;
                } else {
                    realrlens[realIndex] = rlens[i];
                    realclens[realIndex] = clens[i];
                    realbrlens[realIndex] = brlens[i];
                    realbclens[realIndex] = bclens[i];
                    realIndexes[realIndex] = i;
                }
                realIndex++;
            }

        } else {
            //PickFromCompactInputFormat.setPickRecordsInEachPartFile(job, (NumItemsByEachReducerMetaData) inputInfos[ins.input1].metadata, ins.cst, 1-ins.cst);
            PickFromCompactInputFormat.setRangePickPartFiles(job,
                    (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst, 1 - ins.cst);
            realrlens[ins.input1] = UtilFunctions.getLengthForInterQuantile(
                    (MetaDataNumItemsByEachReducer) inputInfos[ins.input1].metadata, ins.cst);
            realclens[ins.input1] = clens[ins.input1];
            realbrlens[ins.input1] = 1;
            realbclens[ins.input1] = 1;
            realIndexes[ins.input1] = ins.output;
        }
    }

    boolean resetDistCache = setupDistributedCache(job, instructionsInMapper, otherInstructionsInReducer,
            realinputs, realrlens, realclens);

    //set up the input files and their format information
    boolean[] distCacheOnly = getDistCacheOnlyInputs(realIndexes, recordReaderInstruction, instructionsInMapper,
            aggInstructionsInReducer, otherInstructionsInReducer);
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens,
            distCacheOnly, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
    MRJobConfiguration.setInputPartitioningInfo(job, pformats);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

    //set up custom map/reduce configurations 
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up jvm reuse (incl. reuse of loaded dist cache matrices)
    if (jvmReuse)
        job.setNumTasksToExecutePerJvm(-1);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes,
            mapoutputIndexes, false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }
    //MRJobConfiguration.updateResultDimsUnknown(job,resultDimsUnknown);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos,
            inBlockRepresentation, true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);
    if (numReducers == 0) {
        job.setMapOutputKeyClass(Writable.class);
        job.setMapOutputValueClass(Writable.class);
    } else {
        job.setMapOutputKeyClass(MatrixIndexes.class);
        if (inBlockRepresentation)
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        else
            job.setMapOutputValueClass(TaggedMatrixPackedCell.class);
    }

    //set up combiner
    if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) {
        job.setCombinerClass(GMRCombiner.class);
    }

    //configure reducer
    job.setReducerClass(GMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++)
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));

    //cleanups
    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);
    if (resetDistCache)
        MRBaseForCommonInstructions.resetDistCache();

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file: org.apache.sysml.runtime.matrix.GroupedAggMR.java

License: Apache License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String grpAggInstructions,
        String simpleReduceInstructions/*only scalar or reorg instructions allowed*/, int numReducers,
        int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs,
        OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(GroupedAggMR.class);
    job.setJobName("GroupedAgg-MR");

    //whether use block representation or cell representation
    //MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);
    MRJobConfiguration.setMatrixValueClass(job, false);

    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens,
            true, ConvertTarget.WEIGHTEDCELL);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
    MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);

    //set up the grouped aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setGroupedAggInstructions(job, grpAggInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, simpleReduceInstructions);

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, numReducers, numReducers);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up which matrices need to be passed from the mapper to the reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, null, grpAggInstructions,
            resultIndexes);

    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++)
        stats[i] = new MatrixCharacteristics();

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    // mark all result dimensions as unknown; the actual dimensions are recovered from the dims files after the job
    for (int i = 0; i < resultIndexes.length; i++)
        resultDimsUnknown[i] = (byte) 2;

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GroupedAggMRMapper.class);
    job.setCombinerClass(GroupedAggMRCombiner.class);
    job.setMapOutputKeyClass(TaggedMatrixIndexes.class);
    job.setMapOutputValueClass(WeightedCell.class);

    //configure reducer
    job.setReducerClass(GroupedAggMRReducer.class);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //execute job
    RunningJob runjob = JobClient.runJob(job);

    //get important output statistics 
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i] = new MatrixCharacteristics();
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
    stats = MapReduceTool.processDimsFiles(dir, stats);
    MapReduceTool.deleteFileIfExistOnHDFS(dir);

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
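
After the blocking run, the code above reads a per-output counter group before returning runjob.isSuccessful(). The following hedged sketch shows that counter-reading step on its own; the group name "my.counters.NNZ" and the numResults parameter are hypothetical stand-ins for MRJobConfiguration.NUM_NONZERO_CELLS and resultIndexes.length.

// Hedged sketch: read a custom counter group from a finished RunningJob and
// guard on isSuccessful() before trusting the counters. The group name and
// numResults are hypothetical, not the SysML constants.
import java.io.IOException;

import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.RunningJob;

public class CounterReadSketch {
    public static long[] readNonZeroCounters(RunningJob runjob, int numResults) throws IOException {
        // only a successfully completed job has trustworthy output counters
        if (!runjob.isSuccessful()) {
            throw new IOException("Job " + runjob.getID() + " did not complete successfully");
        }
        Counters counters = runjob.getCounters();
        Group group = counters.getGroup("my.counters.NNZ"); // hypothetical group name
        long[] nnz = new long[numResults];
        for (int i = 0; i < numResults; i++) {
            // counters are keyed by the result index, as in the jobs above
            nnz[i] = group.getCounter(Integer.toString(i));
        }
        return nnz;
    }
}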

From source file:org.apache.sysml.runtime.matrix.MMRJMR.java

License:Apache License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String aggInstructionsInReducer,
        String aggBinInstrctions, String otherInstructionsInReducer, int numReducers, int replication,
        byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(MMRJMR.class);
    job.setJobName("MMRJ-MR");

    if (numReducers <= 0)
        throw new Exception("MMRJ-MR has to have at least one reduce task!");

    // TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

    //whether to use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true,
            inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up the unary instructions that will be performed in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);

    //set up the aggregate binary operation for the mmcj job
    MRJobConfiguration.setAggregateBinaryInstructions(job, aggBinInstrctions);

    //set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up map/reduce memory configurations (if in AM context)
    DMLConfig config = ConfigurationManager.getDMLConfig();
    DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

    //set up custom map/reduce configurations 
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    // byte[] resultIndexes=new byte[]{AggregateBinaryInstruction.parseMRInstruction(aggBinInstrction).output};

    //set up which matrices need to be passed from the mapper to the reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, otherInstructionsInReducer,
            resultIndexes, mapoutputIndexes, false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    byte[] dimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            dimsUnknown[i] = (byte) 1;
        } else {
            dimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos,
            inBlockRepresentation);

    // configure mapper
    job.setMapperClass(MMRJMRMapper.class);
    job.setMapOutputKeyClass(TripleIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
    else
        job.setMapOutputValueClass(TaggedMatrixCell.class);
    job.setOutputKeyComparatorClass(TripleIndexes.Comparator.class);
    job.setPartitionerClass(TripleIndexes.FirstTwoIndexesPartitioner.class);

    //configure combiner
    //TODO: cannot set up combiner, because it will destroy the stable numerical algorithms 
    // for sum or for central moments 

    //   if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
    //      job.setCombinerClass(MMCJMRCombiner.class);

    //configure reducer
    job.setReducerClass(MMRJMRReducer.class);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
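
All of the jobs above rely on the blocking JobClient.runJob call. For comparison, here is a hedged sketch of the non-blocking route in the same old mapred API: submit the job, poll isComplete(), and only then consult isSuccessful(). The class name and polling interval are illustrative, not taken from the SysML sources.

// Hedged sketch (not SysML code): submit a JobConf asynchronously, poll progress,
// then check isSuccessful() once the job is complete.
import java.io.IOException;

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class AsyncRunSketch {
    public static boolean submitAndWait(JobConf job) throws IOException, InterruptedException {
        JobClient client = new JobClient(job);
        RunningJob runjob = client.submitJob(job); // returns immediately

        // poll until the job is complete, reporting coarse progress
        while (!runjob.isComplete()) {
            System.out.printf("map %.0f%% reduce %.0f%%%n",
                    runjob.mapProgress() * 100, runjob.reduceProgress() * 100);
            Thread.sleep(5000); // illustrative polling interval
        }

        // isSuccessful() is only meaningful once isComplete() is true
        return runjob.isSuccessful();
    }
}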