Example usage for org.apache.hadoop.mapred JobConf setMapOutputValueClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setMapOutputValueClass.

Prototype

public void setMapOutputValueClass(Class<?> theClass) 

Source Link

Document

Set the value class for the map output data.

Usage

From source file:com.example.hadoop.mapreduce.test.MapReduceTest.java

License:Open Source License

public static void main(String[] args) throws IOException {
    String input = HDFS_PATH + "/input/README.txt";
    String input2 = HDFS_PATH + "/input/README2.txt";
    String output = HDFS_PATH + "/test/output";

    // ?mapreduce???
    if (HdfsClient.exists(output)) {
        HdfsClient.rm(output);/*from  w w w  . jav  a 2  s . com*/
    }

    JobConf conf = new JobConf(MapReduceTest.class);
    conf.setJobName("MapReduceTest");
    conf.addResource("classpath:/hadoop/core-site.xml");
    conf.addResource("classpath:/hadoop/hdfs-site.xml");
    conf.addResource("classpath:/hadoop/mapred-site.xml");

    // mapper
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    // reducer
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    // mapper
    conf.setMapperClass(MapperTest.class);
    // combiner?????mapper??reducer?
    conf.setCombinerClass(ReducerTest.class);
    // reducer
    conf.setReducerClass(ReducerTest.class);

    // MapReduce?
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // MapReduce?
    FileInputFormat.setInputPaths(conf, new Path[] { new Path(input), new Path(input2) });
    // MapReduce?
    FileOutputFormat.setOutputPath(conf, new Path(output));

    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:com.github.gaoyangthu.demo.mapred.DBCountPageView.java

License:Apache License

@Override
//Usage DBCountPageView [driverClass dburl]
public int run(String[] args) throws Exception {

    String driverClassName = DRIVER_CLASS;
    String url = DB_URL;/*from   w w w  .  j ava2 s  .co m*/

    if (args.length > 1) {
        driverClassName = args[0];
        url = args[1];
    }

    initialize(driverClassName, url);

    JobConf job = new JobConf(getConf(), DBCountPageView.class);

    job.setJobName("Count Pageviews of URLs");

    job.setMapperClass(PageviewMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(PageviewReducer.class);

    DBConfiguration.configureDB(job, driverClassName, url);

    DBInputFormat.setInput(job, AccessRecord.class, "Access", null, "url", AccessFieldNames);

    DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(PageviewRecord.class);
    job.setOutputValueClass(NullWritable.class);

    try {
        JobClient.runJob(job);

        boolean correct = verify();
        if (!correct) {
            throw new RuntimeException("Evaluation was not correct!");
        }
    } finally {
        shutdown();
    }
    return 0;
}

From source file:com.hadoop.secondarysort.SecondarySort_MapRed.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysrot <in> <out>");
        System.exit(2);/*from  ww w.  j  ava  2s.  c om*/
    }

    JobConf jobConf = new JobConf(conf);
    jobConf.setMapperClass(MapClass.class);
    jobConf.setReducerClass(Reduce.class);

    jobConf.setPartitionerClass(FirstPartitioner.class);
    jobConf.setOutputValueGroupingComparator(FirstGroupingComparator.class);

    jobConf.setMapOutputKeyClass(IntPair.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(IntWritable.class);

    //
    // Job job = new Job(conf, "secondary sort");
    // job.setJarByClass(SecondarySort_MapRed.class);
    // job.setMapperClass(MapClass.class);
    // job.setReducerClass(Reduce.class);
    //
    // // group and partition by the first int in the pair
    // job.setPartitionerClass(FirstPartitioner.class);
    // job.setGroupingComparatorClass(FirstGroupingComparator.class);
    // conf.setClass("mapred.output.key.comparator.class",
    // KeyComparator.class, RawComparator.class);
    // // job.setSortComparatorClass(SecondGroupingComparator.class);
    // // the map output is IntPair, IntWritable
    // job.setMapOutputKeyClass(IntPair.class);
    // job.setMapOutputValueClass(IntWritable.class);
    //
    // // the reduce output is Text, IntWritable
    // job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(jobConf, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs[1]));

}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java

License:Open Source License

@Override
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen,
        long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-DPMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(DataPartitionerRemoteMR.class);
    if (_pfid >= 0) //use in parfor
        job.setJobName(jobname + _pfid);
    else //use for partition instruction
        job.setJobName("Partition-MR");

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {/*  www.  ja  v  a2  s  . c  om*/
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        Path path = new Path(in.getFileName());

        /////
        //configure the MR job
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew,
                _keepIndexes);

        //set mappers, reducers, combiners
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(DataPartitionerRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            //binary cell intermediates for reduced IO 
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableBlock.class);

            //check Alignment
            if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0)
                    || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) {
                throw new DMLRuntimeException(
                        "Data partitioning format " + _format + " requires aligned blocks.");
            }
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, path);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        //FileOutputFormat.setOutputPath(job, pathNew);
        job.setOutputFormat(NullOutputFormat.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = -1;
        switch (_format) {
        case ROW_WISE:
            reducerGroups = rlen;
            break;
        case COLUMN_WISE:
            reducerGroups = clen;
            break;
        case ROW_BLOCK_WISE:
            reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE:
            reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1);
            break;
        case ROW_BLOCK_WISE_N:
            reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE_N:
            reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1);
            break;
        default:
            //do nothing
        }
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        /*if( ParForProgramBlock.USE_FLEX_SCHEDULER_CONF )
        {
           job.setInt("flex.map.min", 0);
           job.setInt("flex.map.max", _numMappers);
           job.setInt("flex.reduce.min", 0);
           job.setInt("flex.reduce.max", _numMappers);
        }*/

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS && _pfid >= 0) {
        long t1 = System.nanoTime(); //only for parfor 
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java

License:Open Source License

/**
 * /* ww  w .ja v a 2 s .  co m*/
 * @param pfid
 * @param program
 * @param taskFile
 * @param resultFile
 * @param enableCPCaching 
 * @param mode
 * @param numMappers
 * @param replication
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
        String resultFile, MatrixObject input, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
        boolean enableCPCaching, int numReducers, int replication, int max_retry) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo,
                oi, dpf, 1, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);

        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema

        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermrediate output info: " + oi);
        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        job.setNumReduceTasks(numReducers);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //disable JVM reuse
        job.setNumTasksToExecutePerJvm(1); //-1 for unlimited 

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForMR.java

License:Open Source License

/**
 * //from ww w .j  av a  2s .  co m
 * @param pfid
 * @param program
 * @param taskFile
 * @param resultFile
 * @param _enableCPCaching 
 * @param mode
 * @param numMappers
 * @param replication
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the mapper
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mappers, reducers, combiners
        job.setMapperClass(RemoteParWorkerMapper.class); //map-only

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else //default case 
        {
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.priority", 0); //highest

            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", numMappers);
        }

        //set jvm memory size (if require)
        String memKey = "mapred.child.java.opts";
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
        job.setInt("io.sort.mb", 8); //8MB

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java

License:Open Source License

/**
 * /*w  w  w .  ja v  a2s .c  o m*/
 * @param fname    null if no comparison required
 * @param fnameNew
 * @param srcFnames
 * @param ii
 * @param oi
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi,
        long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare
            && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi)
                + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");

    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);

        /////
        //configure the MR job
        if (withCompare) {
            pathCompare = new Path(fname).makeQualified(FileSystem.get(job));
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen,
                    bclen);
        } else
            MRJobConfiguration.setResultMergeInfo(job, "null", ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen,
                    bclen);

        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting

            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path 
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);

        //set output format
        job.setOutputFormat(oi.outputFormatClass);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = _numReducers;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", _numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", _numMappers);
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   

        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}

From source file:com.ibm.bi.dml.runtime.matrix.CMCOVMR.java

License:Open Source License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String cmNcomInstructions,
        int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos)
        throws Exception {
    JobConf job = new JobConf(CMCOVMR.class);
    job.setJobName("CM-COV-MR");

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);

    //added for handling recordreader instruction
    String[] realinputs = inputs;
    InputInfo[] realinputInfos = inputInfos;
    long[] realrlens = rlens;
    long[] realclens = clens;
    int[] realbrlens = brlens;
    int[] realbclens = bclens;
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;/*from ww w  . j av a  2s . co m*/

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens,
            true, ConvertTarget.WEIGHTEDCELL);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCM_N_COMInstructions(job, cmNcomInstructions);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes,
            instructionsInMapper, null, cmNcomInstructions, resultIndexes);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, new byte[resultIndexes.length], outputs,
            outputInfos, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CMCOVMRMapper.class);

    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(CM_N_COVCell.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.TagPartitioner.class);

    //configure reducer
    job.setReducerClass(CMCOVMRReducer.class);
    //job.setReducerClass(PassThroughReducer.class);

    MatrixCharacteristics[] stats = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
            instructionsInMapper, null, null, cmNcomInstructions, resultIndexes, mapoutputIndexes, false).stats;

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, mapoutputIndexes.size(), numReducers);//each output tag is a group

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.matrix.CombineMR.java

License:Open Source License

public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens,
        long[] clens, int[] brlens, int[] bclens, String combineInstructions, int numReducers, int replication,
        byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job;
    job = new JobConf(CombineMR.class);
    job.setJobName("Standalone-MR");

    boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);

    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);

    byte[] inputIndexes = new byte[inputs.length];
    for (byte b = 0; b < inputs.length; b++)
        inputIndexes[b] = b;//from   www .  j  a  v  a2s.co m

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, true,
            inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, inputIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, inputIndexes, brlens, bclens);

    //set up unary instructions that will perform in the mapper
    MRJobConfiguration.setInstructionsInMapper(job, "");

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setAggregateInstructions(job, "");

    //set up the instructions that will happen in the reducer, after the aggregation instrucions
    MRJobConfiguration.setInstructionsInReducer(job, "");

    MRJobConfiguration.setCombineInstructions(job, combineInstructions);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, inputIndexes, null,
            null, combineInstructions, resultIndexes);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, null, outputs, outputInfos,
            inBlockRepresentation);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(GMRMapper.class);

    job.setMapOutputKeyClass(MatrixIndexes.class);
    if (inBlockRepresentation)
        job.setMapOutputValueClass(TaggedMatrixBlock.class);
    else
        job.setMapOutputValueClass(TaggedMatrixCell.class);

    //configure reducer
    job.setReducerClass(InnerReducer.class);
    //job.setReducerClass(PassThroughReducer.class);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, inputIndexes, null,
            null, null, combineInstructions, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(stats);

    // By default, the job executes in "cluster" mode.
    // Determine if we can optimize and run it in "local" mode.
    MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
    }

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    return new JobReturn(stats, runjob.isSuccessful());
}

From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java

License:Open Source License

public static AssignRowIDMRReturn runAssignRowIDMRJob(String[] inputs, InputInfo[] inputInfos, int[] brlens,
        int[] bclens, String reblockInstructions, int replication, String[] smallestFiles, boolean transform,
        String naStrings, String specFile) throws Exception {
    AssignRowIDMRReturn ret = new AssignRowIDMRReturn();
    JobConf job;
    job = new JobConf(CSVReblockMR.class);
    job.setJobName("Assign-RowID-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;//from   w  w w  .j a v a 2s . c o m

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);

    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    //set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    //set up the number of reducers
    job.setNumReduceTasks(1);

    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //inst.printCompelteMRJobInstruction();

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVAssignRowIDMapper.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(OffsetCount.class);

    //configure reducer
    job.setReducerClass(CSVAssignRowIDReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //set up the output file
    ret.counterFile = new Path(MRJobConfiguration.constructTempOutputFilename());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, ret.counterFile);
    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(OffsetCount.class);

    // setup properties relevant to transform
    job.setBoolean(MRJobConfiguration.TF_TRANSFORM, transform);
    if (transform) {
        if (naStrings != null)
            // Adding "dummy" string to handle the case of na_strings = ""
            job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(naStrings));
        job.set(MRJobConfiguration.TF_SPEC_FILE, specFile);
    }

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */

    Group rgroup = runjob.getCounters().getGroup(NUM_ROWS_IN_MATRIX);
    Group cgroup = runjob.getCounters().getGroup(NUM_COLS_IN_MATRIX);
    ret.rlens = new long[inputs.length];
    ret.clens = new long[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        // number of non-zeros
        ret.rlens[i] = rgroup.getCounter(Integer.toString(i));
        ret.clens[i] = cgroup.getCounter(Integer.toString(i));
    }
    return ret;
}