Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf JobConf.

Prototype

public JobConf(boolean loadDefaults) 

Source Link

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
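
As a minimal sketch of what loadDefaults=false means in practice (not taken from the examples below; the property values are illustrative, the keys are standard Hadoop keys):

import org.apache.hadoop.mapred.JobConf;

public class JobConfNoDefaultsExample {
    public static void main(String[] args) {
        //skip loading the default resources (core-default.xml, core-site.xml);
        //only properties set explicitly on this instance are visible
        JobConf conf = new JobConf(false);
        conf.set("mapred.job.name", "example");

        System.out.println(conf.get("mapred.job.name")); //-> example
        System.out.println(conf.get("fs.default.name")); //-> null, defaults were not loaded
    }
}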

Usage

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java

License: Open Source License

/**
 * @param pfid
 * @param itervar
 * @param matrixvar
 * @param program
 * @param resultFile
 * @param input
 * @param dpf
 * @param oi
 * @param tSparseCol
 * @param enableCPCaching
 * @param numReducers
 * @param replication
 * @param max_retry
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
        String resultFile, MatrixObject input, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
        boolean enableCPCaching, int numReducers, int replication, int max_retry) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will be executed in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo,
                oi, dpf, 1, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);

        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema

        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermediate output info: " + oi);
        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of reducers
        job.setNumReduceTasks(numReducers);

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //disable JVM reuse
        job.setNumTasksToExecutePerJvm(1); //-1 for unlimited 

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}
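
A stripped-down sketch of the submission pattern the example above follows; the identity mapper/reducer and the "in"/"out" paths are illustrative placeholders, not part of the original source:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class MinimalJobConfPattern {
    public static void main(String[] args) throws Exception {
        //configure the MR job (same JobConf(Class) constructor as above)
        JobConf job = new JobConf(MinimalJobConfPattern.class);
        job.setJobName("minimal-example");

        //input: one text line per record
        job.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("in"));

        //identity map/reduce as placeholders for real worker classes
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);

        //output format, path, and key/value schema
        job.setOutputFormat(SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("out"));
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(1);

        //execute the MR job and check for success
        RunningJob runjob = JobClient.runJob(job); //blocks until completion
        System.out.println("successful: " + runjob.isSuccessful());
    }
}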

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java

License: Open Source License

/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    //Timing time = new Timing();
    //time.start();

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    //read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
    reader.next(key, value);
    reader.close();

    //parse task
    Task t = Task.parseCompactString(value.toString());

    //get all locations
    HashMap<String, Integer> hosts = new HashMap<String, Integer>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else //TaskType.RANGE
    {
        //since this is a serial process, we use just the first and last
        //iterations as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }

        /*
        int lFrom  = t.getIterations().get(0).getIntValue();
        int lTo    = t.getIterations().get(1).getIntValue();
        int lIncr  = t.getIterations().get(2).getIntValue();            
        for( int i=lFrom; i<=lTo; i+=lIncr )
        {
           String fname = _fname+"/"+String.valueOf( ((i-_offset)/_blen+_offset) );
           FileSystem fs = FileSystem.get(job);
           FileStatus status = fs.getFileStatus(new Path(fname)); 
           BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
           for( BlockLocation bl : tmp1 )
              countHosts(hosts, bl.getHosts());
        }*/
    }

    //System.out.println("Get locations "+time.stop()+"");

    //majority consensus on top host
    return getTopHosts(hosts);
}
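
Note the JobConf(Configuration) copy constructor used above: it clones the cached configuration so per-call modifications do not leak back into the shared instance. A minimal sketch of that pattern, where a plain Configuration stands in for the cached job conf and the override is only illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobConf;

public class CopyJobConfExample {
    public static void main(String[] args) throws Exception {
        Configuration shared = new Configuration(); //stands in for a cached job conf

        //copy constructor: changes to 'job' stay local to this instance
        JobConf job = new JobConf(shared);
        job.setInt("dfs.replication", 1); //illustrative override, not visible in 'shared'

        FileSystem fs = FileSystem.get(job); //same filesystem lookup as in getLocations()
        System.out.println(fs.getUri());
    }
}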

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForMR.java

License: Open Source License

/**
 * @param pfid
 * @param program
 * @param taskFile
 * @param resultFile
 * @param colocatedDPMatrixObj
 * @param enableCPCaching
 * @param numMappers
 * @param replication
 * @param max_retry
 * @param minMem
 * @param jvmReuse
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will be executed in the mappers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mapper class (map-only job, no reducers or combiners)
        job.setMapperClass(RemoteParWorkerMapper.class);

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else //default case 
        {
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.priority", 0); //highest

            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", numMappers);
        }

        //set jvm memory size (if required)
        String memKey = "mapred.child.java.opts";
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //enables the reuse of JVMs (multiple tasks per JVM)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessary large io buffer, guaranteed memory consumption)
        job.setInt("io.sort.mb", 8); //8MB

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job         
        RunningJob runjob = JobClient.runJob(job);

        // Process different counters 
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics
                    .incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics
                    .incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites(
                    (int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics
                    .incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics
                    .incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics
                    .incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics
                    .incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files 
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void mergeBinaryCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        SequenceFile.Writer out = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixCell.class); //beware ca 50ms

        MatrixIndexes key = new MatrixIndexes();
        MatrixCell value = new MatrixCell();

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());

                for (Path lpath : MatrixReader.getSequenceFilePaths(fs, tmpPath)) {
                    SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, tmpJob);
                    try {
                        while (reader.next(key, value)) {
                            out.append(key, value);
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge binary cell results.", ex);
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameStaging
 * @param mo
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private void createBinaryBlockStagingFile(String fnameStaging, MatrixObject mo) throws IOException {
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(tmpJob);
    Path tmpPath = new Path(mo.getFileName());

    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, tmpPath)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, tmpJob);
        try {
            while (reader.next(key, value)) //for each block
            {
                String lname = key.getRowIndex() + "_" + key.getColumnIndex();
                String dir = fnameStaging + "/" + lname;
                if (value.getNonZeros() > 0) //write only non-empty blocks
                {
                    LocalFileUtils.checkAndCreateStagingDir(dir);
                    LocalFileUtils.writeMatrixBlockToLocal(dir + "/" + _seq.getNextID(), value);
                }
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)

    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void createBinaryCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileSystem fs = FileSystem.get(job);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    MatrixIndexes key = new MatrixIndexes();
    MatrixCell value = new MatrixCell();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
        try {
            while (reader.next(key, value)) {
                Cell tmp = new Cell(key.getRowIndex(), key.getColumnIndex(), value.getValue());

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameStaging
 * @param fnameStagingCompare
 * @param fnameNew
 * @param metadata
 * @param withCompare
 * @throws IOException
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void createBinaryBlockResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew,
        MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fnameNew);

    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class); //beware ca 50ms
    try {
        MatrixIndexes indexes = new MatrixIndexes();
        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++)
            for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
                File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
                File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
                MatrixBlock mb = null;

                if (dir.exists()) {
                    if (withCompare && dir2.exists()) //WITH COMPARE BLOCK
                    {
                        //copy only values that are different from the original
                        String[] lnames2 = dir2.list();
                        if (lnames2.length != 1) //there should be exactly 1 compare block
                            throw new DMLRuntimeException(
                                    "Unable to merge results because multiple compare blocks found.");
                        mb = LocalFileUtils.readMatrixBlockFromLocal(dir2 + "/" + lnames2[0]);
                        boolean appendOnly = mb.isInSparseFormat();
                        double[][] compare = DataConverter.convertToDoubleMatrix(mb);

                        String[] lnames = dir.list();
                        for (String lname : lnames) {
                            MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                            mergeWithComp(mb, tmp, compare);
                        }

                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity representation if required after merge
                        mb.examSparsity();
                    } else //WITHOUT COMPARE BLOCK
                    {
                        //copy all non-zeros from all workers
                        String[] lnames = dir.list();
                        boolean appendOnly = false;
                        for (String lname : lnames) {
                            if (mb == null) {
                                mb = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                                appendOnly = mb.isInSparseFormat();
                            } else {
                                MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                                mergeWithoutComp(mb, tmp, appendOnly);
                            }
                        }

                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity representation if required after merge
                        mb.examSparsity();
                    }
                } else {
                    //NOTE: whenever runtime does not need all blocks anymore, this can be removed
                    int maxRow = (int) (((brow - 1) * brlen + brlen < rlen) ? brlen
                            : rlen - (brow - 1) * brlen);
                    int maxCol = (int) (((bcol - 1) * bclen + bclen < clen) ? bclen
                            : clen - (bcol - 1) * bclen);

                    mb = new MatrixBlock(maxRow, maxCol, true);
                }

                //mb.examSparsity(); //done on write anyway and mb not reused
                indexes.setIndexes(brow, bcol);
                writer.append(indexes, mb);
            }
    } finally {
        if (writer != null)
            writer.close();
    }
}

From source file: com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License: Open Source License

/**
 * @param fnameStaging
 * @param fnameStagingCompare
 * @param fnameNew
 * @param metadata
 * @param withCompare
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew,
        MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fnameNew);

    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
    try {
        //for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();

        boolean written = false;
        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++)
            for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
                File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
                File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
                MatrixBlock mb = null;

                long row_offset = (brow - 1) * brlen + 1;
                long col_offset = (bcol - 1) * bclen + 1;

                if (dir.exists()) {
                    if (withCompare && dir2.exists()) //WITH COMPARE BLOCK
                    {
                        //copy only values that are different from the original
                        String[] lnames2 = dir2.list();
                        if (lnames2.length != 1) //there should be exactly 1 compare block
                            throw new DMLRuntimeException(
                                    "Unable to merge results because multiple compare blocks found.");
                        mb = StagingFileUtils.readCellList2BlockFromLocal(dir2 + "/" + lnames2[0], brlen,
                                bclen);
                        boolean appendOnly = mb.isInSparseFormat();
                        double[][] compare = DataConverter.convertToDoubleMatrix(mb);

                        String[] lnames = dir.list();
                        for (String lname : lnames) {
                            MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname,
                                    brlen, bclen);
                            mergeWithComp(mb, tmp, compare);
                        }

                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity representation if required after merge
                        mb.examSparsity();
                    } else //WITHOUT COMPARE BLOCK
                    {
                        //copy all non-zeros from all workers
                        String[] lnames = dir.list();
                        boolean appendOnly = false;
                        for (String lname : lnames) {
                            if (mb == null) {
                                mb = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen,
                                        bclen);
                                appendOnly = mb.isInSparseFormat();
                            } else {
                                MatrixBlock tmp = StagingFileUtils
                                        .readCellList2BlockFromLocal(dir + "/" + lname, brlen, bclen);
                                mergeWithoutComp(mb, tmp, appendOnly);
                            }
                        }

                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity representation if required after merge
                        mb.examSparsity();
                    }
                }

                //write the block to text cell
                if (mb != null) {
                    if (mb.isInSparseFormat()) {
                        SparseRowsIterator iter = mb.getSparseRowsIterator();
                        while (iter.hasNext()) {
                            IJV lcell = iter.next();
                            sb.append(row_offset + lcell.i);
                            sb.append(' ');
                            sb.append(col_offset + lcell.j);
                            sb.append(' ');
                            sb.append(lcell.v);
                            sb.append('\n');
                            out.write(sb.toString());
                            sb.setLength(0);
                            written = true;
                        }
                    } else {
                        for (int i = 0; i < brlen; i++)
                            for (int j = 0; j < bclen; j++) {
                                double lvalue = mb.getValueDenseUnsafe(i, j);
                                if (lvalue != 0) //for nnz
                                {
                                    sb.append(row_offset + i);
                                    sb.append(' ');
                                    sb.append(col_offset + j);
                                    sb.append(' ');
                                    sb.append(lvalue);
                                    sb.append('\n');
                                    out.write(sb.toString());
                                    sb.setLength(0);
                                    written = true;
                                }
                            }
                    }
                }
            }

        if (!written)
            out.write("1 1 0\n");
    } finally {
        if (out != null)
            out.close();
    }
}