List of usage examples for the org.apache.hadoop.mapred.JobConf constructors
public JobConf(Class exampleClass)
public JobConf(Configuration conf)
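Before the full examples, here is a minimal, self-contained sketch of constructing and submitting a job with JobConf using the old (mapred) API. The class name, job name, and input/output paths are illustrative placeholders, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class JobConfExample {
    public static void main(String[] args) throws Exception {
        //JobConf(Class) loads default resources and uses the class to locate the job jar
        JobConf job = new JobConf(JobConfExample.class);
        job.setJobName("jobconf-example"); //illustrative name

        //map-only identity job over text input
        job.setMapperClass(IdentityMapper.class);
        job.setNumReduceTasks(0);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);  //TextInputFormat key type
        job.setOutputValueClass(Text.class);        //TextInputFormat value type

        FileInputFormat.setInputPaths(job, new Path(args[0]));  //e.g. "in"
        FileOutputFormat.setOutputPath(job, new Path(args[1])); //e.g. "out"

        JobClient.runJob(job); //submit and block until completion
    }
}

The examples below follow the same pattern, but wrap the JobConf either around a driver class (new JobConf(SomeJob.class)) or around a cached Configuration (new JobConf(ConfigurationManager.getCachedJobConf())).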
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java
License:Open Source License
/**
 * @param pfid
 * @param itervar
 * @param matrixvar
 * @param program
 * @param resultFile
 * @param input
 * @param dpf
 * @param oi
 * @param tSparseCol
 * @param enableCPCaching
 * @param numReducers
 * @param replication
 * @param max_retry
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program,
        String resultFile, MatrixObject input, PDataPartitionFormat dpf, OutputInfo oi, boolean tSparseCol, //config params
        boolean enableCPCaching, int numReducers, int replication, int max_retry) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo,
                oi, dpf, 1, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);

        //set mapper and reducer classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema

        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermediate output info: " + oi);

        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of reducers
        job.setNumReduceTasks(numReducers);

        //disable automatic task timeouts and speculative task execution
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //disable JVM reuse
        job.setNumTasksToExecutePerJvm(1); //-1 for unlimited

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        //execute the MR job
        RunningJob runjob = JobClient.runJob(job);

        //process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        //read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        //remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java
License:Open Source License
/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    //Timing time = new Timing();
    //time.start();

    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    //read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
    reader.next(key, value);
    reader.close();

    //parse task
    Task t = Task.parseCompactString(value.toString());

    //get all locations
    HashMap<String, Integer> hosts = new HashMap<String, Integer>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else //TaskType.RANGE
    {
        //since this is a serial process, we use just the first iteration
        //as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }

        /* int lFrom = t.getIterations().get(0).getIntValue();
           int lTo = t.getIterations().get(1).getIntValue();
           int lIncr = t.getIterations().get(2).getIntValue();
           for( int i=lFrom; i<=lTo; i+=lIncr ) {
               String fname = _fname+"/"+String.valueOf( ((i-_offset)/_blen+_offset) );
               FileSystem fs = FileSystem.get(job);
               FileStatus status = fs.getFileStatus(new Path(fname));
               BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
               for( BlockLocation bl : tmp1 )
                   countHosts(hosts, bl.getHosts());
           } */
    }

    //System.out.println("Get locations "+time.stop()+"");

    //majority consensus on top host
    return getTopHosts(hosts);
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForMR.java
License:Open Source License
/**
 * @param pfid
 * @param program
 * @param taskFile
 * @param resultFile
 * @param colocatedDPMatrixObj
 * @param enableCPCaching
 * @param numMappers
 * @param replication
 * @param max_retry
 * @param minMem
 * @param jvmReuse
 * @return
 * @throws DMLRuntimeException
 */
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile,
        MatrixObject colocatedDPMatrixObj, //inputs
        boolean enableCPCaching, int numMappers, int replication, int max_retry, long minMem, boolean jvmReuse) //opt params
        throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-EMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(RemoteParForMR.class);
    job.setJobName(jobname + pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        /////
        //configure the MR job

        //set arbitrary CP program blocks that will perform in the mapper
        MRJobConfiguration.setProgramBlocks(job, program);

        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);

        //set mappers, reducers, combiners
        job.setMapperClass(RemoteParWorkerMapper.class); //map-only

        //set input format (one split per row, NLineInputFormat default N=1)
        if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
            job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
            MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
            MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
            MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
            MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
            MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
        } else //default case
        {
            job.setInputFormat(NLineInputFormat.class);
        }

        //set the input path and output path
        FileInputFormat.setInputPaths(job, new Path(taskFile));

        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));

        //set the output key, value schema
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        job.setNumMapTasks(numMappers); //numMappers
        job.setNumReduceTasks(0);
        //job.setInt("mapred.map.tasks.maximum", 1); //system property
        //job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
        //job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.priority", 0); //highest
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", numMappers);
        }

        //set jvm memory size (if required)
        String memKey = "mapred.child.java.opts";
        if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
            InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
            LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
        }

        //disable automatic task timeouts and speculative task execution
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //enable the reuse of JVMs (multiple tasks per MR task)
        if (jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //set sort io buffer (reduce unnecessarily large io buffer, guaranteed memory consumption)
        job.setInt("io.sort.mb", 8); //8MB

        //set the replication factor for the results
        job.setInt("dfs.replication", replication);

        //set the max number of retries per map task
        //disabled job-level configuration to respect cluster configuration
        //note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        //execute the MR job
        RunningJob runjob = JobClient.runJob(job);

        //process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }

        //read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);

        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        //remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }

    return ret;
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void mergeBinaryCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);

        SequenceFile.Writer out = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class,
                MatrixCell.class); //beware ca 50ms

        MatrixIndexes key = new MatrixIndexes();
        MatrixCell value = new MatrixCell();

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());

                for (Path lpath : MatrixReader.getSequenceFilePaths(fs, tmpPath)) {
                    SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, tmpJob);
                    try {
                        while (reader.next(key, value)) {
                            out.append(key, value);
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge binary cell results.", ex);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param mo
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private void createBinaryBlockStagingFile(String fnameStaging, MatrixObject mo) throws IOException {
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(tmpJob);
    Path tmpPath = new Path(mo.getFileName());

    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, tmpPath)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, tmpJob);
        try {
            while (reader.next(key, value)) //for each block
            {
                String lname = key.getRowIndex() + "_" + key.getColumnIndex();
                String dir = fnameStaging + "/" + lname;
                if (value.getNonZeros() > 0) //write only non-empty blocks
                {
                    LocalFileUtils.checkAndCreateStagingDir(dir);
                    LocalFileUtils.writeMatrixBlockToLocal(dir + "/" + _seq.getNextID(), value);
                }
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    //  errors at runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    //  It works fine with int row, col but we require long for larger matrices.
    //  Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    //  we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)

    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void createBinaryCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileSystem fs = FileSystem.get(job);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    MatrixIndexes key = new MatrixIndexes();
    MatrixCell value = new MatrixCell();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
        try {
            while (reader.next(key, value)) {
                Cell tmp = new Cell(key.getRowIndex(), key.getColumnIndex(), value.getValue());

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param fnameStagingCompare
 * @param fnameNew
 * @param metadata
 * @param withCompare
 * @throws IOException
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void createBinaryBlockResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew,
        MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fnameNew);

    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class,
            MatrixBlock.class); //beware ca 50ms
    try {
        MatrixIndexes indexes = new MatrixIndexes();
        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++)
            for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
                File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
                File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
                MatrixBlock mb = null;

                if (dir.exists()) {
                    if (withCompare && dir2.exists()) //WITH COMPARE BLOCK
                    {
                        //copy only values that are different from the original
                        String[] lnames2 = dir2.list();
                        if (lnames2.length != 1) //there should be exactly 1 compare block
                            throw new DMLRuntimeException(
                                    "Unable to merge results because multiple compare blocks found.");
                        mb = LocalFileUtils.readMatrixBlockFromLocal(dir2 + "/" + lnames2[0]);
                        boolean appendOnly = mb.isInSparseFormat();
                        double[][] compare = DataConverter.convertToDoubleMatrix(mb);

                        String[] lnames = dir.list();
                        for (String lname : lnames) {
                            MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                            mergeWithComp(mb, tmp, compare);
                        }

                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity if required after
                        mb.examSparsity();
                    } else //WITHOUT COMPARE BLOCK
                    {
                        //copy all non-zeros from all workers
                        String[] lnames = dir.list();
                        boolean appendOnly = false;
                        for (String lname : lnames) {
                            if (mb == null) {
                                mb = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                                appendOnly = mb.isInSparseFormat();
                            } else {
                                MatrixBlock tmp = LocalFileUtils.readMatrixBlockFromLocal(dir + "/" + lname);
                                mergeWithoutComp(mb, tmp, appendOnly);
                            }
                        }

                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity if required after
                        mb.examSparsity();
                    }
                } else {
                    //NOTE: whenever runtime does not need all blocks anymore, this can be removed
                    int maxRow = (int) (((brow - 1) * brlen + brlen < rlen) ? brlen : rlen - (brow - 1) * brlen);
                    int maxCol = (int) (((bcol - 1) * bclen + bclen < clen) ? bclen : clen - (bcol - 1) * bclen);
                    mb = new MatrixBlock(maxRow, maxCol, true);
                }

                //mb.examSparsity(); //done on write anyway and mb not reused
                indexes.setIndexes(brow, bcol);
                writer.append(indexes, mb);
            }
    } finally {
        if (writer != null)
            writer.close();
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param fnameStagingCompare
 * @param fnameNew
 * @param metadata
 * @param withCompare
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellResultFile(String fnameStaging, String fnameStagingCompare, String fnameNew,
        MatrixFormatMetaData metadata, boolean withCompare) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fnameNew);

    MatrixCharacteristics mc = metadata.getMatrixCharacteristics();
    long rlen = mc.getRows();
    long clen = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
    try {
        //for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();

        boolean written = false;
        for (long brow = 1; brow <= (long) Math.ceil(rlen / (double) brlen); brow++)
            for (long bcol = 1; bcol <= (long) Math.ceil(clen / (double) bclen); bcol++) {
                File dir = new File(fnameStaging + "/" + brow + "_" + bcol);
                File dir2 = new File(fnameStagingCompare + "/" + brow + "_" + bcol);
                MatrixBlock mb = null;

                long row_offset = (brow - 1) * brlen + 1;
                long col_offset = (bcol - 1) * bclen + 1;

                if (dir.exists()) {
                    if (withCompare && dir2.exists()) //WITH COMPARE BLOCK
                    {
                        //copy only values that are different from the original
                        String[] lnames2 = dir2.list();
                        if (lnames2.length != 1) //there should be exactly 1 compare block
                            throw new DMLRuntimeException(
                                    "Unable to merge results because multiple compare blocks found.");
                        mb = StagingFileUtils.readCellList2BlockFromLocal(dir2 + "/" + lnames2[0], brlen, bclen);
                        boolean appendOnly = mb.isInSparseFormat();
                        double[][] compare = DataConverter.convertToDoubleMatrix(mb);

                        String[] lnames = dir.list();
                        for (String lname : lnames) {
                            MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname,
                                    brlen, bclen);
                            mergeWithComp(mb, tmp, compare);
                        }

                        //sort sparse and exam sparsity due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity if required after
                        mb.examSparsity();
                    } else //WITHOUT COMPARE BLOCK
                    {
                        //copy all non-zeros from all workers
                        String[] lnames = dir.list();
                        boolean appendOnly = false;
                        for (String lname : lnames) {
                            if (mb == null) {
                                mb = StagingFileUtils.readCellList2BlockFromLocal(dir + "/" + lname, brlen,
                                        bclen);
                                appendOnly = mb.isInSparseFormat();
                            } else {
                                MatrixBlock tmp = StagingFileUtils.readCellList2BlockFromLocal(
                                        dir + "/" + lname, brlen, bclen);
                                mergeWithoutComp(mb, tmp, appendOnly);
                            }
                        }

                        //sort sparse due to append-only
                        if (appendOnly)
                            mb.sortSparseRows();

                        //change sparsity if required after
                        mb.examSparsity();
                    }
                }

                //write the block to text cell
                if (mb != null) {
                    if (mb.isInSparseFormat()) {
                        SparseRowsIterator iter = mb.getSparseRowsIterator();
                        while (iter.hasNext()) {
                            IJV lcell = iter.next();
                            sb.append(row_offset + lcell.i);
                            sb.append(' ');
                            sb.append(col_offset + lcell.j);
                            sb.append(' ');
                            sb.append(lcell.v);
                            sb.append('\n');
                            out.write(sb.toString());
                            sb.setLength(0);
                            written = true;
                        }
                    } else {
                        for (int i = 0; i < brlen; i++)
                            for (int j = 0; j < bclen; j++) {
                                double lvalue = mb.getValueDenseUnsafe(i, j);
                                if (lvalue != 0) //for nnz
                                {
                                    sb.append(row_offset + i);
                                    sb.append(' ');
                                    sb.append(col_offset + j);
                                    sb.append(' ');
                                    sb.append(lvalue);
                                    sb.append('\n');
                                    out.write(sb.toString());
                                    sb.setLength(0);
                                    written = true;
                                }
                            }
                    }
                }
            }

        if (!written)
            out.write("1 1 0\n");
    } finally {
        if (out != null)
            out.close();
    }
}