List of usage examples for the nextLong() method of org.apache.commons.math3.random.Well1024a
public long nextLong()
From source file:com.ibm.bi.dml.runtime.matrix.data.LibMatrixDatagen.java
/** * /* w ww . ja va 2 s.com*/ * @param bigrand * @param nrb * @param ncb * @return */ private static long[] generateSeedsForCP(Well1024a bigrand, int nrb, int ncb) { int numBlocks = nrb * ncb; long[] seeds = new long[numBlocks]; for (int l = 0; l < numBlocks; l++) { // case of CP: generate a block-level seed from matrix-level Well1024a seed seeds[l] = bigrand.nextLong(); } return seeds; }
From source file:com.ibm.bi.dml.runtime.matrix.DataGenMR.java
/** * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p> * // w ww .ja va 2 s . c om * @param numRows number of rows for each random object * @param numCols number of columns for each random object * @param blockRowSize number of rows in a block for each random object * @param blockColSize number of columns in a block for each random object * @param minValue minimum of the random values for each random object * @param maxValue maximum of the random values for each random object * @param sparsity sparsity for each random object * @param pdf probability density function for each random object * @param replication file replication * @param inputs input file for each random object * @param outputs output file for each random object * @param outputInfos output information for each random object * @param instructionsInMapper instruction for each random object * @param resultIndexes result indexes for each random object * @return matrix characteristics for each random object * @throws Exception if an error occurred in the MapReduce phase */ public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception { JobConf job = new JobConf(DataGenMR.class); job.setJobName("DataGen-MR"); //whether use block representation or cell representation MRJobConfiguration.setMatrixValueClass(job, true); byte[] realIndexes = new byte[dataGenInstructions.length]; for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b; String[] inputs = new String[dataGenInstructions.length]; InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length]; long[] rlens = new long[dataGenInstructions.length]; long[] clens = new long[dataGenInstructions.length]; int[] brlens = new 
int[dataGenInstructions.length]; int[] bclens = new int[dataGenInstructions.length]; FileSystem fs = FileSystem.get(job); String dataGenInsStr = ""; int numblocks = 0; int maxbrlen = -1, maxbclen = -1; double maxsparsity = -1; for (int i = 0; i < dataGenInstructions.length; i++) { dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i]; MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]); MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType(); DataGenMRInstruction genInst = (DataGenMRInstruction) mrins; rlens[i] = genInst.getRows(); clens[i] = genInst.getCols(); brlens[i] = genInst.getRowsInBlock(); bclens[i] = genInst.getColsInBlock(); maxbrlen = Math.max(maxbrlen, brlens[i]); maxbclen = Math.max(maxbclen, bclens[i]); if (mrtype == MRINSTRUCTION_TYPE.Rand) { RandInstruction randInst = (RandInstruction) mrins; inputs[i] = genInst.getBaseDir() + "tmp" + _seqRandInput.getNextID() + ".randinput"; maxsparsity = Math.max(maxsparsity, randInst.getSparsity()); FSDataOutputStream fsOut = fs.create(new Path(inputs[i])); PrintWriter pw = new PrintWriter(fsOut); //for obj reuse and preventing repeated buffer re-allocations StringBuilder sb = new StringBuilder(); //seed generation Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed()); long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity()); int nnzIx = 0; for (long r = 0; r < rlens[i]; r += brlens[i]) { long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r)); for (long c = 0; c < clens[i]; c += bclens[i]) { long curBlockColSize = Math.min(bclens[i], (clens[i] - c)); sb.append((r / brlens[i]) + 1); sb.append(','); sb.append((c / bclens[i]) + 1); sb.append(','); sb.append(curBlockRowSize); sb.append(','); sb.append(curBlockColSize); sb.append(','); sb.append(nnz[nnzIx++]); sb.append(','); sb.append(bigrand.nextLong()); pw.println(sb.toString()); sb.setLength(0); numblocks++; } } 
pw.close(); fsOut.close(); inputInfos[i] = InputInfo.TextCellInputInfo; } else if (mrtype == MRINSTRUCTION_TYPE.Seq) { SeqInstruction seqInst = (SeqInstruction) mrins; inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput"; maxsparsity = 1.0; //always dense double from = seqInst.fromValue; double to = seqInst.toValue; double incr = seqInst.incrValue; // Correctness checks on (from, to, incr) boolean neg = (from > to); if (incr == 0) throw new DMLRuntimeException("Invalid value for \"increment\" in seq()."); if (neg != (incr < 0)) throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()"); // Compute the number of rows in the sequence long numrows = 1 + (long) Math.floor((to - from) / incr); if (rlens[i] > 0) { if (numrows != rlens[i]) throw new DMLRuntimeException( "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows); } else { rlens[i] = numrows; } if (clens[i] > 0 && clens[i] != 1) throw new DMLRuntimeException( "Unexpected error while processing sequence instruction. 
Number of columns (" + clens[i] + ") must be equal to 1."); else clens[i] = 1; FSDataOutputStream fsOut = fs.create(new Path(inputs[i])); PrintWriter pw = new PrintWriter(fsOut); StringBuilder sb = new StringBuilder(); double temp = from; double block_from, block_to; for (long r = 0; r < rlens[i]; r += brlens[i]) { long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r)); // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) long bid_i = ((r / brlens[i]) + 1); long bid_j = 1; block_from = temp; block_to = temp + (curBlockRowSize - 1) * incr; temp = block_to + incr; // next block starts from here sb.append(bid_i); sb.append(','); sb.append(bid_j); sb.append(','); /* // Need not include block size while generating seq() sb.append(curBlockRowSize); sb.append(','); sb.append(1); sb.append(',');*/ sb.append(block_from); sb.append(','); sb.append(block_to); sb.append(','); sb.append(incr); pw.println(sb.toString()); //System.out.println("MapTask " + r + ": " + sb.toString()); sb.setLength(0); numblocks++; } pw.close(); fsOut.close(); inputInfos[i] = InputInfo.TextCellInputInfo; } else { throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype); } } dataGenInsStr = dataGenInsStr.substring(1);//remove the first "," RunningJob runjob; MatrixCharacteristics[] stats; try { //set up the block size MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); //set up the input files and their format information MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK); //set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens); MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix); //set up the block size MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); //set up the rand Instructions 
MRJobConfiguration.setRandInstructions(job, dataGenInsStr); //set up unary instructions that will perform in the mapper MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper); //set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer); //set up the replication factor for the results job.setInt("dfs.replication", replication); //set up map/reduce memory configurations (if in AM context) DMLConfig config = ConfigurationManager.getConfig(); DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config); //determine degree of parallelism (nmappers: 1<=n<=capacity) //TODO use maxsparsity whenever we have a way of generating sparse rand data int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks(); long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize(); //correction max number of mappers on yarn clusters if (InfrastructureAnalyzer.isYarnEnabled()) capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores()); int nmapers = Math .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1); job.setNumMapTasks(nmapers); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false); stats = ret.stats; //set up the number of reducers MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, 
numReducers); // print the complete MRJob instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats); // Update resultDimsUnknown based on computed "stats" byte[] resultDimsUnknown = new byte[resultIndexes.length]; for (int i = 0; i < resultIndexes.length; i++) { if (stats[i].getRows() == -1 || stats[i].getCols() == -1) { resultDimsUnknown[i] = (byte) 1; } else { resultDimsUnknown[i] = (byte) 0; } } boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg"); //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable); // configure mapper and the mapper output key value pairs job.setMapperClass(DataGenMapper.class); if (numReducers == 0) { job.setMapOutputKeyClass(Writable.class); job.setMapOutputValueClass(Writable.class); } else { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixBlock.class); } //set up combiner if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) job.setCombinerClass(GMRCombiner.class); //configure reducer job.setReducerClass(GMRReducer.class); //job.setReducerClass(PassThroughReducer.class); // By default, the job executes in "cluster" mode. // Determine if we can optimize and run it in "local" mode. 
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length]; for (int i = 0; i < inputs.length; i++) { inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]); } //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); runjob = JobClient.runJob(job); /* Process different counters */ Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS); for (int i = 0; i < resultIndexes.length; i++) { // number of non-zeros stats[i].setNonZeros(group.getCounter(Integer.toString(i))); } String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile"; stats = MapReduceTool.processDimsFiles(dir, stats); MapReduceTool.deleteFileIfExistOnHDFS(dir); } finally { for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job); } return new JobReturn(stats, outputInfos, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.instructions.spark.RandSPInstruction.java
/** * Helper function to construct a sample. * //w w w .j a v a2 s .co m * @param sec * @throws DMLRuntimeException */ private void generateSample(SparkExecutionContext sec) throws DMLRuntimeException { if (maxValue < rows && !replace) throw new DMLRuntimeException("Sample (size=" + rows + ") larger than population (size=" + maxValue + ") can only be generated with replacement."); if (LOG.isTraceEnabled()) LOG.trace("Process RandSPInstruction sample with range=" + maxValue + ", size=" + rows + ", replace=" + replace + ", seed=" + seed); // sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time. double fraction = SamplingUtils.computeFractionForSampleSize((int) rows, UtilFunctions.toLong(maxValue), replace); Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(seed); // divide the population range across numPartitions by creating SampleTasks double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize(); long outputSize = MatrixBlock.estimateSizeDenseInMemory(rows, 1); int numPartitions = (int) Math.ceil((double) outputSize / hdfsBlockSize); long partitionSize = (long) Math.ceil(maxValue / numPartitions); ArrayList<SampleTask> offsets = new ArrayList<SampleTask>(); long st = 1; while (st <= maxValue) { SampleTask s = new SampleTask(); s.range_start = st; s.seed = bigrand.nextLong(); offsets.add(s); st = st + partitionSize; } JavaRDD<SampleTask> offsetRDD = sec.getSparkContext().parallelize(offsets, numPartitions); // Construct the sample in a distributed manner JavaRDD<Double> rdd = offsetRDD .flatMap((new GenerateSampleBlock(replace, fraction, (long) maxValue, partitionSize))); // Randomize the sampled elements JavaRDD<Double> randomizedRDD = rdd.mapToPair(new AttachRandom()).sortByKey().values(); // Trim the sampled list to required size & attach matrix indexes to randomized elements JavaPairRDD<MatrixIndexes, MatrixCell> miRDD = randomizedRDD.zipWithIndex().filter(new TrimSample(rows)) .mapToPair(new Double2MatrixCell()); 
MatrixCharacteristics mcOut = new MatrixCharacteristics(rows, 1, rowsInBlock, colsInBlock, rows); // Construct BinaryBlock representation JavaPairRDD<MatrixIndexes, MatrixBlock> mbRDD = RDDConverterUtils .binaryCellToBinaryBlock(sec.getSparkContext(), miRDD, mcOut, true); MatrixCharacteristics retDims = sec.getMatrixCharacteristics(output.getName()); retDims.setNonZeros(rows); sec.setRDDHandleForVariable(output.getName(), mbRDD); }
From source file:com.ibm.bi.dml.runtime.instructions.spark.RandSPInstruction.java
@Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException { SparkExecutionContext sec = (SparkExecutionContext) ec; //process specific datagen operator if (this.method == DataGenMethod.RAND) { // The implementation is in same spirit as MapReduce // We generate seeds similar to com.ibm.bi.dml.runtime.matrix.DataGenMR // and then generate blocks similar to com.ibm.bi.dml.runtime.matrix.mapred.DataGenMapper //generate pseudo-random seed (because not specified) long lSeed = seed; //seed per invocation if (lSeed == DataGenOp.UNSPECIFIED_SEED) lSeed = DataGenOp.generateRandomSeed(); if (LOG.isTraceEnabled()) LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + "."); //Check if there is sufficient memory for matrix to be created and execution platform is not forced Spark if (isMemAvail(rows, cols, sparsity, minValue, maxValue) && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) { RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows, (int) cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams); MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed); sec.setMatrixOutput(output.getName(), mb); Statistics.decrementNoOfExecutedSPInst(); return; }/* www. j a va 2 s . 
c o m*/ // seed generation (partitioned to bound memory requirements) JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD = null; Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed); long[] nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity); double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize(); long numBlocks = nnz.length; long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock); for (long p = 0; p < numBlocks; p += SEED_PARTITION_SIZE) { ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>>(); double partitionSize = 0; for (long i = p; i < Math.min(p + SEED_PARTITION_SIZE, numBlocks); i++) { long r = 1 + i / numColBlocks; long c = 1 + i % numColBlocks; MatrixIndexes indx = new MatrixIndexes(r, c); Long seedForBlock = bigrand.nextLong(); seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx, new Tuple2<Long, Long>(seedForBlock, nnz[(int) i]))); partitionSize += nnz[(int) i] * 8 + 16; } //for load balancing: degree of parallelism such that ~128MB per partition int numPartitions = (int) Math.max(Math.min(partitionSize / hdfsBlockSize, seeds.size()), 1); //combine seeds partitions to seed rdd JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD2 = JavaPairRDD .fromJavaRDD(sec.getSparkContext().parallelize(seeds, numPartitions)); seedsRDD = (seedsRDD != null) ? seedsRDD.union(seedsRDD2) : seedsRDD2; } //execute rand instruction over seed input JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams)); //output handling MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); if (!mcOut.dimsKnown(true)) { //note: we cannot compute the nnz from sparsity because this would not reflect the //actual number of non-zeros, except for extreme values of sparsity equals 0 or 1. 
long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1; mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz); } sec.setRDDHandleForVariable(output.getName(), out); } else if (this.method == DataGenMethod.SEQ) { //sanity check valid increment if (seq_incr == 0) { throw new DMLRuntimeException( "ERROR: While performing seq(" + seq_from + "," + seq_to + "," + seq_incr + ")"); } if (LOG.isTraceEnabled()) LOG.trace("Process RandSPInstruction seq with seqFrom=" + seq_from + ", seqTo=" + seq_to + ", seqIncr" + seq_incr); // offset generation (partitioned to bound memory requirements) JavaRDD<Double> offsetsRDD = null; double hdfsBlockSize = InfrastructureAnalyzer.getHDFSBlockSize(); long nnz = (long) Math.abs(Math.round((seq_to - seq_from) / seq_incr)) + 1; long numBlocks = (long) Math.ceil(((double) nnz) / rowsInBlock); for (long p = 0; p < numBlocks; p += SEED_PARTITION_SIZE) { ArrayList<Double> offsets = new ArrayList<Double>(); double partitionSize = 0; for (long i = p; i < Math.min(p + SEED_PARTITION_SIZE, numBlocks); i++) { double off = seq_from + seq_incr * i * rowsInBlock; offsets.add(off); partitionSize += rowsInBlock * 8 + 16; } //for load balancing: degree of parallelism such that ~128MB per partition int numPartitions = (int) Math.max(Math.min(partitionSize / hdfsBlockSize, offsets.size()), 1); //combine seeds partitions to seed rdd JavaRDD<Double> offsetsRDD2 = sec.getSparkContext().parallelize(offsets, numPartitions); offsetsRDD = (offsetsRDD != null) ? 
offsetsRDD.union(offsetsRDD2) : offsetsRDD2; } //sanity check number of non-zeros if (nnz != rows && rows != -1) { throw new DMLRuntimeException("Incorrect number of non-zeros: " + nnz + " != " + rows); } //execute seq instruction over offset input JavaPairRDD<MatrixIndexes, MatrixBlock> out = offsetsRDD .mapToPair(new GenerateSequenceBlock(rowsInBlock, seq_from, seq_to, seq_incr)); //output handling MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); if (!mcOut.dimsKnown()) { mcOut.set(nnz, 1, rowsInBlock, colsInBlock, nnz); } sec.setRDDHandleForVariable(output.getName(), out); } else if (this.method == DataGenMethod.SAMPLE) { generateSample(sec); } }
From source file:org.apache.sysml.runtime.instructions.spark.RandSPInstruction.java
private void generateRandData(SparkExecutionContext sec) throws DMLRuntimeException { //step 1: generate pseudo-random seed (because not specified) long lSeed = seed; //seed per invocation if (lSeed == DataGenOp.UNSPECIFIED_SEED) lSeed = DataGenOp.generateRandomSeed(); if (LOG.isTraceEnabled()) LOG.trace("Process RandSPInstruction rand with seed = " + lSeed + "."); //step 2: potential in-memory rand operations if applicable if (isMemAvail(rows, cols, sparsity, minValue, maxValue) && DMLScript.rtplatform != RUNTIME_PLATFORM.SPARK) { RandomMatrixGenerator rgen = LibMatrixDatagen.createRandomMatrixGenerator(pdf, (int) rows, (int) cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdfParams); MatrixBlock mb = MatrixBlock.randOperations(rgen, lSeed); sec.setMatrixOutput(output.getName(), mb); Statistics.decrementNoOfExecutedSPInst(); return;/* ww w .java 2s. c o m*/ } //step 3: seed generation JavaPairRDD<MatrixIndexes, Tuple2<Long, Long>> seedsRDD = null; Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(lSeed); long[] nnz = LibMatrixDatagen.computeNNZperBlock(rows, cols, rowsInBlock, colsInBlock, sparsity); double totalSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(rows, cols, rowsInBlock, colsInBlock, rows * cols * sparsity); //overestimate for on disk, ensures hdfs block per partition double hdfsBlkSize = InfrastructureAnalyzer.getHDFSBlockSize(); long numBlocks = nnz.length; long numColBlocks = (long) Math.ceil((double) cols / (double) colsInBlock); //a) in-memory seed rdd construction if (numBlocks < INMEMORY_NUMBLOCKS_THRESHOLD) { ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>> seeds = new ArrayList<Tuple2<MatrixIndexes, Tuple2<Long, Long>>>(); for (long i = 0; i < numBlocks; i++) { long r = 1 + i / numColBlocks; long c = 1 + i % numColBlocks; MatrixIndexes indx = new MatrixIndexes(r, c); Long seedForBlock = bigrand.nextLong(); seeds.add(new Tuple2<MatrixIndexes, Tuple2<Long, Long>>(indx, new Tuple2<Long, Long>(seedForBlock, 
nnz[(int) i]))); } //for load balancing: degree of parallelism such that ~128MB per partition int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1); //create seeds rdd seedsRDD = sec.getSparkContext().parallelizePairs(seeds, numPartitions); } //b) file-based seed rdd construction (for robustness wrt large number of blocks) else { String path = LibMatrixDatagen.generateUniqueSeedPath(dir); try { FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf()); FSDataOutputStream fsOut = fs.create(new Path(path)); PrintWriter pw = new PrintWriter(fsOut); StringBuilder sb = new StringBuilder(); for (long i = 0; i < numBlocks; i++) { sb.append(1 + i / numColBlocks); sb.append(','); sb.append(1 + i % numColBlocks); sb.append(','); sb.append(bigrand.nextLong()); sb.append(','); sb.append(nnz[(int) i]); pw.println(sb.toString()); sb.setLength(0); } pw.close(); fsOut.close(); } catch (IOException ex) { throw new DMLRuntimeException(ex); } //for load balancing: degree of parallelism such that ~128MB per partition int numPartitions = (int) Math.max(Math.min(totalSize / hdfsBlkSize, numBlocks), 1); //create seeds rdd seedsRDD = sec.getSparkContext().textFile(path, numPartitions).mapToPair(new ExtractSeedTuple()); } //step 4: execute rand instruction over seed input JavaPairRDD<MatrixIndexes, MatrixBlock> out = seedsRDD.mapToPair(new GenerateRandomBlock(rows, cols, rowsInBlock, colsInBlock, sparsity, minValue, maxValue, pdf, pdfParams)); //step 5: output handling MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); if (!mcOut.dimsKnown(true)) { //note: we cannot compute the nnz from sparsity because this would not reflect the //actual number of non-zeros, except for extreme values of sparsity equals 0 or 1. long lnnz = (sparsity == 0 || sparsity == 1) ? (long) (sparsity * rows * cols) : -1; mcOut.set(rows, cols, rowsInBlock, colsInBlock, lnnz); } sec.setRDDHandleForVariable(output.getName(), out); }
From source file:org.apache.sysml.runtime.matrix.data.LibMatrixDatagen.java
/**
 * Draws one seed per block from the given generator.
 *
 * @param bigrand generator from which the seeds are drawn, one call per block
 * @param nrb first factor of the number of blocks
 * @param ncb second factor of the number of blocks
 * @return array of {@code nrb * ncb} seeds in the order they were drawn
 */
private static long[] generateSeedsForCP(Well1024a bigrand, int nrb, int ncb) {
    final long[] blockSeeds = new long[nrb * ncb];
    int next = 0;
    while (next < blockSeeds.length) {
        blockSeeds[next] = bigrand.nextLong();
        next++;
    }
    return blockSeeds;
}
From source file:org.apache.sysml.runtime.matrix.DataGenMR.java
/** * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p> * /*from www .j a va 2s .c o m*/ * @param inst MR job instruction * @param dataGenInstructions array of data gen instructions * @param instructionsInMapper instructions in mapper * @param aggInstructionsInReducer aggregate instructions in reducer * @param otherInstructionsInReducer other instructions in reducer * @param numReducers number of reducers * @param replication file replication * @param resultIndexes result indexes for each random object * @param dimsUnknownFilePrefix file path prefix when dimensions unknown * @param outputs output file for each random object * @param outputInfos output information for each random object * @return matrix characteristics for each random object * @throws Exception if Exception occurs */ public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception { JobConf job = new JobConf(DataGenMR.class); job.setJobName("DataGen-MR"); //whether use block representation or cell representation MRJobConfiguration.setMatrixValueClass(job, true); byte[] realIndexes = new byte[dataGenInstructions.length]; for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b; String[] inputs = new String[dataGenInstructions.length]; InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length]; long[] rlens = new long[dataGenInstructions.length]; long[] clens = new long[dataGenInstructions.length]; int[] brlens = new int[dataGenInstructions.length]; int[] bclens = new int[dataGenInstructions.length]; FileSystem fs = FileSystem.get(job); String dataGenInsStr = ""; int numblocks = 0; int maxbrlen = -1, maxbclen = -1; double maxsparsity = -1; for (int i = 0; i < dataGenInstructions.length; i++) { 
dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i]; MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]); MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType(); DataGenMRInstruction genInst = (DataGenMRInstruction) mrins; rlens[i] = genInst.getRows(); clens[i] = genInst.getCols(); brlens[i] = genInst.getRowsInBlock(); bclens[i] = genInst.getColsInBlock(); maxbrlen = Math.max(maxbrlen, brlens[i]); maxbclen = Math.max(maxbclen, bclens[i]); if (mrtype == MRINSTRUCTION_TYPE.Rand) { RandInstruction randInst = (RandInstruction) mrins; inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir()); maxsparsity = Math.max(maxsparsity, randInst.getSparsity()); FSDataOutputStream fsOut = fs.create(new Path(inputs[i])); PrintWriter pw = new PrintWriter(fsOut); //for obj reuse and preventing repeated buffer re-allocations StringBuilder sb = new StringBuilder(); //seed generation Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed()); long[] nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity()); int nnzIx = 0; for (long r = 0; r < rlens[i]; r += brlens[i]) { long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r)); for (long c = 0; c < clens[i]; c += bclens[i]) { long curBlockColSize = Math.min(bclens[i], (clens[i] - c)); sb.append((r / brlens[i]) + 1); sb.append(','); sb.append((c / bclens[i]) + 1); sb.append(','); sb.append(curBlockRowSize); sb.append(','); sb.append(curBlockColSize); sb.append(','); sb.append(nnz[nnzIx++]); sb.append(','); sb.append(bigrand.nextLong()); pw.println(sb.toString()); sb.setLength(0); numblocks++; } } pw.close(); fsOut.close(); inputInfos[i] = InputInfo.TextCellInputInfo; } else if (mrtype == MRINSTRUCTION_TYPE.Seq) { SeqInstruction seqInst = (SeqInstruction) mrins; inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput"; maxsparsity = 1.0; //always dense double from = 
seqInst.fromValue; double to = seqInst.toValue; double incr = seqInst.incrValue; //handle default 1 to -1 for special case of from>to incr = LibMatrixDatagen.updateSeqIncr(from, to, incr); // Correctness checks on (from, to, incr) boolean neg = (from > to); if (incr == 0) throw new DMLRuntimeException("Invalid value for \"increment\" in seq()."); if (neg != (incr < 0)) throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()"); // Compute the number of rows in the sequence long numrows = 1 + (long) Math.floor((to - from) / incr); if (rlens[i] > 0) { if (numrows != rlens[i]) throw new DMLRuntimeException( "Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows); } else { rlens[i] = numrows; } if (clens[i] > 0 && clens[i] != 1) throw new DMLRuntimeException( "Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1."); else clens[i] = 1; FSDataOutputStream fsOut = fs.create(new Path(inputs[i])); PrintWriter pw = new PrintWriter(fsOut); StringBuilder sb = new StringBuilder(); double temp = from; double block_from, block_to; for (long r = 0; r < rlens[i]; r += brlens[i]) { long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r)); // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval) long bid_i = ((r / brlens[i]) + 1); long bid_j = 1; block_from = temp; block_to = temp + (curBlockRowSize - 1) * incr; temp = block_to + incr; // next block starts from here sb.append(bid_i); sb.append(','); sb.append(bid_j); sb.append(','); /* // Need not include block size while generating seq() sb.append(curBlockRowSize); sb.append(','); sb.append(1); sb.append(',');*/ sb.append(block_from); sb.append(','); sb.append(block_to); sb.append(','); sb.append(incr); pw.println(sb.toString()); //System.out.println("MapTask " + r + ": " + sb.toString()); 
sb.setLength(0); numblocks++; } pw.close(); fsOut.close(); inputInfos[i] = InputInfo.TextCellInputInfo; } else { throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype); } } dataGenInsStr = dataGenInsStr.substring(1);//remove the first "," RunningJob runjob; MatrixCharacteristics[] stats; try { //set up the block size MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); //set up the input files and their format information MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK); //set up the dimensions of input matrices MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens); MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix); //set up the block size MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); //set up the rand Instructions MRJobConfiguration.setRandInstructions(job, dataGenInsStr); //set up unary instructions that will perform in the mapper MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper); //set up the aggregate instructions that will happen in the combiner and reducer MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer); //set up the replication factor for the results job.setInt(MRConfigurationNames.DFS_REPLICATION, replication); //set up map/reduce memory configurations (if in AM context) DMLConfig config = ConfigurationManager.getDMLConfig(); DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config); //set up custom map/reduce configurations MRJobConfiguration.setupCustomMRConfigurations(job, config); //determine degree of parallelism (nmappers: 1<=n<=capacity) //TODO use maxsparsity whenever we have a way of generating sparse rand data int capacity = 
InfrastructureAnalyzer.getRemoteParallelMapTasks(); long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize(); //correction max number of mappers on yarn clusters if (InfrastructureAnalyzer.isYarnEnabled()) capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores()); int nmapers = Math .max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1); job.setNumMapTasks(nmapers); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false); stats = ret.stats; //set up the number of reducers MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers); // print the complete MRJob instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(stats); // Update resultDimsUnknown based on computed "stats" byte[] resultDimsUnknown = new byte[resultIndexes.length]; for (int i = 0; i < resultIndexes.length; i++) { if (stats[i].getRows() == -1 || stats[i].getCols() == -1) { resultDimsUnknown[i] = (byte) 1; } else { resultDimsUnknown[i] = (byte) 0; } } boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg"); //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable); // configure mapper and the mapper output key value pairs job.setMapperClass(DataGenMapper.class); if (numReducers == 0) { job.setMapOutputKeyClass(Writable.class); 
job.setMapOutputValueClass(Writable.class); } else { job.setMapOutputKeyClass(MatrixIndexes.class); job.setMapOutputValueClass(TaggedMatrixBlock.class); } //set up combiner if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty()) job.setCombinerClass(GMRCombiner.class); //configure reducer job.setReducerClass(GMRReducer.class); //job.setReducerClass(PassThroughReducer.class); // By default, the job executes in "cluster" mode. // Determine if we can optimize and run it in "local" mode. MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length]; for (int i = 0; i < inputs.length; i++) { inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]); } //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); runjob = JobClient.runJob(job); /* Process different counters */ Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS); for (int i = 0; i < resultIndexes.length; i++) { // number of non-zeros stats[i].setNonZeros(group.getCounter(Integer.toString(i))); } String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile"; stats = MapReduceTool.processDimsFiles(dir, stats); MapReduceTool.deleteFileIfExistOnHDFS(dir); } finally { for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job); } return new JobReturn(stats, outputInfos, runjob.isSuccessful()); }